thermosphere: use barriers and caches *properly*. Cache code refactoring

- Set/way cache ops cause loss of coherency, are not broadcast to other cores, and are only meant to be used at boot, period.

Cache ops by VA are **the only way** to do data cache maintenance.

Fix a bug where every core evicted the (shared) L2 cache; only one core should do so.

- Cleaning the dcache to PoU and invalidating the icache to PoU, both by VA, is sufficient for self-modifying code

- Since we operate within a single cluster and don't do DMA, we almost always operate within the inner shareability domain

(commit untested on real hw)
This commit is contained in:
TuxSH 2020-01-15 02:42:07 +00:00
parent 1369697058
commit 72d1992eec
13 changed files with 234 additions and 300 deletions

View File

@ -1,18 +0,0 @@
#pragma once
#include "types.h"
void flush_dcache_all(void);
void invalidate_dcache_all(void);
void flush_dcache_range(const void *start, const void *end);
void invalidate_dcache_range(const void *start, const void *end);
void invalidate_icache_all_inner_shareable(void);
void invalidate_icache_all(void);
void set_memory_registers_enable_mmu(uintptr_t ttbr0, u64 tcr, u64 mair);
void set_memory_registers_enable_stage2(uintptr_t vttbr, u64 vtcr);
void reloadBreakpointRegs(size_t num);
void initWatchpointRegs(size_t num);

View File

@ -1,251 +0,0 @@
/*
* Copyright (c) 2018-2019 Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "asm_macros.s"
/* The following functions are taken/adapted from https://github.com/u-boot/u-boot/blob/master/arch/arm/cpu/armv8/cache.S */
/*
* (C) Copyright 2013
* David Feng <fenghua@phytium.com.cn>
*
* This file is based on sample code from ARMv8 ARM.
*
* SPDX-License-Identifier: GPL-2.0+
*/
/*
* void __asm_dcache_level(level)
*
* flush or invalidate one level cache.
*
* x0: cache level
* x1: 0 clean & invalidate, 1 invalidate only
* x2~x9: clobbered
*/
.section .text.__asm_dcache_level, "ax", %progbits
.type __asm_dcache_level, %function
__asm_dcache_level:
lsl x12, x0, #1
msr csselr_el1, x12 /* select cache level */
isb /* sync change of cssidr_el1 */
mrs x6, ccsidr_el1 /* read the new cssidr_el1 */
and x2, x6, #7 /* x2 <- log2(cache line size)-4 */
add x2, x2, #4 /* x2 <- log2(cache line size) */
mov x3, #0x3ff
and x3, x3, x6, lsr #3 /* x3 <- max number of #ways */
clz w5, w3 /* bit position of #ways */
mov x4, #0x7fff
and x4, x4, x6, lsr #13 /* x4 <- max number of #sets */
/* x12 <- cache level << 1 */
/* x2 <- line length offset */
/* x3 <- number of cache ways - 1 */
/* x4 <- number of cache sets - 1 */
/* x5 <- bit position of #ways */
loop_set:
mov x6, x3 /* x6 <- working copy of #ways */
loop_way:
lsl x7, x6, x5
orr x9, x12, x7 /* map way and level to cisw value */
lsl x7, x4, x2
orr x9, x9, x7 /* map set number to cisw value */
tbz w1, #0, 1f
dc isw, x9
b 2f
1: dc cisw, x9 /* clean & invalidate by set/way */
2: subs x6, x6, #1 /* decrement the way */
b.ge loop_way
subs x4, x4, #1 /* decrement the set */
b.ge loop_set
ret
/*
* void __asm_flush_dcache_all(int invalidate_only)
*
* x0: 0 clean & invalidate, 1 invalidate only
*
* flush or invalidate all data cache by SET/WAY.
*/
.section .text.__asm_dcache_all, "ax", %progbits
.type __asm_dcache_all, %function
__asm_dcache_all:
mov x1, x0
dsb sy
mrs x10, clidr_el1 /* read clidr_el1 */
lsr x11, x10, #24
and x11, x11, #0x7 /* x11 <- loc */
cbz x11, finished /* if loc is 0, exit */
mov x15, lr
mov x0, #0 /* start flush at cache level 0 */
/* x0 <- cache level */
/* x10 <- clidr_el1 */
/* x11 <- loc */
/* x15 <- return address */
loop_level:
lsl x12, x0, #1
add x12, x12, x0 /* x0 <- tripled cache level */
lsr x12, x10, x12
and x12, x12, #7 /* x12 <- cache type */
cmp x12, #2
b.lt skip /* skip if no cache or icache */
bl __asm_dcache_level /* x1 = 0 flush, 1 invalidate */
skip:
add x0, x0, #1 /* increment cache level */
cmp x11, x0
b.gt loop_level
mov x0, #0
msr csselr_el1, x0 /* restore csselr_el1 */
dsb sy
isb
mov lr, x15
finished:
ret
FUNCTION flush_dcache_all
mov x0, #0
b __asm_dcache_all
END_FUNCTION
FUNCTION invalidate_dcache_all
mov x0, #1
b __asm_dcache_all
END_FUNCTION
/*
* void __asm_flush_dcache_range(start, end) (renamed -> flush_dcache_range)
*
* clean & invalidate data cache in the range
*
* x0: start address
* x1: end address
*/
FUNCTION flush_dcache_range
mrs x3, ctr_el0
lsr x3, x3, #16
and x3, x3, #0xf
mov x2, #4
lsl x2, x2, x3 /* cache line size */
/* x2 <- minimal cache line size in cache system */
sub x3, x2, #1
bic x0, x0, x3
1: dc civac, x0 /* clean & invalidate data or unified cache */
add x0, x0, x2
cmp x0, x1
b.lo 1b
dsb sy
ret
END_FUNCTION
/*
* void __asm_invalidate_dcache_range(start, end) (-> invalidate_dcache_range)
*
* invalidate data cache in the range
*
* x0: start address
* x1: end address
*/
FUNCTION invalidate_dcache_range
mrs x3, ctr_el0
ubfm x3, x3, #16, #19
mov x2, #4
lsl x2, x2, x3 /* cache line size */
/* x2 <- minimal cache line size in cache system */
sub x3, x2, #1
bic x0, x0, x3
1: dc ivac, x0 /* invalidate data or unified cache */
add x0, x0, x2
cmp x0, x1
b.lo 1b
dsb sy
ret
END_FUNCTION
/*
* void __asm_invalidate_icache_all(void) (-> invalidate_icache_inner_shareable)
*
* invalidate all icache entries.
*/
FUNCTION invalidate_icache_all_inner_shareable
dsb ish
isb
ic ialluis
dsb ish
isb
ret
END_FUNCTION
FUNCTION invalidate_icache_all
dsb sy
isb
ic iallu
dsb sy
isb
ret
END_FUNCTION
FUNCTION set_memory_registers_enable_mmu
msr ttbr0_el2, x0
msr tcr_el2, x1
msr mair_el2, x2
dsb sy
isb
tlbi alle2
dsb sy
isb
// Enable MMU & enable caching
mrs x0, sctlr_el2
orr x0, x0, #1
orr x0, x0, #(1 << 2)
orr x0, x0, #(1 << 12)
msr sctlr_el2, x0
dsb sy
isb
ret
END_FUNCTION
FUNCTION set_memory_registers_enable_stage2
msr vttbr_el2, x0
msr vtcr_el2, x1
dsb sy
isb
// Flushes all stage 1&2 entries, EL1
tlbi alle1
dsb sy
isb
// Enable stage2
mrs x0, hcr_el2
orr x0, x0, #1
msr hcr_el2, x0
dsb sy
isb
ret
END_FUNCTION

View File

@ -19,7 +19,6 @@
#include "breakpoints_watchpoints_load.h"
#include "utils.h"
#include "sysreg.h"
#include "arm.h"
BreakpointManager g_breakpointManager = {0};
@ -49,7 +48,7 @@ static void commitAndBroadcastBreakpointHandler(void *p)
static inline void commitAndBroadcastBreakpoints(void)
{
__dmb_sy();
__dmb();
executeFunctionOnAllCores(commitAndBroadcastBreakpointHandler, NULL, true);
}

99
thermosphere/src/caches.c Normal file
View File

@ -0,0 +1,99 @@
/*
* Copyright (c) 2019 Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "caches.h"
#include "preprocessor.h"
#define DEFINE_CACHE_RANGE_FUNC(isn, name, cache, post)\
void name(const void *addr, size_t size)\
{\
u32 lineCacheSize = cacheGetSmallest##cache##CacheLineSize();\
uintptr_t begin = (uintptr_t)addr & ~(lineCacheSize - 1);\
uintptr_t end = ((uintptr_t)addr + size + lineCacheSize - 1) & ~(lineCacheSize - 1);\
for (uintptr_t pos = begin; pos < end; pos += lineCacheSize) {\
__asm__ __volatile__ (isn ", %0" :: "r"(pos) : "memory");\
}\
post;\
}
static inline ALINLINE void cacheSelectByLevel(bool instructionCache, u32 level)
{
u32 ibit = instructionCache ? 1 : 0;
u32 lbits = (level & 7) << 1;
SET_SYSREG(csselr_el1, lbits | ibit);
__isb();
}
static inline ALINLINE void cacheInvalidateDataCacheLevel(u32 level)
{
cacheSelectByLevel(false, level);
u32 ccsidr = (u32)GET_SYSREG(ccsidr_el1);
u32 numWays = 1 + ((ccsidr >> 3) & 0x3FF);
u32 numSets = 1 + ((ccsidr >> 13) & 0x7FFF);
u32 wayShift = __builtin_clz(numWays);
u32 setShift = (ccsidr & 7) + 4;
u32 lbits = (level & 7) << 1;
for (u32 way = 0; way <= numWays; way++) {
for (u32 set = 0; set <= numSets; set++) {
u64 val = ((u64)way << wayShift) | ((u64)set << setShift) | lbits;
__asm__ __volatile__ ("dc isw, %0" :: "r"(val) : "memory");
}
}
}
static inline ALINLINE void cacheInvalidateDataCacheLevels(u32 from, u32 to)
{
// Let's hope it doesn't generate a stack frame...
for (u32 level = from; level < to; level++) {
cacheInvalidateDataCacheLevel(level);
}
__dsb_sy();
__isb();
}
DEFINE_CACHE_RANGE_FUNC("dc civac", cacheCleanInvalidateDataCacheRange, Data, __dsb())
DEFINE_CACHE_RANGE_FUNC("dc cvau", cacheCleanDataCacheRangePoU, Data, __dsb())
DEFINE_CACHE_RANGE_FUNC("ic ivau", cacheInvalidateInstructionCacheRangePoU, Instruction, __dsb(); __isb())
void cacheHandleSelfModifyingCodePoU(const void *addr, size_t size)
{
// See docs for ctr_el0.{dic, idc}. It's unclear when these bits have been added, but they're
// RES0 if not implemented, so that's fine
u32 ctr = (u32)GET_SYSREG(ctr_el0);
if (!(ctr & BIT(28))) {
cacheCleanDataCacheRangePoU(addr, size);
}
if (!(ctr & BIT(29))) {
cacheInvalidateInstructionCacheRangePoU(addr, size);
}
}
void cacheClearSharedDataCachesOnBoot(void)
{
u32 clidr = (u32)GET_SYSREG(clidr_el1);
u32 louis = (clidr >> 21) & 7;
u32 loc = (clidr >> 24) & 7;
cacheInvalidateDataCacheLevels(louis, loc);
}
void cacheClearLocalDataCacheOnBoot(void)
{
u32 clidr = (u32)GET_SYSREG(clidr_el1);
u32 louis = (clidr >> 21) & 7;
cacheInvalidateDataCacheLevels(0, louis);
}

58
thermosphere/src/caches.h Normal file
View File

@ -0,0 +1,58 @@
/*
* Copyright (c) 2019 Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "utils.h"
#include "sysreg.h"
static inline u32 cacheGetSmallestInstructionCacheLineSize(void)
{
u32 ctr = (u32)GET_SYSREG(ctr_el0);
u32 shift = ctr & 0xF;
// "log2 of the number of words"...
return 4 << shift;
}
static inline u32 cacheGetSmallestDataCacheLineSize(void)
{
u32 ctr = (u32)GET_SYSREG(ctr_el0);
u32 shift = (ctr >> 16) & 0xF;
// "log2 of the number of words"...
return 4 << shift;
}
static inline void cacheInvalidateInstructionCache(void)
{
__asm__ __volatile__ ("ic ialluis" ::: "memory");
__isb();
}
static inline void cacheInvalidateInstructionCacheLocal(void)
{
__asm__ __volatile__ ("ic iallu" ::: "memory");
__isb();
}
void cacheCleanInvalidateDataCacheRange(const void *addr, size_t size);
void cacheCleanDataCacheRangePoU(const void *addr, size_t size);
void cacheInvalidateInstructionCacheRangePoU(const void *addr, size_t size);
void cacheHandleSelfModifyingCodePoU(const void *addr, size_t size);
void cacheClearSharedDataCachesOnBoot(void);
void cacheClearLocalDataCacheOnBoot(void);

View File

@ -42,6 +42,9 @@ static void initSysregs(void)
SET_SYSREG(cntkctl_el1, 0x00000003); // Don't trap anything for now; event streams disabled
SET_SYSREG(cntp_ctl_el0, 0x00000000);
SET_SYSREG(cntv_ctl_el0, 0x00000000);
__dsb();
__isb();
}
void initSystem(u32 coreId, bool isBootCore, u64 argument)
@ -49,9 +52,6 @@ void initSystem(u32 coreId, bool isBootCore, u64 argument)
coreCtxInit(coreId, isBootCore, argument);
initSysregs();
__dsb_sy();
__isb();
if (isBootCore) {
if (!currentCoreCtx->warmboot) {
memset(__bss_start__, 0, __end__ - __bss_start__);

View File

@ -16,9 +16,7 @@
#include "../utils.h"
#include "../sysreg.h"
#include "../arm.h"
#include "../mmu.h"
#include "../debug_log.h"
#include "memory_map_mmu_cfg.h"
void configureMemoryMapEnableMmu(void)
@ -45,10 +43,24 @@ void configureMemoryMapEnableMmu(void)
*/
u64 mair = 0x4FFull;
flush_dcache_all();
invalidate_icache_all();
// MMU regs config
SET_SYSREG(ttbr0_el2, ttbr0);
SET_SYSREG(tcr_el2, tcr);
SET_SYSREG(mair_el2, mair);
__dsb();
__isb();
set_memory_registers_enable_mmu(ttbr0, tcr, mair);
// TLB invalidation
__tlb_invalidate_el2();
__dsb();
__isb();
// Enable MMU & enable caching
u64 sctlr = GET_SYSREG(sctlr_el2);
sctlr |= SCTLR_ELx_I | SCTLR_ELx_C | SCTLR_ELx_M;
SET_SYSREG(sctlr_el2, sctlr);
__dsb();
__isb();
}
void configureMemoryMapEnableStage2(void)
@ -67,8 +79,22 @@ void configureMemoryMapEnableStage2(void)
- T0SZ = from configureMemoryMap
*/
u64 vtcr = VTCR_EL2_RSVD | TCR_PS(ps) | TCR_TG0_4K | TCR_SHARED_INNER | TCR_ORGN_WBWA | TCR_IRGN_WBWA | VTCR_SL0(1) | TCR_T0SZ(addrSpaceSize);
flush_dcache_all();
invalidate_icache_all();
set_memory_registers_enable_stage2(vttbr, vtcr);
// Stage2 regs config
SET_SYSREG(vttbr_el2, vttbr);
SET_SYSREG(vtcr_el2, vtcr);
__dsb();
__isb();
// TLB invalidation
__tlb_invalidate_el1_stage12();
__dsb();
__isb();
// Enable stage 2
u64 hcr = GET_SYSREG(hcr_el2);
hcr |= HCR_VM;
SET_SYSREG(hcr_el2, hcr);
__dsb();
__isb();
}

View File

@ -1,7 +1,7 @@
#include <string.h>
#include "smc.h"
#include "core_ctx.h"
#include "arm.h"
#include "caches.h"
// Currently in exception_vectors.s:
extern const u32 doSmcIndirectCallImpl[];
@ -16,8 +16,7 @@ void doSmcIndirectCall(ExceptionStackFrame *frame, u32 smcId)
memcpy(codebuf, doSmcIndirectCallImpl, doSmcIndirectCallImplSize);
codebuf[doSmcIndirectCallImplSmcInstructionOffset / 4] |= smcId << 5;
flush_dcache_range(codebuf, codebuf + doSmcIndirectCallImplSize/4);
invalidate_icache_all();
cacheHandleSelfModifyingCodePoU(codebuf, doSmcIndirectCallImplSize/4);
((void (*)(ExceptionStackFrame *))codebuf)(frame);
}

View File

@ -17,7 +17,6 @@
#include <string.h>
#include "software_breakpoints.h"
#include "utils.h"
#include "arm.h"
SoftwareBreakpointManager g_softwareBreakpointManager = {0};
@ -74,14 +73,14 @@ static inline bool doApplySoftwareBreakpoint(size_t id)
static void applySoftwareBreakpointHandler(void *p)
{
u64 flags = maskIrq();
__dmb_sy();
__dmb();
doApplySoftwareBreakpoint(*(size_t *)p);
restoreInterruptFlags(flags);
}
static void applySoftwareBreakpoint(size_t id)
{
__dmb_sy();
__dmb();
executeFunctionOnAllCores(applySoftwareBreakpointHandler, &id, true);
}
@ -103,14 +102,14 @@ static inline bool doRevertSoftwareBreakpoint(size_t id)
static void revertSoftwareBreakpointHandler(void *p)
{
u64 flags = maskIrq();
__dmb_sy();
__dmb();
doRevertSoftwareBreakpoint(*(size_t *)p);
restoreInterruptFlags(flags);
}
static void revertSoftwareBreakpoint(size_t id)
{
__dmb_sy();
__dmb();
executeFunctionOnAllCores(revertSoftwareBreakpointHandler, &id, true);
}

View File

@ -50,8 +50,11 @@ _startCommon:
dsb sy
isb
mov x2, x0
bl cacheClearLocalDataCacheOnBoot
cbz x19, 1f
bl cacheClearSharedDataCachesOnBoot
1:
// Get core ID
// Ensure Aff0 is 4-1 at most (4 cores), and that Aff1, 2 and 3 are 0 (1 cluster only)
mrs x0, mpidr_el1

View File

@ -16,8 +16,8 @@
#include <string.h>
#include "utils.h"
#include "arm.h"
#include "spinlock.h"
#include "caches.h"
__attribute__((noinline)) bool overlaps(u64 as, u64 ae, u64 bs, u64 be)
{
@ -31,6 +31,7 @@ __attribute__((noinline)) bool overlaps(u64 as, u64 ae, u64 bs, u64 be)
// TODO: put that elsewhere
bool readEl1Memory(void *dst, uintptr_t addr, size_t size)
{
// Note: what if we read uncached regions/not shared?
bool valid;
u64 flags = maskIrq();
@ -41,7 +42,6 @@ bool readEl1Memory(void *dst, uintptr_t addr, size_t size)
return false;
}
flush_dcache_range((const void *)pa, (const void *)(pa + size));
memcpy(dst, (const void *)pa, size);
return true;
@ -59,14 +59,12 @@ bool writeEl1Memory(uintptr_t addr, const void *src, size_t size)
return false;
}
flush_dcache_range((const void *)pa, (const void *)(pa + size));
memcpy((void *)pa, src, size);
flush_dcache_range((const void *)pa, (const void *)(pa + size));
invalidate_icache_all();
cacheHandleSelfModifyingCodePoU((const void *)pa, size);
__tlb_invalidate_el1_stage12();
__tlb_invalidate_el1_stage12(); //FIXME FIXME FIXME
__dsb_sy();
__isb();
return true;
}
}

View File

@ -62,6 +62,24 @@ typedef enum ReadWriteDirection {
DIRECTION_READWRITE = DIRECTION_READ | DIRECTION_WRITE,
} ReadWriteDirection;
/*
Domains:
- Inner shareable: typically cores within a cluster (maybe more) with L1+L2 caches
- Outer shareable: all the cores in all clusters that can be coherent
- System: everything else
Since we only support 1 single cluster, we basically only need to consider the inner
shareable domain, except before doing DMA...
*/
static inline void __dmb(void)
{
__asm__ __volatile__ ("dmb ish" ::: "memory");
}
static inline void __dsb(void)
{
__asm__ __volatile__ ("dsb ish" ::: "memory");
}
static inline void __dmb_sy(void)
{
__asm__ __volatile__ ("dmb sy" ::: "memory");
@ -77,6 +95,11 @@ static inline void __isb(void)
__asm__ __volatile__ ("isb" ::: "memory");
}
static inline void __tlb_invalidate_el2(void)
{
__asm__ __volatile__ ("tlbi alle2" ::: "memory");
}
static inline void __tlb_invalidate_el1_stage12(void)
{
__asm__ __volatile__ ("tlbi alle1" ::: "memory");

View File

@ -19,7 +19,6 @@
#include "breakpoints_watchpoints_load.h"
#include "utils.h"
#include "sysreg.h"
#include "arm.h"
#include "debug_log.h"
WatchpointManager g_watchpointManager = {0};
@ -56,7 +55,7 @@ static void commitAndBroadcastWatchpointHandler(void *p)
static inline void commitAndBroadcastWatchpoints(void)
{
__dmb_sy();
__dmb();
executeFunctionOnAllCores(commitAndBroadcastWatchpointHandler, NULL, true);
}