|
|
|
/*
|
|
|
|
* TLB Exception Handling for ARC
|
|
|
|
*
|
|
|
|
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* Vineetg: April 2011 :
|
|
|
|
* -MMU v1: moved out legacy code into a seperate file
|
|
|
|
* -MMU v3: PD{0,1} bits layout changed: They don't overlap anymore,
|
|
|
|
* helps avoid a shift when preparing PD0 from PTE
|
|
|
|
*
|
|
|
|
* Vineetg: July 2009
|
|
|
|
* -For MMU V2, we need not do heuristics at the time of commiting a D-TLB
|
|
|
|
* entry, so that it doesn't knock out it's I-TLB entry
|
|
|
|
* -Some more fine tuning:
|
|
|
|
* bmsk instead of add, asl.cc instead of branch, delay slot utilise etc
|
|
|
|
*
|
|
|
|
* Vineetg: July 2009
|
|
|
|
* -Practically rewrote the I/D TLB Miss handlers
|
|
|
|
* Now 40 and 135 instructions a peice as compared to 131 and 449 resp.
|
|
|
|
* Hence Leaner by 1.5 K
|
|
|
|
* Used Conditional arithmetic to replace excessive branching
|
|
|
|
* Also used short instructions wherever possible
|
|
|
|
*
|
|
|
|
* Vineetg: Aug 13th 2008
|
|
|
|
* -Passing ECR (Exception Cause REG) to do_page_fault( ) for printing
|
|
|
|
* more information in case of a Fatality
|
|
|
|
*
|
|
|
|
* Vineetg: March 25th Bug #92690
|
|
|
|
* -Added Debug Code to check if sw-ASID == hw-ASID
|
|
|
|
|
|
|
|
* Rahul Trivedi, Amit Bhor: Codito Technologies 2004
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <asm/entry.h>
|
|
|
|
#include <asm/mmu.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
#include <asm/arcregs.h>
|
|
|
|
#include <asm/cache.h>
|
|
|
|
#include <asm/processor.h>
|
|
|
|
#include <asm/tlb-mmu1.h>
|
|
|
|
|
|
|
|
#ifdef CONFIG_ISA_ARCOMPACT
|
|
|
|
;-----------------------------------------------------------------
|
|
|
|
; ARC700 Exception Handling doesn't auto-switch stack and it only provides
|
|
|
|
; ONE scratch AUX reg "ARC_REG_SCRATCH_DATA0"
|
|
|
|
;
|
|
|
|
; For Non-SMP, the scratch AUX reg is repurposed to cache task PGD, so a
|
|
|
|
; "global" is used to free-up FIRST core reg to be able to code the rest of
|
|
|
|
; exception prologue (IRQ auto-disabled on Exceptions, so it's IRQ-safe).
|
|
|
|
; Since the Fast Path TLB Miss handler is coded with 4 regs, the remaining 3
|
|
|
|
; need to be saved as well by extending the "global" to be 4 words. Hence
|
|
|
|
; ".size ex_saved_reg1, 16"
|
|
|
|
; [All of this dance is to avoid stack switching for each TLB Miss, since we
|
|
|
|
; only need to save only a handful of regs, as opposed to complete reg file]
|
|
|
|
;
|
|
|
|
; For ARC700 SMP, the "global" obviously can't be used for free up the FIRST
|
|
|
|
; core reg as it will not be SMP safe.
|
|
|
|
; Thus scratch AUX reg is used (and no longer used to cache task PGD).
|
|
|
|
; To save the rest of 3 regs - per cpu, the global is made "per-cpu".
|
|
|
|
; Epilogue thus has to locate the "per-cpu" storage for regs.
|
|
|
|
; To avoid cache line bouncing the per-cpu global is aligned/sized per
|
|
|
|
; L1_CACHE_SHIFT, despite fundamentally needing to be 12 bytes only. Hence
|
|
|
|
; ".size ex_saved_reg1, (CONFIG_NR_CPUS << L1_CACHE_SHIFT)"
|
|
|
|
|
|
|
|
; As simple as that....
|
|
|
|
;--------------------------------------------------------------------------
|
|
|
|
|
|
|
|
; scratch memory to save [r0-r3] used to code TLB refill Handler
|
|
|
|
ARCFP_DATA ex_saved_reg1
|
|
|
|
.align 1 << L1_CACHE_SHIFT
|
|
|
|
.type ex_saved_reg1, @object
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
.size ex_saved_reg1, (CONFIG_NR_CPUS << L1_CACHE_SHIFT)
|
|
|
|
ex_saved_reg1:
|
|
|
|
.zero (CONFIG_NR_CPUS << L1_CACHE_SHIFT)
|
|
|
|
#else
|
|
|
|
.size ex_saved_reg1, 16
|
|
|
|
ex_saved_reg1:
|
|
|
|
.zero 16
|
|
|
|
#endif
|
|
|
|
|
|
|
|
.macro TLBMISS_FREEUP_REGS
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
sr r0, [ARC_REG_SCRATCH_DATA0] ; freeup r0 to code with
|
|
|
|
GET_CPU_ID r0 ; get to per cpu scratch mem,
|
|
|
|
asl r0, r0, L1_CACHE_SHIFT ; cache line wide per cpu
|
|
|
|
add r0, @ex_saved_reg1, r0
|
|
|
|
#else
|
|
|
|
st r0, [@ex_saved_reg1]
|
|
|
|
mov_s r0, @ex_saved_reg1
|
|
|
|
#endif
|
|
|
|
st_s r1, [r0, 4]
|
|
|
|
st_s r2, [r0, 8]
|
|
|
|
st_s r3, [r0, 12]
|
|
|
|
|
|
|
|
; VERIFY if the ASID in MMU-PID Reg is same as
|
|
|
|
; one in Linux data structures
|
|
|
|
|
|
|
|
tlb_paranoid_check_asm
|
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro TLBMISS_RESTORE_REGS
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
GET_CPU_ID r0 ; get to per cpu scratch mem
|
|
|
|
asl r0, r0, L1_CACHE_SHIFT ; each is cache line wide
|
|
|
|
add r0, @ex_saved_reg1, r0
|
|
|
|
ld_s r3, [r0,12]
|
|
|
|
ld_s r2, [r0, 8]
|
|
|
|
ld_s r1, [r0, 4]
|
|
|
|
lr r0, [ARC_REG_SCRATCH_DATA0]
|
|
|
|
#else
|
|
|
|
mov_s r0, @ex_saved_reg1
|
|
|
|
ld_s r3, [r0,12]
|
|
|
|
ld_s r2, [r0, 8]
|
|
|
|
ld_s r1, [r0, 4]
|
|
|
|
ld_s r0, [r0]
|
|
|
|
#endif
|
|
|
|
.endm
|
|
|
|
|
|
|
|
#else /* ARCv2 */
|
|
|
|
|
|
|
|
.macro TLBMISS_FREEUP_REGS
|
|
|
|
PUSH r0
|
|
|
|
PUSH r1
|
|
|
|
PUSH r2
|
|
|
|
PUSH r3
|
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro TLBMISS_RESTORE_REGS
|
|
|
|
POP r3
|
|
|
|
POP r2
|
|
|
|
POP r1
|
|
|
|
POP r0
|
|
|
|
.endm
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
;============================================================================
|
|
|
|
; Troubleshooting Stuff
|
|
|
|
;============================================================================
|
|
|
|
|
|
|
|
; Linux keeps ASID (Address Space ID) in task->active_mm->context.asid
|
|
|
|
; When Creating TLB Entries, instead of doing 3 dependent loads from memory,
|
|
|
|
; we use the MMU PID Reg to get current ASID.
|
|
|
|
; In bizzare scenrios SW and HW ASID can get out-of-sync which is trouble.
|
|
|
|
; So we try to detect this in TLB Mis shandler
|
|
|
|
|
|
|
|
.macro tlb_paranoid_check_asm
|
|
|
|
|
|
|
|
#ifdef CONFIG_ARC_DBG_TLB_PARANOIA
|
|
|
|
|
|
|
|
GET_CURR_TASK_ON_CPU r3
|
|
|
|
ld r0, [r3, TASK_ACT_MM]
|
|
|
|
ld r0, [r0, MM_CTXT+MM_CTXT_ASID]
|
ARC: [ASID] Track ASID allocation cycles/generations
This helps remove asid-to-mm reverse map
While mm->context.id contains the ASID assigned to a process, our ASID
allocator also used asid_mm_map[] reverse map. In a new allocation
cycle (mm->ASID >= @asid_cache), the Round Robin ASID allocator used this
to check if new @asid_cache belonged to some mm2 (from prev cycle).
If so, it could locate that mm using the ASID reverse map, and mark that
mm as unallocated ASID, to force it to refresh at the time of switch_mm()
However, for SMP, the reverse map has to be maintained per CPU, so
becomes 2 dimensional, hence got rid of it.
With reverse map gone, it is NOT possible to reach out to current
assignee. So we track the ASID allocation generation/cycle and
on every switch_mm(), check if the current generation of CPU ASID is
same as mm's ASID; If not it is refreshed.
(Based loosely on arch/sh implementation)
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
breq r0, 0, 55f ; Error if no ASID allocated
|
|
|
|
|
|
|
|
lr r1, [ARC_REG_PID]
|
|
|
|
and r1, r1, 0xFF
|
|
|
|
|
ARC: [ASID] Track ASID allocation cycles/generations
This helps remove asid-to-mm reverse map
While mm->context.id contains the ASID assigned to a process, our ASID
allocator also used asid_mm_map[] reverse map. In a new allocation
cycle (mm->ASID >= @asid_cache), the Round Robin ASID allocator used this
to check if new @asid_cache belonged to some mm2 (from prev cycle).
If so, it could locate that mm using the ASID reverse map, and mark that
mm as unallocated ASID, to force it to refresh at the time of switch_mm()
However, for SMP, the reverse map has to be maintained per CPU, so
becomes 2 dimensional, hence got rid of it.
With reverse map gone, it is NOT possible to reach out to current
assignee. So we track the ASID allocation generation/cycle and
on every switch_mm(), check if the current generation of CPU ASID is
same as mm's ASID; If not it is refreshed.
(Based loosely on arch/sh implementation)
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
and r2, r0, 0xFF ; MMU PID bits only for comparison
|
|
|
|
breq r1, r2, 5f
|
|
|
|
|
ARC: [ASID] Track ASID allocation cycles/generations
This helps remove asid-to-mm reverse map
While mm->context.id contains the ASID assigned to a process, our ASID
allocator also used asid_mm_map[] reverse map. In a new allocation
cycle (mm->ASID >= @asid_cache), the Round Robin ASID allocator used this
to check if new @asid_cache belonged to some mm2 (from prev cycle).
If so, it could locate that mm using the ASID reverse map, and mark that
mm as unallocated ASID, to force it to refresh at the time of switch_mm()
However, for SMP, the reverse map has to be maintained per CPU, so
becomes 2 dimensional, hence got rid of it.
With reverse map gone, it is NOT possible to reach out to current
assignee. So we track the ASID allocation generation/cycle and
on every switch_mm(), check if the current generation of CPU ASID is
same as mm's ASID; If not it is refreshed.
(Based loosely on arch/sh implementation)
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
55:
|
|
|
|
; Error if H/w and S/w ASID don't match, but NOT if in kernel mode
|
|
|
|
lr r2, [erstatus]
|
|
|
|
bbit0 r2, STATUS_U_BIT, 5f
|
|
|
|
|
|
|
|
; We sure are in troubled waters, Flag the error, but to do so
|
|
|
|
; need to switch to kernel mode stack to call error routine
|
|
|
|
GET_TSK_STACK_BASE r3, sp
|
|
|
|
|
|
|
|
; Call printk to shoutout aloud
|
|
|
|
mov r2, 1
|
|
|
|
j print_asid_mismatch
|
|
|
|
|
|
|
|
5: ; ASIDs match so proceed normally
|
|
|
|
nop
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
.endm
|
|
|
|
|
|
|
|
;============================================================================
|
|
|
|
;TLB Miss handling Code
|
|
|
|
;============================================================================
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
|
|
|
|
; This macro does the page-table lookup for the faulting address.
|
|
|
|
; OUT: r0 = PTE faulted on, r1 = ptr to PTE, r2 = Faulting V-address
|
|
|
|
.macro LOAD_FAULT_PTE
|
|
|
|
|
|
|
|
lr r2, [efa]
|
|
|
|
|
|
|
|
#ifndef CONFIG_SMP
|
|
|
|
lr r1, [ARC_REG_SCRATCH_DATA0] ; current pgd
|
|
|
|
#else
|
|
|
|
GET_CURR_TASK_ON_CPU r1
|
|
|
|
ld r1, [r1, TASK_ACT_MM]
|
|
|
|
ld r1, [r1, MM_PGD]
|
|
|
|
#endif
|
|
|
|
|
|
|
|
lsr r0, r2, PGDIR_SHIFT ; Bits for indexing into PGD
|
|
|
|
ld.as r3, [r1, r0] ; PGD entry corresp to faulting addr
|
|
|
|
tst r3, r3
|
|
|
|
bz do_slow_path_pf ; if no Page Table, do page fault
|
|
|
|
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
and.f 0, r3, _PAGE_HW_SZ ; Is this Huge PMD (thp)
|
|
|
|
add2.nz r1, r1, r0
|
|
|
|
bnz.d 2f ; YES: PGD == PMD has THP PTE: stop pgd walk
|
|
|
|
mov.nz r0, r3
|
|
|
|
|
|
|
|
#endif
|
|
|
|
and r1, r3, PAGE_MASK
|
|
|
|
|
|
|
|
; Get the PTE entry: The idea is
|
|
|
|
; (1) x = addr >> PAGE_SHIFT -> masks page-off bits from @fault-addr
|
|
|
|
; (2) y = x & (PTRS_PER_PTE - 1) -> to get index
|
|
|
|
; (3) z = (pgtbl + y * 4)
|
|
|
|
|
|
|
|
#ifdef CONFIG_ARC_HAS_PAE40
|
|
|
|
#define PTE_SIZE_LOG 3 /* 8 == 2 ^ 3 */
|
|
|
|
#else
|
|
|
|
#define PTE_SIZE_LOG 2 /* 4 == 2 ^ 2 */
|
|
|
|
#endif
|
|
|
|
|
|
|
|
; multiply in step (3) above avoided by shifting lesser in step (1)
|
|
|
|
lsr r0, r2, ( PAGE_SHIFT - PTE_SIZE_LOG )
|
|
|
|
and r0, r0, ( (PTRS_PER_PTE - 1) << PTE_SIZE_LOG )
|
|
|
|
ld.aw r0, [r1, r0] ; r0: PTE (lower word only for PAE40)
|
|
|
|
; r1: PTE ptr
|
|
|
|
|
|
|
|
2:
|
|
|
|
|
|
|
|
.endm
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------
|
|
|
|
; Convert Linux PTE entry into TLB entry
|
|
|
|
; A one-word PTE entry is programmed as two-word TLB Entry [PD0:PD1] in mmu
|
|
|
|
; (for PAE40, two-words PTE, while three-word TLB Entry [PD0:PD1:PD1HI])
|
|
|
|
; IN: r0 = PTE, r1 = ptr to PTE
|
|
|
|
|
|
|
|
.macro CONV_PTE_TO_TLB
|
|
|
|
and r3, r0, PTE_BITS_RWX ; r w x
|
|
|
|
asl r2, r3, 3 ; Kr Kw Kx 0 0 0 (GLOBAL, kernel only)
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
and.f 0, r0, _PAGE_GLOBAL
|
|
|
|
or.z r2, r2, r3 ; Kr Kw Kx Ur Uw Ux (!GLOBAL, user page)
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
|
|
|
|
and r3, r0, PTE_BITS_NON_RWX_IN_PD1 ; Extract PFN+cache bits from PTE
|
|
|
|
or r3, r3, r2
|
|
|
|
|
|
|
|
sr r3, [ARC_REG_TLBPD1] ; paddr[31..13] | Kr Kw Kx Ur Uw Ux | C
|
|
|
|
#ifdef CONFIG_ARC_HAS_PAE40
|
|
|
|
ld r3, [r1, 4] ; paddr[39..32]
|
|
|
|
sr r3, [ARC_REG_TLBPD1HI]
|
|
|
|
#endif
|
|
|
|
|
|
|
|
and r2, r0, PTE_BITS_IN_PD0 ; Extract other PTE flags: (V)alid, (G)lb
|
|
|
|
|
|
|
|
lr r3,[ARC_REG_TLBPD0] ; MMU prepares PD0 with vaddr and asid
|
|
|
|
|
|
|
|
or r3, r3, r2 ; S | vaddr | {sasid|asid}
|
|
|
|
sr r3,[ARC_REG_TLBPD0] ; rewrite PD0
|
|
|
|
.endm
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------
|
|
|
|
; Commit the TLB entry into MMU
|
|
|
|
|
|
|
|
.macro COMMIT_ENTRY_TO_MMU
|
|
|
|
#if (CONFIG_ARC_MMU_VER < 4)
|
|
|
|
|
|
|
|
#ifdef CONFIG_EZNPS_MTM_EXT
|
|
|
|
/* verify if entry for this vaddr+ASID already exists */
|
|
|
|
sr TLBProbe, [ARC_REG_TLBCOMMAND]
|
|
|
|
lr r0, [ARC_REG_TLBINDEX]
|
|
|
|
bbit0 r0, 31, 88f
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Get free TLB slot: Set = computed from vaddr, way = random */
|
|
|
|
sr TLBGetIndex, [ARC_REG_TLBCOMMAND]
|
|
|
|
|
|
|
|
/* Commit the Write */
|
|
|
|
#if (CONFIG_ARC_MMU_VER >= 2) /* introduced in v2 */
|
|
|
|
sr TLBWriteNI, [ARC_REG_TLBCOMMAND]
|
|
|
|
#else
|
|
|
|
sr TLBWrite, [ARC_REG_TLBCOMMAND]
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#else
|
|
|
|
sr TLBInsertEntry, [ARC_REG_TLBCOMMAND]
|
|
|
|
#endif
|
|
|
|
|
|
|
|
88:
|
|
|
|
.endm
|
|
|
|
|
|
|
|
|
|
|
|
ARCFP_CODE ;Fast Path Code, candidate for ICCM
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
|
|
|
|
; I-TLB Miss Exception Handler
|
|
|
|
;-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
ENTRY(EV_TLBMissI)
|
|
|
|
|
|
|
|
TLBMISS_FREEUP_REGS
|
|
|
|
|
|
|
|
;----------------------------------------------------------------
|
|
|
|
; Get the PTE corresponding to V-addr accessed, r2 is setup with EFA
|
|
|
|
LOAD_FAULT_PTE
|
|
|
|
|
|
|
|
;----------------------------------------------------------------
|
|
|
|
; VERIFY_PTE: Check if PTE permissions approp for executing code
|
|
|
|
cmp_s r2, VMALLOC_START
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
mov_s r2, (_PAGE_PRESENT | _PAGE_EXECUTE)
|
|
|
|
or.hs r2, r2, _PAGE_GLOBAL
|
|
|
|
|
|
|
|
and r3, r0, r2 ; Mask out NON Flag bits from PTE
|
|
|
|
xor.f r3, r3, r2 ; check ( ( pte & flags_test ) == flags_test )
|
|
|
|
bnz do_slow_path_pf
|
|
|
|
|
|
|
|
; Let Linux VM know that the page was accessed
|
|
|
|
or r0, r0, _PAGE_ACCESSED ; set Accessed Bit
|
|
|
|
st_s r0, [r1] ; Write back PTE
|
|
|
|
|
|
|
|
CONV_PTE_TO_TLB
|
|
|
|
COMMIT_ENTRY_TO_MMU
|
|
|
|
TLBMISS_RESTORE_REGS
|
|
|
|
EV_TLBMissI_fast_ret: ; additional label for VDK OS-kit instrumentation
|
|
|
|
rtie
|
|
|
|
|
|
|
|
END(EV_TLBMissI)
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
|
|
|
|
; D-TLB Miss Exception Handler
|
|
|
|
;-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
ENTRY(EV_TLBMissD)
|
|
|
|
|
|
|
|
TLBMISS_FREEUP_REGS
|
|
|
|
|
|
|
|
;----------------------------------------------------------------
|
|
|
|
; Get the PTE corresponding to V-addr accessed
|
|
|
|
; If PTE exists, it will setup, r0 = PTE, r1 = Ptr to PTE, r2 = EFA
|
|
|
|
LOAD_FAULT_PTE
|
|
|
|
|
|
|
|
;----------------------------------------------------------------
|
|
|
|
; VERIFY_PTE: Chk if PTE permissions approp for data access (R/W/R+W)
|
|
|
|
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
cmp_s r2, VMALLOC_START
|
|
|
|
mov_s r2, _PAGE_PRESENT ; common bit for K/U PTE
|
|
|
|
or.hs r2, r2, _PAGE_GLOBAL ; kernel PTE only
|
|
|
|
|
|
|
|
; Linux PTE [RWX] bits are semantically overloaded:
|
|
|
|
; -If PAGE_GLOBAL set, they refer to kernel-only flags (vmalloc)
|
|
|
|
; -Otherwise they are user-mode permissions, and those are exactly
|
|
|
|
; same for kernel mode as well (e.g. copy_(to|from)_user)
|
|
|
|
|
|
|
|
lr r3, [ecr]
|
|
|
|
btst_s r3, ECR_C_BIT_DTLB_LD_MISS ; Read Access
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
or.nz r2, r2, _PAGE_READ ; chk for Read flag in PTE
|
|
|
|
btst_s r3, ECR_C_BIT_DTLB_ST_MISS ; Write Access
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
12 years ago
|
|
|
or.nz r2, r2, _PAGE_WRITE ; chk for Write flag in PTE
|
|
|
|
; Above laddering takes care of XCHG access (both R and W)
|
|
|
|
|
|
|
|
; By now, r2 setup with all the Flags we need to check in PTE
|
|
|
|
and r3, r0, r2 ; Mask out NON Flag bits from PTE
|
|
|
|
brne.d r3, r2, do_slow_path_pf ; is ((pte & flags_test) == flags_test)
|
|
|
|
|
|
|
|
;----------------------------------------------------------------
|
|
|
|
; UPDATE_PTE: Let Linux VM know that page was accessed/dirty
|
|
|
|
lr r3, [ecr]
|
|
|
|
or r0, r0, _PAGE_ACCESSED ; Accessed bit always
|
|
|
|
btst_s r3, ECR_C_BIT_DTLB_ST_MISS ; See if it was a Write Access ?
|
|
|
|
or.nz r0, r0, _PAGE_DIRTY ; if Write, set Dirty bit as well
|
|
|
|
st_s r0, [r1] ; Write back PTE
|
|
|
|
|
|
|
|
CONV_PTE_TO_TLB
|
|
|
|
|
|
|
|
#if (CONFIG_ARC_MMU_VER == 1)
|
|
|
|
; MMU with 2 way set assoc J-TLB, needs some help in pathetic case of
|
|
|
|
; memcpy where 3 parties contend for 2 ways, ensuing a livelock.
|
|
|
|
; But only for old MMU or one with Metal Fix
|
|
|
|
TLB_WRITE_HEURISTICS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
COMMIT_ENTRY_TO_MMU
|
|
|
|
TLBMISS_RESTORE_REGS
|
|
|
|
EV_TLBMissD_fast_ret: ; additional label for VDK OS-kit instrumentation
|
|
|
|
rtie
|
|
|
|
|
|
|
|
;-------- Common routine to call Linux Page Fault Handler -----------
|
|
|
|
do_slow_path_pf:
|
|
|
|
|
|
|
|
; Restore the 4-scratch regs saved by fast path miss handler
|
|
|
|
TLBMISS_RESTORE_REGS
|
|
|
|
|
|
|
|
; Slow path TLB Miss handled as a regular ARC Exception
|
|
|
|
; (stack switching / save the complete reg-file).
|
|
|
|
b call_do_page_fault
|
|
|
|
END(EV_TLBMissD)
|