hash table
Benjamin Herrenschmidt
benh at kernel.crashing.org
Tue Dec 9 18:56:35 EST 2003
Here's a first shot at my rework of __hash_page, if you want
to have a quick look... I did a few tests but didn't really
stress the box that badly, so there may be bugs in there.
At this point, the goal isn't (yet) performance; it is to get rid
of the page table lock in hash_page().
(Though my simple tests showed an approximate 10% improvement in hash_page duration.)
There is still room for optimisation. It would be nice, for example,
to move the lazy cache flush to asm to avoid the overhead of function
calls & additional stack frames, and I could rewrite the non-HV
versions of the low-level ppc_md. functions in asm with some wins,
I suppose, looking at the C code... There is also room for
optimisation in my asm code (like some bit manipulations
or better scheduling).
I think there is no race with the flush code. The reason is that
the case where flush is called on a present page seems to be strictly
limited to a PP bits update (or an accessed bits update in some
error case, but we can dismiss that one completely, I believe).
Since flush uses pte_update, it will not race with a pending
__hash_page. The only possible race is a __hash_page occurring during
a flush. But in this case, the PTE will have the new PP bits already,
so at worst, we exit flush with an entry present... but that entry has the
new bits. So it's ok. I don't think we can race on the content of
the HPTE either, as we have the HPTE lock bit there. I'd still
like your point of view, though.
Ben.
diff -urN linux-g5-ppc64/arch/ppc64/kernel/htab.c linux-g5-htab/arch/ppc64/kernel/htab.c
--- linux-g5-ppc64/arch/ppc64/kernel/htab.c 2003-12-08 20:27:20.084329896 +1100
+++ linux-g5-htab/arch/ppc64/kernel/htab.c 2003-12-09 18:15:10.064315512 +1100
@@ -27,6 +27,7 @@
#include <linux/sysctl.h>
#include <linux/ctype.h>
#include <linux/cache.h>
+#include <linux/init.h>
#include <asm/ppcdebug.h>
#include <asm/processor.h>
@@ -129,7 +130,7 @@
}
}
-void
+void __init
htab_initialize(void)
{
unsigned long table, htab_size_bytes;
@@ -231,6 +232,47 @@
}
/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+ struct page *page;
+
+#define PPC64_HWNOEXEC (1 << 2)
+
+ if (!pfn_valid(pte_pfn(pte)))
+ return pp;
+
+ page = pte_page(pte);
+
+ /* page is dirty */
+ if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) {
+ if (trap == 0x400) {
+ __flush_dcache_icache(page_address(page));
+ set_bit(PG_arch_1, &page->flags);
+ } else
+ pp |= PPC64_HWNOEXEC;
+ }
+ return pp;
+}
+
+/*
+ * Called by asm hashtable.S in case of critical insert failure
+ */
+void htab_insert_failure(void)
+{
+ panic("hash_page: pte_insert failed\n");
+}
+
+/*
+ * Handle a fault by adding an HPTE. If the address can't be determined
+ * to be valid via Linux page tables, return 1. If handled return 0
+ */
+extern int __hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
+ pte_t *ptep, unsigned long trap, int local);
+
+#if 0
+/*
* Handle a fault by adding an HPTE. If the address can't be determined
* to be valid via Linux page tables, return 1. If handled return 0
*/
@@ -380,6 +422,7 @@
return 0;
}
+#endif
int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
{
@@ -444,24 +487,20 @@
if (pgdir == NULL)
return 1;
- /*
- * Lock the Linux page table to prevent mmap and kswapd
- * from modifying entries while we search and update
- */
- spin_lock(&mm->page_table_lock);
-
tmp = cpumask_of_cpu(smp_processor_id());
if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
local = 1;
- ret = hash_huge_page(mm, access, ea, vsid, local);
- if (ret < 0) {
+ /* Is this a huge page ? */
+ if (unlikely(in_hugepage_area(mm->context, ea)))
+ ret = hash_huge_page(mm, access, ea, vsid, local);
+ else {
ptep = find_linux_pte(pgdir, ea);
+ if (ptep == NULL)
+ return 1;
ret = __hash_page(ea, access, vsid, ptep, trap, local);
}
- spin_unlock(&mm->page_table_lock);
-
#ifdef CONFIG_HTABLE_STATS
if (ret == 0) {
duration = mftb() - duration;
@@ -519,3 +558,26 @@
local);
}
}
+
+static inline void make_bl(unsigned int *insn_addr, void *func)
+{
+ unsigned long funcp = *((unsigned long *)func);
+ int offset = funcp - (unsigned long)insn_addr;
+
+ *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
+ flush_icache_range((unsigned long)insn_addr, 4+
+ (unsigned long)insn_addr);
+}
+
+void __init htab_finish_init(void)
+{
+ extern unsigned int *htab_call_hpte_insert1;
+ extern unsigned int *htab_call_hpte_insert2;
+ extern unsigned int *htab_call_hpte_remove;
+ extern unsigned int *htab_call_hpte_updatepp;
+
+ make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
+ make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
+ make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
+ make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
+}
diff -urN linux-g5-ppc64/arch/ppc64/kernel/setup.c linux-g5-htab/arch/ppc64/kernel/setup.c
--- linux-g5-ppc64/arch/ppc64/kernel/setup.c 2003-12-08 20:15:07.922635392 +1100
+++ linux-g5-htab/arch/ppc64/kernel/setup.c 2003-12-09 18:14:21.331723992 +1100
@@ -246,6 +246,10 @@
pmac_init(r3, r4, r5, r6, r7);
}
#endif
+ /* Finish initializing the hash table (do the dynamic
+ * patching for the fast-path hashtable.S code)
+ */
+ htab_finish_init();
printk("Starting Linux PPC64 %s\n", UTS_RELEASE);
diff -urN linux-g5-ppc64/arch/ppc64/mm/Makefile linux-g5-htab/arch/ppc64/mm/Makefile
--- linux-g5-ppc64/arch/ppc64/mm/Makefile 2003-11-19 21:20:09.000000000 +1100
+++ linux-g5-htab/arch/ppc64/mm/Makefile 2003-12-08 17:33:31.452722880 +1100
@@ -4,6 +4,6 @@
EXTRA_CFLAGS += -mno-minimal-toc
-obj-y := fault.o init.o extable.o imalloc.o
+obj-y := fault.o init.o extable.o imalloc.o hashtable.o
obj-$(CONFIG_DISCONTIGMEM) += numa.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff -urN linux-g5-ppc64/arch/ppc64/mm/hashtable.S linux-g5-htab/arch/ppc64/mm/hashtable.S
--- linux-g5-ppc64/arch/ppc64/mm/hashtable.S Thu Jan 01 10:00:00 1970
+++ linux-g5-htab/arch/ppc64/mm/hashtable.S Tue Dec 09 18:54:52 2003
@@ -0,0 +1,289 @@
+/*
+ * ppc64 MMU hashtable management routines
+ *
+ * (c) Copyright IBM Corp. 2003
+ *
+ * Maintained by: Benjamin Herrenschmidt
+ * <benh at kernel.crashing.org>
+ *
+ * This file is covered by the GNU Public Licence v2 as
+ * described in the kernel's COPYING file.
+ */
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/cputable.h>
+
+ .text
+
+/*
+ * Stackframe:
+ *
+ * +-> Back chain (SP + 256)
+ * | General register save area (SP + 112)
+ * | Parameter save area (SP + 48)
+ * | TOC save area (SP + 40)
+ * | link editor doubleword (SP + 32)
+ * | compiler doubleword (SP + 24)
+ * | LR save area (SP + 16)
+ * | CR save area (SP + 8)
+ * SP ---> +-- Back chain (SP + 0)
+ */
+#define STACKFRAMESIZE 256
+
+/* Save parameters offsets */
+#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8)
+
+/* Save non-volatile offsets */
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+/*
+ * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
+ * pte_t *ptep, unsigned long trap, int local)
+ *
+ * Adds a page to the hash table. This is the non-LPAR version for now
+ */
+
+_GLOBAL(__hash_page)
+ mflr r0
+ std r0,16(r1)
+ stdu r1,-STACKFRAMESIZE(r1)
+ /* Save all params that we need after a function call */
+ std r6,STK_PARM(r6)(r1)
+ std r8,STK_PARM(r8)(r1)
+
+ /* Add _PAGE_PRESENT to access */
+ ori r4,r4,_PAGE_PRESENT
+
+ /* Save non-volatile registers.
+ * r31 will hold "old PTE"
+ * r30 is "new PTE"
+ * r29 is "va"
+ * r28 is a hash value
+ * r27 is hashtab mask (maybe dynamic patched instead ?)
+ */
+ std r27,STK_REG(r27)(r1)
+ std r28,STK_REG(r28)(r1)
+ std r29,STK_REG(r29)(r1)
+ std r30,STK_REG(r30)(r1)
+ std r31,STK_REG(r31)(r1)
+
+ /* Step 1:
+ *
+ * Check permissions, atomically mark the linux PTE busy
+ * and hashed.
+ */
+1:
+ ldarx r31,0,r6
+ /* Check access rights (access & ~(pte_val(*ptep))) */
+ andc. r0,r4,r31
+ bne- htab_wrong_access
+ /* Check if PTE is busy */
+ andi. r0,r31,_PAGE_BUSY
+ bne- 1b
+ /* Prepare new PTE value (turn access RW into DIRTY, then
+ * add BUSY,HASHPTE and ACCESSED)
+ */
+ rlwinm r30,r4,5,24,24 /* _PAGE_RW -> _PAGE_DIRTY */
+ or r30,r30,r31
+ ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+ /* Write the linux PTE atomically (setting busy) */
+ stdcx. r30,0,r6
+ bne- 1b
+
+
+ /* Step 2:
+ *
+ * Insert/Update the HPTE in the hash table. At this point,
+ * r4 (access) is re-useable, we use it for the new HPTE flags
+ */
+
+ /* Calc va and put it in r29 */
+ rldicr r29,r5,28,63-28
+ rldicl r3,r3,0,36
+ or r29,r3,r29
+
+ /* Calculate hash value for primary slot and store it in r28 */
+ rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
+ rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
+ xor r28,r5,r0
+
+ /* Convert linux PTE bits into HW equivalents. Fix using
+ * mask inserts instead
+ */
+ rlwinm r3,r30,32-1,31,31 /* _PAGE_USER -> PP lsb */
+ rlwinm r0,r30,32-2,31,31 /* _PAGE_RW -> r0 lsb */
+ rlwinm r4,r30,32-7,31,31 /* _PAGE_DIRTY -> r4 lsb */
+ and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 lsb */
+ andc r3,r3,r0 /* PP lsb &= ~(PAGE_RW & _PAGE_DIRTY) */
+ andi. r4,r30,_PAGE_USER /* _PAGE_USER -> r4 msb */
+ or r3,r3,r4 /* PP msb = r4 msb */
+ andi. r0,r30,0x1f8 /* Add in other flags */
+ or r3,r3,r0
+
+ /* We eventually do the icache sync here (maybe inline that
+ * code rather than call a C function...
+ */
+BEGIN_FTR_SECTION
+ mr r4,r30
+ mr r5,r7
+ bl .hash_page_do_lazy_icache
+END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE)
+
+ /* At this point, r3 contains new PP bits, save them in
+ * place of "access" in the param area (sic)
+ */
+ std r3,STK_PARM(r4)(r1)
+
+ /* Get htab_hash_mask */
+ ld r4,htab_data@got(2)
+ ld r27,16(r4) /* htab_data.htab_hash_mask -> r27 */
+
+ /* Check if we may already be in the hashtable, in this case, we
+ * go to out-of-line code to try to modify the HPTE
+ */
+ andi. r0,r31,_PAGE_HASHPTE
+ bne htab_modify_pte
+
+htab_insert_pte:
+ /* Clear hpte bits in new pte (we also clear BUSY btw) and
+ * add _PAGE_HASHPTE
+ */
+ lis r0,_PAGE_HPTEFLAGS@h
+ ori r0,r0,_PAGE_HPTEFLAGS@l
+ andc r30,r30,r0
+ ori r30,r30,_PAGE_HASHPTE
+
+1:
+ /* page number in r5 */
+ rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+ /* Calculate primary group hash */
+ and r0,r28,r27
+ rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
+
+ /* Call ppc_md.hpte_insert */
+ ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */
+ mr r4,r29 /* Retreive va */
+ li r6,0 /* primary slot */
+ li r8,0 /* not bolted and not large */
+ li r9,0
+_GLOBAL(htab_call_hpte_insert1)
+ bl . /* Will be patched by htab_finish_init() */
+ cmpi 0,r3,0
+ bge htab_pte_insert_ok /* Insertion successful */
+ cmpi 0,r3,-2 /* Critical failure */
+ beq- htab_pte_insert_failure
+
+ /* Now try secondary slot */
+ ori r30,r30,_PAGE_SECONDARY
+
+ /* page number in r5 */
+ rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+ /* Calculate secondary group hash */
+ not r3,r28
+ and r0,r3,r27
+ rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
+
+ /* Call ppc_md.hpte_insert */
+ ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */
+ mr r4,r29 /* Retreive va */
+ li r6,1 /* secondary slot */
+ li r8,0 /* not bolted and not large */
+ li r9,0
+_GLOBAL(htab_call_hpte_insert2)
+ bl . /* Will be patched by htab_finish_init() */
+ cmpi 0,r3,0
+ bge+ htab_pte_insert_ok /* Insertion successful */
+ cmpi 0,r3,-2 /* Critical failure */
+ beq- htab_pte_insert_failure
+
+ /* Both are full, we need to evict something */
+ mftb r0
+ /* Pick a random group based on TB */
+ andi. r0,r0,1
+ mr r5,r28
+ bne 2f
+ not r5,r5
+2: and r0,r5,r27
+ rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
+ /* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+ bl . /* Will be patched by htab_finish_init() */
+
+ /* Try all again */
+ b 1b
+
+htab_pte_insert_ok:
+ /* Insert slot number in PTE */
+ rldimi r30,r3,12,63-14
+
+ /* Write out the PTE with a normal write
+ * (maybe add eieio may be good still ?)
+ */
+htab_write_out_pte:
+ ld r6,STK_PARM(r6)(r1)
+ std r30,0(r6)
+ li r3, 0
+bail:
+ ld r27,STK_REG(r27)(r1)
+ ld r28,STK_REG(r28)(r1)
+ ld r29,STK_REG(r29)(r1)
+ ld r30,STK_REG(r30)(r1)
+ ld r31,STK_REG(r31)(r1)
+ addi r1,r1,STACKFRAMESIZE
+ ld r0,16(r1)
+ mtlr r0
+ blr
+
+htab_modify_pte:
+ /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+ mr r4,r3
+ rlwinm r3,r31,32-12,29,31
+
+ /* Secondary group ? if yes, get a inverted hash value */
+ mr r5,r28
+ andi. r0,r31,_PAGE_SECONDARY
+ beq 1f
+ not r5,r5
+1:
+ /* Calculate proper slot value for ppc_md.hpte_updatepp */
+ and r0,r5,r27
+ rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */
+ add r3,r0,r3 /* add slot idx */
+
+ /* Call ppc_md.hpte_updatepp */
+ mr r5,r29 /* va */
+ li r6,0 /* large is 0 */
+ ld r7,STK_PARM(r8)(r1) /* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+ bl . /* Will be patched by htab_finish_init() */
+
+ /* if we failed because typically the HPTE wasn't really here
+ * we try an insertion.
+ */
+ cmpi 0,r3,-1
+ beq- htab_insert_pte
+
+ /* Clear the BUSY bit and Write out the PTE */
+ li r0,_PAGE_BUSY
+ andc r30,r30,r0
+ b htab_write_out_pte
+
+htab_wrong_access:
+ /* Bail out clearing reservation */
+ stdcx. r31,0,r6
+ li r3,1
+ b bail
+
+htab_pte_insert_failure:
+ b .htab_insert_failure
+
+
diff -urN linux-g5-ppc64/arch/ppc64/mm/hugetlbpage.c linux-g5-htab/arch/ppc64/mm/hugetlbpage.c
--- linux-g5-ppc64/arch/ppc64/mm/hugetlbpage.c 2003-12-02 13:11:59.000000000 +1100
+++ linux-g5-htab/arch/ppc64/mm/hugetlbpage.c 2003-12-08 15:52:18.100012832 +1100
@@ -655,10 +655,6 @@
unsigned long hpteflags, prpn;
long slot;
- /* Is this for us? */
- if (!in_hugepage_area(mm->context, ea))
- return -1;
-
ea &= ~(HPAGE_SIZE-1);
/* We have to find the first hugepte in the batch, since
diff -urN linux-g5-ppc64/include/asm-ppc64/mmu.h linux-g5-htab/include/asm-ppc64/mmu.h
--- linux-g5-ppc64/include/asm-ppc64/mmu.h 2003-12-01 14:40:29.000000000 +1100
+++ linux-g5-htab/include/asm-ppc64/mmu.h 2003-12-09 17:23:43.436554256 +1100
@@ -225,6 +225,8 @@
asm volatile("ptesync": : :"memory");
}
+extern void htab_finish_init(void);
+
#endif /* __ASSEMBLY__ */
/*
diff -urN linux-g5-ppc64/include/asm-ppc64/pgtable.h linux-g5-htab/include/asm-ppc64/pgtable.h
--- linux-g5-ppc64/include/asm-ppc64/pgtable.h 2003-12-05 13:58:59.000000000 +1100
+++ linux-g5-htab/include/asm-ppc64/pgtable.h 2003-12-09 13:57:28.174879992 +1100
@@ -74,22 +74,23 @@
* Bits in a linux-style PTE. These match the bits in the
* (hardware-defined) PowerPC PTE as closely as possible.
*/
-#define _PAGE_PRESENT 0x001UL /* software: pte contains a translation */
-#define _PAGE_USER 0x002UL /* matches one of the PP bits */
-#define _PAGE_RW 0x004UL /* software: user write access allowed */
-#define _PAGE_GUARDED 0x008UL
-#define _PAGE_COHERENT 0x010UL /* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE 0x020UL /* I: cache inhibit */
-#define _PAGE_WRITETHRU 0x040UL /* W: cache write-through */
-#define _PAGE_DIRTY 0x080UL /* C: page changed */
-#define _PAGE_ACCESSED 0x100UL /* R: page referenced */
-#define _PAGE_FILE 0x200UL /* software: pte holds file offset */
-#define _PAGE_HASHPTE 0x400UL /* software: pte has an associated HPTE */
-#define _PAGE_EXEC 0x800UL /* software: i-cache coherence required */
-#define _PAGE_SECONDARY 0x8000UL /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX 0x7000UL /* software: HPTE index within group */
+#define _PAGE_PRESENT 0x0001 /* software: pte contains a translation */
+#define _PAGE_USER 0x0002 /* matches one of the PP bits */
+#define _PAGE_FILE 0x0002 /* (!present only) software: pte holds file offset */
+#define _PAGE_RW 0x0004 /* software: user write access allowed */
+#define _PAGE_GUARDED 0x0008
+#define _PAGE_COHERENT 0x0010 /* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE 0x0020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU 0x0040 /* W: cache write-through */
+#define _PAGE_DIRTY 0x0080 /* C: page changed */
+#define _PAGE_ACCESSED 0x0100 /* R: page referenced */
+#define _PAGE_EXEC 0x0200 /* software: i-cache coherence required */
+#define _PAGE_HASHPTE 0x0400 /* software: pte has an associated HPTE */
+#define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */
+#define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
+#define _PAGE_GROUP_IX 0x7000 /* software: HPTE index within group */
/* Bits 0x7000 identify the index within an HPT Group */
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
/* PAGE_MASK gives the right answer below, but only by accident */
/* It should be preserving the high 48 bits and then specifically */
/* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
@@ -157,8 +158,10 @@
#define _PMD_HUGEPAGE 0x00000001U
#define HUGEPTE_BATCH_SIZE (1<<(HPAGE_SHIFT-PMD_SHIFT))
+#ifndef __ASSEMBLY__
int hash_huge_page(struct mm_struct *mm, unsigned long access,
unsigned long ea, unsigned long vsid, int local);
+#endif /* __ASSEMBLY__ */
#define HAVE_ARCH_UNMAPPED_AREA
#else
@@ -288,9 +291,12 @@
unsigned long set )
{
unsigned long old, tmp;
-
+ extern void udbg_putc(unsigned char c);
+
__asm__ __volatile__(
"1: ldarx %0,0,%3 # pte_update\n\
+ andi. %1,%0,0x0800 \n\
+ bne- 1b \n\
andc %1,%0,%4 \n\
or %1,%1,%5 \n\
stdcx. %1,0,%3 \n\
** Sent via the linuxppc64-dev mail list. See http://lists.linuxppc.org/
More information about the Linuxppc64-dev
mailing list