Dynamic hugepage for ppc64
David Gibson
david at gibson.dropbear.id.au
Tue Jun 7 15:15:05 EST 2005
Presently, 64-bit applications on ppc64 may only use hugepages in the
address region from 1-1.5T. Furthermore, if hugepages are enabled in
the kernel config, they may only use hugepages and never normal pages
in this area. This patch relaxes this restriction, allowing any
address to be used with hugepages, but with a 1TB granularity. That
is if you map a hugepage anywhere in the region 1TB-2TB, that entire
area will be reserved exclusively for hugepages for the remainder of
the process's lifetime. This works analagously to hugepages in 32-bit
applications, where hugepages can be mapped anywhere, but with 256MB
(mmu segment) granularity.
This patch must be applied on top of my 4-level pagetables patch.
Index: working-2.6/arch/ppc64/kernel/asm-offsets.c
===================================================================
--- working-2.6.orig/arch/ppc64/kernel/asm-offsets.c 2005-05-24 14:12:22.000000000 +1000
+++ working-2.6/arch/ppc64/kernel/asm-offsets.c 2005-06-07 13:34:59.000000000 +1000
@@ -95,7 +95,8 @@
DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
#ifdef CONFIG_HUGETLB_PAGE
- DEFINE(PACAHTLBSEGS, offsetof(struct paca_struct, context.htlb_segs));
+ DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
+ DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
#endif /* CONFIG_HUGETLB_PAGE */
DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr));
DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen));
Index: working-2.6/arch/ppc64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/hugetlbpage.c 2005-06-07 13:26:15.000000000 +1000
+++ working-2.6/arch/ppc64/mm/hugetlbpage.c 2005-06-07 13:45:01.000000000 +1000
@@ -131,15 +131,15 @@
return 0;
}
-static void flush_segments(void *parm)
+static void flush_low_segments(void *parm)
{
- u16 segs = (unsigned long) parm;
+ u16 areas = (unsigned long) parm;
unsigned long i;
asm volatile("isync" : : : "memory");
for (i = 0; i < 16; i++) {
- if (! (segs & (1U << i)))
+ if (! (areas & (1U << i)))
continue;
asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
}
@@ -147,6 +147,24 @@
asm volatile("isync" : : : "memory");
}
+static void flush_high_segments(void *parm)
+{
+ u16 areas = (unsigned long) parm;
+ unsigned long i, j;
+
+ asm volatile("isync" : : : "memory");
+
+ for (i = 0; i < 16; i++) {
+ if (! (areas & (1U << i)))
+ continue;
+ for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
+ asm volatile("slbie %0"
+ :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT)));
+ }
+
+ asm volatile("isync" : : : "memory");
+}
+
static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
{
unsigned long start = seg << SID_SHIFT;
@@ -163,20 +181,36 @@
return 0;
}
-static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
+static int prepare_high_zone_for_htlb(struct mm_struct *mm, unsigned long zone)
+{
+ unsigned long start = zone << HTLB_AREA_SHIFT;
+ unsigned long end = (zone+1) << HTLB_AREA_SHIFT;
+ struct vm_area_struct *vma;
+
+ BUG_ON(zone >= 16);
+
+ /* Check no VMAs are in the region */
+ vma = find_vma(mm, start);
+ if (vma && (vma->vm_start < end))
+ return -EBUSY;
+
+ return 0;
+}
+
+static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
unsigned long i;
- newsegs &= ~(mm->context.htlb_segs);
- if (! newsegs)
+ newareas &= ~(mm->context.low_htlb_areas);
+ if (! newareas)
return 0; /* The segments we want are already open */
for (i = 0; i < 16; i++)
- if ((1 << i) & newsegs)
+ if ((1 << i) & newareas)
if (prepare_low_seg_for_htlb(mm, i) != 0)
return -EBUSY;
- mm->context.htlb_segs |= newsegs;
+ mm->context.low_htlb_areas |= newareas;
/* update the paca copy of the context struct */
get_paca()->context = mm->context;
@@ -184,29 +218,58 @@
/* the context change must make it to memory before the flush,
* so that further SLB misses do the right thing. */
mb();
- on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
+ on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+ return 0;
+}
+
+static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+ unsigned long i;
+
+ newareas &= ~(mm->context.high_htlb_areas);
+ if (! newareas)
+ return 0; /* The areas we want are already open */
+
+ for (i = 0; i < 16; i++)
+ if ((1 << i) & newareas)
+ if (prepare_high_zone_for_htlb(mm, i) != 0)
+ return -EBUSY;
+
+ mm->context.high_htlb_areas |= newareas;
+
+ /* update the paca copy of the context struct */
+ get_paca()->context = mm->context;
+
+ /* the context change must make it to memory before the flush,
+ * so that further SLB misses do the right thing. */
+ mb();
+ on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
return 0;
}
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
- if (within_hugepage_high_range(addr, len))
- return 0;
- else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
- int err;
- /* Yes, we need both tests, in case addr+len overflows
- * 64-bit arithmetic */
- err = open_low_hpage_segs(current->mm,
+ int err;
+
+ if ( (addr+len) < addr )
+ return -EINVAL;
+
+ if ((addr + len) < 0x100000000UL)
+ err = open_low_hpage_areas(current->mm,
LOW_ESID_MASK(addr, len));
- if (err)
- printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
- " failed (segs: 0x%04hx)\n", addr, len,
- LOW_ESID_MASK(addr, len));
+ else
+ err = open_high_hpage_areas(current->mm,
+ HTLB_AREA_MASK(addr, len));
+ if (err) {
+ printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+ " failed (areas: 0x%04hx, areas: 0x%04hx)\n", addr, len,
+ LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
return err;
}
- return -EINVAL;
+ return 0;
}
struct page *
@@ -273,8 +336,8 @@
vma = find_vma(mm, addr);
continue;
}
- if (touches_hugepage_high_range(addr, len)) {
- addr = TASK_HPAGE_END;
+ if (touches_hugepage_high_range(mm, addr, len)) {
+ addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
vma = find_vma(mm, addr);
continue;
}
@@ -345,8 +408,9 @@
if (touches_hugepage_low_range(mm, addr, len)) {
addr = (addr & ((~0) << SID_SHIFT)) - len;
goto hugepage_recheck;
- } else if (touches_hugepage_high_range(addr, len)) {
- addr = TASK_HPAGE_BASE - len;
+ } else if (touches_hugepage_high_range(mm, addr, len)) {
+ addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
+ goto hugepage_recheck;
}
/*
@@ -426,23 +490,28 @@
return -ENOMEM;
}
-static unsigned long htlb_get_high_area(unsigned long len)
+static unsigned long htlb_get_high_area(unsigned long len, u16 zonemask)
{
- unsigned long addr = TASK_HPAGE_BASE;
+ unsigned long addr = 0x100000000UL;
struct vm_area_struct *vma;
vma = find_vma(current->mm, addr);
- for (vma = find_vma(current->mm, addr);
- addr + len <= TASK_HPAGE_END;
- vma = vma->vm_next) {
+ while (addr + len <= TASK_SIZE_USER64) {
BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
- BUG_ON(! within_hugepage_high_range(addr, len));
+
+ if (! __within_hugepage_high_range(addr, len, zonemask)) {
+ addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+ vma = find_vma(current->mm, addr);
+ continue;
+ }
if (!vma || (addr + len) <= vma->vm_start)
return addr;
addr = ALIGN(vma->vm_end, HPAGE_SIZE);
- /* Because we're in a hugepage region, this alignment
- * should not skip us over any VMAs */
+ /* Depending on segmask this might not be a confirmed
+ * hugepage region, so the ALIGN could have skipped
+ * some VMAs */
+ vma = find_vma(current->mm, addr);
}
return -ENOMEM;
@@ -460,11 +529,11 @@
if (test_thread_flag(TIF_32BIT)) {
int lastshift = 0;
- u16 segmask, cursegs = current->mm->context.htlb_segs;
+ u16 segmask, curareas = current->mm->context.low_htlb_areas;
/* First see if we can do the mapping in the existing
* low hpage segments */
- addr = htlb_get_low_area(len, cursegs);
+ addr = htlb_get_low_area(len, curareas);
if (addr != -ENOMEM)
return addr;
@@ -473,16 +542,37 @@
if (segmask & 1)
lastshift = 1;
- addr = htlb_get_low_area(len, cursegs | segmask);
+ addr = htlb_get_low_area(len, curareas | segmask);
if ((addr != -ENOMEM)
- && open_low_hpage_segs(current->mm, segmask) == 0)
+ && open_low_hpage_areas(current->mm, segmask) == 0)
return addr;
}
printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
" enough segments\n");
return -ENOMEM;
} else {
- return htlb_get_high_area(len);
+ int lastshift = 0;
+ u16 zonemask, curareas = current->mm->context.high_htlb_areas;
+
+ /* First see if we can do the mapping in the existing
+ * low hpage segments */
+ addr = htlb_get_high_area(len, curareas);
+ if (addr != -ENOMEM)
+ return addr;
+
+ for (zonemask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
+ ! lastshift; zonemask >>=1) {
+ if (zonemask & 1)
+ lastshift = 1;
+
+ addr = htlb_get_high_area(len, curareas | zonemask);
+ if ((addr != -ENOMEM)
+ && open_high_hpage_areas(current->mm, zonemask) == 0)
+ return addr;
+ }
+ printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+ " enough segments\n");
+ return -ENOMEM;
}
}
Index: working-2.6/include/asm-ppc64/mmu.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/mmu.h 2005-06-07 13:26:15.000000000 +1000
+++ working-2.6/include/asm-ppc64/mmu.h 2005-06-07 13:32:57.000000000 +1000
@@ -304,7 +304,7 @@
typedef struct {
mm_context_id_t id;
#ifdef CONFIG_HUGETLB_PAGE
- u16 htlb_segs; /* bitmask */
+ u16 low_htlb_areas, high_htlb_areas;
#endif
} mm_context_t;
Index: working-2.6/include/asm-ppc64/page.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/page.h 2005-06-07 13:26:15.000000000 +1000
+++ working-2.6/include/asm-ppc64/page.h 2005-06-07 13:38:32.000000000 +1000
@@ -37,40 +37,45 @@
#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
-/* For 64-bit processes the hugepage range is 1T-1.5T */
-#define TASK_HPAGE_BASE ASM_CONST(0x0000010000000000)
-#define TASK_HPAGE_END ASM_CONST(0x0000018000000000)
+#define HTLB_AREA_SHIFT 40
+#define HTLB_AREA_SIZE (1UL << HTLB_AREA_SHIFT)
+#define GET_HTLB_AREA(x) ((x) >> HTLB_AREA_SHIFT)
#define LOW_ESID_MASK(addr, len) (((1U << (GET_ESID(addr+len-1)+1)) \
- (1U << GET_ESID(addr))) & 0xffff)
+#define HTLB_AREA_MASK(addr, len) (((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
+ - (1U << GET_HTLB_AREA(addr))) & 0xffff)
#define ARCH_HAS_HUGEPAGE_ONLY_RANGE
#define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
#define ARCH_HAS_SETCLEAR_HUGE_PTE
#define touches_hugepage_low_range(mm, addr, len) \
- (LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs)
-#define touches_hugepage_high_range(addr, len) \
- (((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END))
+ (LOW_ESID_MASK((addr), (len)) & (mm)->context.low_htlb_areas)
+#define touches_hugepage_high_range(mm, addr, len) \
+ (HTLB_AREA_MASK((addr), (len)) & (mm)->context.high_htlb_areas)
#define __within_hugepage_low_range(addr, len, segmask) \
((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask))
#define within_hugepage_low_range(addr, len) \
__within_hugepage_low_range((addr), (len), \
- current->mm->context.htlb_segs)
-#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \
- && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr)))
+ current->mm->context.low_htlb_areas)
+#define __within_hugepage_high_range(addr, len, zonemask) \
+ ((HTLB_AREA_MASK((addr), (len)) | (zonemask)) == (zonemask))
+#define within_hugepage_high_range(addr, len) \
+ __within_hugepage_high_range((addr), (len), \
+ current->mm->context.high_htlb_areas)
#define is_hugepage_only_range(mm, addr, len) \
- (touches_hugepage_high_range((addr), (len)) || \
+ (touches_hugepage_high_range((mm), (addr), (len)) || \
touches_hugepage_low_range((mm), (addr), (len)))
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#define in_hugepage_area(context, addr) \
(cpu_has_feature(CPU_FTR_16M_PAGE) && \
- ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
+ ( ((1 << GET_HTLB_AREA(addr)) & (context).high_htlb_areas) || \
( ((addr) < 0x100000000L) && \
- ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) )
+ ((1 << GET_ESID(addr)) & (context).low_htlb_areas) ) ) )
#else /* !CONFIG_HUGETLB_PAGE */
Index: working-2.6/arch/ppc64/mm/slb_low.S
===================================================================
--- working-2.6.orig/arch/ppc64/mm/slb_low.S 2005-06-07 13:26:15.000000000 +1000
+++ working-2.6/arch/ppc64/mm/slb_low.S 2005-06-07 14:54:11.000000000 +1000
@@ -89,30 +89,34 @@
b 9f
0: /* user address: proto-VSID = context<<15 | ESID */
- li r11,SLB_VSID_USER
-
srdi. r9,r3,USER_ESID_BITS
bne- 8f /* invalid ea bits set */
#ifdef CONFIG_HUGETLB_PAGE
BEGIN_FTR_SECTION
- /* check against the hugepage ranges */
- cmpldi r3,(TASK_HPAGE_END>>SID_SHIFT)
- bge 6f /* >= TASK_HPAGE_END */
- cmpldi r3,(TASK_HPAGE_BASE>>SID_SHIFT)
- bge 5f /* TASK_HPAGE_BASE..TASK_HPAGE_END */
+ lhz r9,PACAHIGHHTLBAREAS(r13)
+ srdi r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+ srd r9,r9,r11
+ andi. r9,r9,1
+ bne 5f
+
+ li r11,SLB_VSID_USER
cmpldi r3,16
- bge 6f /* 4GB..TASK_HPAGE_BASE */
+ blt 6f /* if we're below 4GB, we're done */
- lhz r9,PACAHTLBSEGS(r13)
+ lhz r9,PACALOWHTLBAREAS(r13)
srd r9,r9,r3
andi. r9,r9,1
- beq 6f
+ bne 5f
+
+ b 6f
5: /* this is a hugepage user address */
li r11,(SLB_VSID_USER|SLB_VSID_L)
END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-#endif /* CONFIG_HUGETLB_PAGE */
+#else /* CONFIG_HUGETLB_PAGE */
+ li r11,SLB_VSID_USER
+#endif
6: ld r9,PACACONTEXTID(r13)
rldimi r3,r9,USER_ESID_BITS,0
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
More information about the Linuxppc64-dev
mailing list