[RFC] implicit hugetlb pages (hugetlb_implicit)
Adam Litke
agl at us.ibm.com
Sat Jan 10 08:27:20 EST 2004
hugetlb_implicit (2.6.0):
This patch includes the anonymous mmap work from Dave Gibson (right?)
as well as my shared memory support. I have added a safe fallback to
normal pages for implicit allocations. The patch uses a fixed address
space range of 0x80000000 - 0xc0000000 for huge pages.
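For illustration (not part of the patch), here is a minimal userspace sketch
of the implicit path. It assumes the new /proc/sys/kernel/mmap-use-hugepages
sysctl is set to 1, the caller has CAP_IPC_LOCK, and the mapping meets the
default 256 MB minimum (mmap-hugepages-min-mapping). With the patch applied,
such a mapping lands on huge pages when enough are free and silently falls
back to normal pages otherwise:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define MAP_SIZE (256UL << 20)	/* default mmap-hugepages-min-mapping is 256 MB */

int main(void)
{
	/*
	 * Large anonymous private mapping; no special flag is needed for
	 * the implicit case.  The length is a multiple of the huge page
	 * size, as required by mmap_hugetlb_implicit().
	 */
	void *p = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Touching the memory populates the pages, huge or normal. */
	memset(p, 0, MAP_SIZE);
	printf("mapped %lu MB at %p\n", MAP_SIZE >> 20, p);
	munmap(p, MAP_SIZE);
	return 0;
}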
-- snip --
diff -purN linux-2.6.0/fs/hugetlbfs/inode.c linux-2.6.0-implicit/fs/hugetlbfs/inode.c
--- linux-2.6.0/fs/hugetlbfs/inode.c 2003-12-17 18:59:36.000000000 -0800
+++ linux-2.6.0-implicit/fs/hugetlbfs/inode.c 2004-01-08 16:19:31.000000000 -0800
@@ -26,12 +26,17 @@
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
+#include <linux/err.h>
#include <asm/uaccess.h>
+#include <asm/mman.h>
/* some random number */
#define HUGETLBFS_MAGIC 0x958458f6
+extern int mmap_use_hugepages;
+extern int mmap_hugepages_map_sz;
+
static struct super_operations hugetlbfs_ops;
static struct address_space_operations hugetlbfs_aops;
struct file_operations hugetlbfs_file_operations;
@@ -82,7 +87,7 @@ static int hugetlbfs_file_mmap(struct fi
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
#else
-static unsigned long
+unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
@@ -115,6 +120,65 @@ hugetlb_get_unmapped_area(struct file *f
}
#endif
+int mmap_hugetlb_implicit(unsigned long len)
+{
+ /* Are we enabled? */
+ if (!mmap_use_hugepages)
+ return 0;
+ /* Must be HPAGE aligned */
+ if (len & ~HPAGE_MASK)
+ return 0;
+ /* Are we under the minimum size? */
+ if (mmap_hugepages_map_sz
+ && len < (mmap_hugepages_map_sz << 20))
+ return 0;
+ /* Do we have enough free huge pages? */
+ if (!is_hugepage_mem_enough(len))
+ return 0;
+
+ return 1;
+}
+
+unsigned long
+try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long *flags)
+{
+ long pre_error = 0;
+
+ /* Check some prerequisites */
+ if (!capable(CAP_IPC_LOCK))
+ pre_error = -EPERM;
+ else if (file)
+ pre_error = -EINVAL;
+
+ /* Explicit requests for huge pages are allowed to return errors */
+ if (*flags & MAP_HUGETLB) {
+ if (pre_error)
+ return pre_error;
+ return hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
+ }
+
+ /*
+ * When implicit request fails, return 0 so we can
+ * retry later with regular pages.
+ */
+ if (mmap_hugetlb_implicit(len)) {
+ if (pre_error)
+ goto out;
+ addr = hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
+ if (IS_ERR((void *)addr))
+ goto out;
+ else {
+ *flags |= MAP_HUGETLB;
+ return addr;
+ }
+ }
+
+out:
+ *flags &= ~MAP_HUGETLB;
+ return 0;
+}
+
/*
* Read a page. Again trivial. If it didn't already exist
* in the page cache, it is zero-filled.
diff -purN linux-2.6.0/include/asm-i386/mman.h linux-2.6.0-implicit/include/asm-i386/mman.h
--- linux-2.6.0/include/asm-i386/mman.h 2003-12-17 18:58:15.000000000 -0800
+++ linux-2.6.0-implicit/include/asm-i386/mman.h 2004-01-08 16:19:31.000000000 -0800
@@ -11,6 +11,11 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
+#ifdef CONFIG_HUGETLB_PAGE
+#define MAP_HUGETLB 0x04 /* Use huge pages */
+#else
+#define MAP_HUGETLB 0x00
+#endif
#define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x20 /* don't use a file */
diff -purN linux-2.6.0/include/asm-ppc64/mman.h linux-2.6.0-implicit/include/asm-ppc64/mman.h
--- linux-2.6.0/include/asm-ppc64/mman.h 2003-12-17 18:58:47.000000000 -0800
+++ linux-2.6.0-implicit/include/asm-ppc64/mman.h 2004-01-08 16:19:31.000000000 -0800
@@ -18,6 +18,11 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
+#ifdef CONFIG_HUGETLB_PAGE
+#define MAP_HUGETLB 0x04
+#else
+#define MAP_HUGETLB 0x0
+#endif
#define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x20 /* don't use a file */
diff -purN linux-2.6.0/include/linux/hugetlb.h linux-2.6.0-implicit/include/linux/hugetlb.h
--- linux-2.6.0/include/linux/hugetlb.h 2003-12-17 18:58:49.000000000 -0800
+++ linux-2.6.0-implicit/include/linux/hugetlb.h 2004-01-08 16:19:31.000000000 -0800
@@ -118,4 +118,9 @@ static inline void set_file_hugepages(st
#endif /* !CONFIG_HUGETLBFS */
+unsigned long
+hugetlb_get_unmapped_area(struct file *, unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
+
#endif /* _LINUX_HUGETLB_H */
diff -purN linux-2.6.0/include/linux/mman.h linux-2.6.0-implicit/include/linux/mman.h
--- linux-2.6.0/include/linux/mman.h 2003-12-17 18:58:15.000000000 -0800
+++ linux-2.6.0-implicit/include/linux/mman.h 2004-01-08 16:19:31.000000000 -0800
@@ -58,6 +58,9 @@ calc_vm_flag_bits(unsigned long flags)
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
+#ifdef CONFIG_HUGETLB_PAGE
+ _calc_vm_trans(flags, MAP_HUGETLB, VM_HUGETLB ) |
+#endif
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
}
diff -purN linux-2.6.0/include/linux/sysctl.h linux-2.6.0-implicit/include/linux/sysctl.h
--- linux-2.6.0/include/linux/sysctl.h 2003-12-17 18:58:56.000000000 -0800
+++ linux-2.6.0-implicit/include/linux/sysctl.h 2004-01-08 16:19:31.000000000 -0800
@@ -127,6 +127,10 @@ enum
KERN_PANIC_ON_OOPS=57, /* int: whether we will panic on an oops */
KERN_HPPA_PWRSW=58, /* int: hppa soft-power enable */
KERN_HPPA_UNALIGNED=59, /* int: hppa unaligned-trap enable */
+ KERN_SHMUSEHUGEPAGES=60, /* int: back shm with huge pages */
+ KERN_MMAPUSEHUGEPAGES=61, /* int: back anon mmap with huge pages */
+ KERN_HPAGES_PER_FILE=62, /* int: max huge pages per file */
+ KERN_HPAGES_MAP_SZ=63, /* int: min size (MB) of mapping */
};
diff -purN linux-2.6.0/ipc/shm.c linux-2.6.0-implicit/ipc/shm.c
--- linux-2.6.0/ipc/shm.c 2003-12-17 18:58:49.000000000 -0800
+++ linux-2.6.0-implicit/ipc/shm.c 2004-01-08 16:19:31.000000000 -0800
@@ -32,6 +32,9 @@
#define shm_flags shm_perm.mode
+extern int shm_use_hugepages;
+extern int shm_hugepages_per_file;
+
static struct file_operations shm_file_operations;
static struct vm_operations_struct shm_vm_ops;
@@ -165,6 +168,31 @@ static struct vm_operations_struct shm_v
.nopage = shmem_nopage,
};
+#ifdef CONFIG_HUGETLBFS
+int shm_with_hugepages(int shmflag, size_t size)
+{
+ /* flag specified explicitly */
+ if (shmflag & SHM_HUGETLB)
+ return 1;
+ /* Are we disabled? */
+ if (!shm_use_hugepages)
+ return 0;
+ /* Must be HPAGE aligned */
+ if (size & ~HPAGE_MASK)
+ return 0;
+ /* Are we under the max per file? */
+ if ((size >> HPAGE_SHIFT) > shm_hugepages_per_file)
+ return 0;
+ /* Do we have enough free huge pages? */
+ if (!is_hugepage_mem_enough(size))
+ return 0;
+
+ return 1;
+}
+#else
+int shm_with_hugepages(int shmflag, size_t size) { return 0; }
+#endif
+
static int newseg (key_t key, int shmflg, size_t size)
{
int error;
@@ -194,8 +222,10 @@ static int newseg (key_t key, int shmflg
return error;
}
- if (shmflg & SHM_HUGETLB)
+ if (shm_with_hugepages(shmflg, size)) {
+ shmflg |= SHM_HUGETLB;
file = hugetlb_zero_setup(size);
+ }
else {
sprintf (name, "SYSV%08x", key);
file = shmem_file_setup(name, size, VM_ACCOUNT);
diff -purN linux-2.6.0/kernel/sysctl.c linux-2.6.0-implicit/kernel/sysctl.c
--- linux-2.6.0/kernel/sysctl.c 2003-12-17 18:58:08.000000000 -0800
+++ linux-2.6.0-implicit/kernel/sysctl.c 2004-01-08 16:19:31.000000000 -0800
@@ -60,6 +60,8 @@ extern int cad_pid;
extern int pid_max;
extern int sysctl_lower_zone_protection;
extern int min_free_kbytes;
+extern int shm_use_hugepages, shm_hugepages_per_file;
+extern int mmap_use_hugepages, mmap_hugepages_map_sz;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -579,6 +581,40 @@ static ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#ifdef CONFIG_HUGETLBFS
+ {
+ .ctl_name = KERN_SHMUSEHUGEPAGES,
+ .procname = "shm-use-hugepages",
+ .data = &shm_use_hugepages,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_MMAPUSEHUGEPAGES,
+ .procname = "mmap-use-hugepages",
+ .data = &mmap_use_hugepages,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_HPAGES_PER_FILE,
+ .procname = "shm-hugepages-per-file",
+ .data = &shm_hugepages_per_file,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_HPAGES_MAP_SZ,
+ .procname = "mmap-hugepages-min-mapping",
+ .data = &mmap_hugepages_map_sz,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{ .ctl_name = 0 }
};
diff -purN linux-2.6.0/mm/mmap.c linux-2.6.0-implicit/mm/mmap.c
--- linux-2.6.0/mm/mmap.c 2003-12-17 18:58:58.000000000 -0800
+++ linux-2.6.0-implicit/mm/mmap.c 2004-01-08 16:20:10.000000000 -0800
@@ -20,6 +20,7 @@
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/mount.h>
+#include <linux/err.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
@@ -59,6 +60,9 @@ EXPORT_SYMBOL(sysctl_overcommit_memory);
EXPORT_SYMBOL(sysctl_overcommit_ratio);
EXPORT_SYMBOL(vm_committed_space);
+int mmap_use_hugepages = 0;
+int mmap_hugepages_map_sz = 256;
+
/*
* Requires inode->i_mapping->i_shared_sem
*/
@@ -473,7 +477,7 @@ unsigned long do_mmap_pgoff(struct file
int correct_wcount = 0;
int error;
struct rb_node ** rb_link, * rb_parent;
- unsigned long charged = 0;
+ unsigned long charged = 0, addr_save = addr;
if (file) {
if (!file->f_op || !file->f_op->mmap)
@@ -501,8 +505,17 @@ unsigned long do_mmap_pgoff(struct file
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
+ * VM_HUGETLB will never appear in vm_flags when CONFIG_HUGETLB_PAGE is
+ * unset.
*/
- addr = get_unmapped_area(file, addr, len, pgoff, flags);
+#ifdef CONFIG_HUGETLBFS
+ addr = try_hugetlb_get_unmapped_area(file, addr, len, pgoff, &flags);
+ if (IS_ERR((void *)addr))
+ return addr;
+ else if (addr == 0)
+#endif
+ addr = get_unmapped_area(file, addr_save, len, pgoff, flags);
+
if (addr & ~PAGE_MASK)
return addr;
@@ -566,6 +579,9 @@ unsigned long do_mmap_pgoff(struct file
default:
return -EINVAL;
case MAP_PRIVATE:
+#ifdef CONFIG_HUGETLBFS
+ case (MAP_PRIVATE|MAP_HUGETLB):
+#endif
vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
/* fall through */
case MAP_SHARED:
@@ -650,10 +666,31 @@ munmap_back:
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
+ } else if ((vm_flags & VM_SHARED) || (vm_flags & VM_HUGETLB)) {
+ if (!is_vm_hugetlb_page(vma)) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ } else {
+ /*
+ * Presumably hugetlb_zero_setup() acquires a
+ * reference count for us. The difference
+ * between this and the shmem_zero_setup()
+ * case is that we can encounter an error
+ * _after_ allocating the file. The error
+ * path was adjusted slightly to fput() for us.
+ */
+ struct file *new_file = hugetlb_zero_setup(len);
+ if (IS_ERR(new_file)) {
+ error = PTR_ERR(new_file);
+ goto free_vma;
+ } else {
+ vma->vm_file = new_file;
+ error = new_file->f_op->mmap(new_file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+ }
+ }
}
/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
@@ -701,11 +738,21 @@ out:
unmap_and_free_vma:
if (correct_wcount)
atomic_inc(&inode->i_writecount);
- vma->vm_file = NULL;
- fput(file);
- /* Undo any partial mapping done by a device driver. */
+ /*
+ * Undo any partial mapping done by a device driver.
+ * hugetlb wants to know the vma's file etc. so nuke
+ * the file afterward.
+ */
zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+
+ /*
+ * vma->vm_file may be different from file in the hugetlb case.
+ */
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
diff -purN linux-2.6.0/mm/shmem.c linux-2.6.0-implicit/mm/shmem.c
--- linux-2.6.0/mm/shmem.c 2003-12-17 18:58:48.000000000 -0800
+++ linux-2.6.0-implicit/mm/shmem.c 2004-01-08 16:19:31.000000000 -0800
@@ -40,6 +40,29 @@
#include <asm/uaccess.h>
#include <asm/div64.h>
+int shm_use_hugepages;
+
+/*
+ * On 64-bit archs the vmalloc area is very large, so the array is
+ * allocated from vmalloc there.
+ *
+ * Assuming 2MB pages (x86 and x86-64), these default settings
+ * allow up to 128G of huge pages in a single file on 64-bit archs
+ * and 64G on 32-bit archs using the max kmalloc size of 128k, so
+ * in practice tweaking is only needed to go past 128G of huge
+ * pages per file on 64-bit archs.
+ *
+ * This sysctl is in huge-page units (each unit is HPAGE_SIZE bytes).
+ */
+#ifdef CONFIG_HUGETLBFS
+#if BITS_PER_LONG == 64
+int shm_hugepages_per_file = 128UL << (30 - HPAGE_SHIFT);
+#else
+int shm_hugepages_per_file = 131072 / sizeof(struct page *);
+#endif
+#endif
+
+
/* This magic number is used in glibc for posix shared memory */
#define TMPFS_MAGIC 0x01021994
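A companion sketch of the explicit path (again illustrative, not part of the
patch): an explicit request uses the MAP_HUGETLB flag, which this patch
defines as 0x04 for i386 and ppc64. Unlike the implicit path, an explicit
request is allowed to fail, so the caller sees an error (e.g. -EPERM without
CAP_IPC_LOCK, or -ENOMEM when no huge pages are free) rather than a silent
fallback to normal pages:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x04	/* value introduced by this patch */
#endif

#define MAP_SIZE (32UL << 20)	/* multiple of HPAGE_SIZE on i386 and ppc64 */

int main(void)
{
	void *p = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	munmap(p, MAP_SIZE);
	return 0;
}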
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center