[RFC] implicit hugetlb pages (hugetlb_implicit)

Adam Litke agl at us.ibm.com
Sat Jan 10 08:27:20 EST 2004


hugetlb_implicit (2.6.0):
   This patch includes the anonymous mmap work from Dave Gibson (right?)
as well as my shared memory support.  I have added a safe fallback to
regular pages for implicit allocations.  This patch uses a fixed address
space range of 0x80000000 - 0xc0000000 for huge pages.

-- snip --

diff -purN linux-2.6.0/fs/hugetlbfs/inode.c linux-2.6.0-implicit/fs/hugetlbfs/inode.c
--- linux-2.6.0/fs/hugetlbfs/inode.c	2003-12-17 18:59:36.000000000 -0800
+++ linux-2.6.0-implicit/fs/hugetlbfs/inode.c	2004-01-08 16:19:31.000000000 -0800
@@ -26,12 +26,17 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
+#include <linux/err.h>

 #include <asm/uaccess.h>
+#include <asm/mman.h>

 /* some random number */
 #define HUGETLBFS_MAGIC	0x958458f6

+extern int mmap_use_hugepages;
+extern int mmap_hugepages_map_sz;
+
 static struct super_operations hugetlbfs_ops;
 static struct address_space_operations hugetlbfs_aops;
 struct file_operations hugetlbfs_file_operations;
@@ -82,7 +87,7 @@ static int hugetlbfs_file_mmap(struct fi
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
 #else
-static unsigned long
+unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -115,6 +120,65 @@ hugetlb_get_unmapped_area(struct file *f
 }
 #endif

+/* Should a mapping of len bytes be implicitly backed by huge pages? 1 = yes */
+int mmap_hugetlb_implicit(unsigned long len)
+{
+	/* Are we enabled? */
+	if (!mmap_use_hugepages)
+		return 0;
+	/* Must be HPAGE aligned */
+	if (len & ~HPAGE_MASK)
+		return 0;
+	/* Under the minimum size? Compare in MB; "<< 20" can overflow an int */
+	if (mmap_hugepages_map_sz &&
+	    (len >> 20) < (unsigned long)mmap_hugepages_map_sz)
+		return 0;
+	/* Do we have enough free huge pages? */
+	if (!is_hugepage_mem_enough(len))
+		return 0;
+	return 1;
+}
+
+/*
+ * Choose an address for a huge page mapping.  Explicit MAP_HUGETLB requests
+ * may return an error value; a declined or failed implicit attempt returns
+ * 0 with MAP_HUGETLB cleared so the caller can retry with regular pages.
+ */
+unsigned long
+try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long *flags)
+{
+	long prereq_err;
+
+	/* Prerequisites: CAP_IPC_LOCK and a mapping with no backing file */
+	if (!capable(CAP_IPC_LOCK))
+		prereq_err = -EPERM;
+	else
+		prereq_err = file ? -EINVAL : 0;
+
+	/* Explicit requests for huge pages are allowed to return errors */
+	if (*flags & MAP_HUGETLB) {
+		if (prereq_err)
+			return prereq_err;
+		return hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
+	}
+
+	/* Implicit attempt: only a usable address is reported as success */
+	if (mmap_hugetlb_implicit(len) && !prereq_err) {
+		unsigned long hpage_addr;
+
+		hpage_addr = hugetlb_get_unmapped_area(NULL, addr, len, pgoff,
+						       *flags);
+		if (!IS_ERR((void *)hpage_addr)) {
+			*flags |= MAP_HUGETLB;
+			return hpage_addr;
+		}
+	}
+
+	*flags &= ~MAP_HUGETLB;
+	return 0;
+}
+
 /*
  * Read a page. Again trivial. If it didn't already exist
  * in the page cache, it is zero-filled.
diff -purN linux-2.6.0/include/asm-i386/mman.h linux-2.6.0-implicit/include/asm-i386/mman.h
--- linux-2.6.0/include/asm-i386/mman.h	2003-12-17 18:58:15.000000000 -0800
+++ linux-2.6.0-implicit/include/asm-i386/mman.h	2004-01-08 16:19:31.000000000 -0800
@@ -11,6 +11,11 @@

 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#ifdef CONFIG_HUGETLB_PAGE
+#define MAP_HUGETLB	0x04		/* Use huge pages */
+#else
+#define MAP_HUGETLB	0x00
+#endif
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
diff -purN linux-2.6.0/include/asm-ppc64/mman.h linux-2.6.0-implicit/include/asm-ppc64/mman.h
--- linux-2.6.0/include/asm-ppc64/mman.h	2003-12-17 18:58:47.000000000 -0800
+++ linux-2.6.0-implicit/include/asm-ppc64/mman.h	2004-01-08 16:19:31.000000000 -0800
@@ -18,6 +18,11 @@

 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#ifdef CONFIG_HUGETLB_PAGE
+#define MAP_HUGETLB	0x04
+#else
+#define MAP_HUGETLB	0x0
+#endif
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
diff -purN linux-2.6.0/include/linux/hugetlb.h linux-2.6.0-implicit/include/linux/hugetlb.h
--- linux-2.6.0/include/linux/hugetlb.h	2003-12-17 18:58:49.000000000 -0800
+++ linux-2.6.0-implicit/include/linux/hugetlb.h	2004-01-08 16:19:31.000000000 -0800
@@ -118,4 +118,9 @@ static inline void set_file_hugepages(st

 #endif /* !CONFIG_HUGETLBFS */

+/* Huge page address selection; the try_ variant is called from mm/mmap.c */
+unsigned long hugetlb_get_unmapped_area(struct file *, unsigned long,
+		unsigned long, unsigned long, unsigned long);
+unsigned long try_hugetlb_get_unmapped_area(struct file *, unsigned long,
+		unsigned long, unsigned long, unsigned long *);
 #endif /* _LINUX_HUGETLB_H */
diff -purN linux-2.6.0/include/linux/mman.h linux-2.6.0-implicit/include/linux/mman.h
--- linux-2.6.0/include/linux/mman.h	2003-12-17 18:58:15.000000000 -0800
+++ linux-2.6.0-implicit/include/linux/mman.h	2004-01-08 16:19:31.000000000 -0800
@@ -58,6 +58,9 @@ calc_vm_flag_bits(unsigned long flags)
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
+#ifdef CONFIG_HUGETLB_PAGE
+               _calc_vm_trans(flags, MAP_HUGETLB,    VM_HUGETLB   ) |
+#endif
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
 }

diff -purN linux-2.6.0/include/linux/sysctl.h linux-2.6.0-implicit/include/linux/sysctl.h
--- linux-2.6.0/include/linux/sysctl.h	2003-12-17 18:58:56.000000000 -0800
+++ linux-2.6.0-implicit/include/linux/sysctl.h	2004-01-08 16:19:31.000000000 -0800
@@ -127,6 +127,10 @@ enum
 	KERN_PANIC_ON_OOPS=57,  /* int: whether we will panic on an oops */
 	KERN_HPPA_PWRSW=58,	/* int: hppa soft-power enable */
 	KERN_HPPA_UNALIGNED=59,	/* int: hppa unaligned-trap enable */
+	KERN_SHMUSEHUGEPAGES=60,	/* int: back shm with huge pages */
+	KERN_MMAPUSEHUGEPAGES=61,	/* int: back anon mmap with huge pages */
+	KERN_HPAGES_PER_FILE=62,	/* int: max bigpages per file */
+	KERN_HPAGES_MAP_SZ=63,		/* int: min size (MB) of mapping */
 };


diff -purN linux-2.6.0/ipc/shm.c linux-2.6.0-implicit/ipc/shm.c
--- linux-2.6.0/ipc/shm.c	2003-12-17 18:58:49.000000000 -0800
+++ linux-2.6.0-implicit/ipc/shm.c	2004-01-08 16:19:31.000000000 -0800
@@ -32,6 +32,9 @@

 #define shm_flags	shm_perm.mode

+extern int shm_use_hugepages;
+extern int shm_hugepages_per_file;
+
 static struct file_operations shm_file_operations;
 static struct vm_operations_struct shm_vm_ops;

@@ -165,6 +168,31 @@ static struct vm_operations_struct shm_v
 	.nopage	= shmem_nopage,
 };

+/*
+ * Should a new segment of the given size be backed by huge pages?
+ * Returns 1 for an explicit SHM_HUGETLB request or when every implicit
+ * criterion below holds, 0 otherwise (always 0 without CONFIG_HUGETLBFS).
+ */
+#ifdef CONFIG_HUGETLBFS
+int shm_with_hugepages(int shmflag, size_t size)
+{
+	/* An explicit request always wins */
+	if (shmflag & SHM_HUGETLB)
+		return 1;
+
+	/*
+	 * Implicit use needs the sysctl enabled, an HPAGE-aligned size
+	 * within the per-file page limit, and enough free huge pages.
+	 */
+	return shm_use_hugepages &&
+	       !(size & ~HPAGE_MASK) &&
+	       (size >> HPAGE_SHIFT) <= shm_hugepages_per_file &&
+	       is_hugepage_mem_enough(size);
+}
+#else
+int shm_with_hugepages(int shmflag, size_t size) { return 0; }
+#endif
+
 static int newseg (key_t key, int shmflg, size_t size)
 {
 	int error;
@@ -194,8 +222,10 @@ static int newseg (key_t key, int shmflg
 		return error;
 	}

-	if (shmflg & SHM_HUGETLB)
+	if (shm_with_hugepages(shmflg, size)) {
+		shmflg |= SHM_HUGETLB;
 		file = hugetlb_zero_setup(size);
+	}
 	else {
 		sprintf (name, "SYSV%08x", key);
 		file = shmem_file_setup(name, size, VM_ACCOUNT);
diff -purN linux-2.6.0/kernel/sysctl.c linux-2.6.0-implicit/kernel/sysctl.c
--- linux-2.6.0/kernel/sysctl.c	2003-12-17 18:58:08.000000000 -0800
+++ linux-2.6.0-implicit/kernel/sysctl.c	2004-01-08 16:19:31.000000000 -0800
@@ -60,6 +60,8 @@ extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
 extern int min_free_kbytes;
+extern int shm_use_hugepages, shm_hugepages_per_file;
+extern int mmap_use_hugepages, mmap_hugepages_map_sz;

 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -579,6 +581,40 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_HUGETLBFS
+	{
+		.ctl_name	= KERN_SHMUSEHUGEPAGES,
+		.procname	= "shm-use-hugepages",
+		.data		= &shm_use_hugepages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_MMAPUSEHUGEPAGES,
+		.procname	= "mmap-use-hugepages",
+		.data		= &mmap_use_hugepages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_HPAGES_PER_FILE,
+		.procname	= "shm-hugepages-per-file",
+		.data		= &shm_hugepages_per_file,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{	/* int: min size (MB) of mapping */
+		.ctl_name	= KERN_HPAGES_MAP_SZ,
+		.procname	= "mmap-hugepages-min-mapping",
+		.data		= &mmap_hugepages_map_sz,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{ .ctl_name = 0 }
 };

diff -purN linux-2.6.0/mm/mmap.c linux-2.6.0-implicit/mm/mmap.c
--- linux-2.6.0/mm/mmap.c	2003-12-17 18:58:58.000000000 -0800
+++ linux-2.6.0-implicit/mm/mmap.c	2004-01-08 16:20:10.000000000 -0800
@@ -20,6 +20,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/mount.h>
+#include <linux/err.h>

 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
@@ -59,6 +60,9 @@ EXPORT_SYMBOL(sysctl_overcommit_memory);
 EXPORT_SYMBOL(sysctl_overcommit_ratio);
 EXPORT_SYMBOL(vm_committed_space);

+int mmap_use_hugepages = 0;
+int mmap_hugepages_map_sz = 256;
+
 /*
  * Requires inode->i_mapping->i_shared_sem
  */
@@ -473,7 +477,7 @@ unsigned long do_mmap_pgoff(struct file
 	int correct_wcount = 0;
 	int error;
 	struct rb_node ** rb_link, * rb_parent;
-	unsigned long charged = 0;
+	unsigned long charged = 0, addr_save = addr;

 	if (file) {
 		if (!file->f_op || !file->f_op->mmap)
@@ -501,8 +505,17 @@ unsigned long do_mmap_pgoff(struct file

 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
+	 * VM_HUGETLB will never appear in vm_flags when CONFIG_HUGETLB is
+	 * unset.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+#ifdef CONFIG_HUGETLBFS
+	addr = try_hugetlb_get_unmapped_area(file, addr, len, pgoff, &flags);
+	if (IS_ERR((void *)addr))	/* explicit huge page request failed */
+		return addr;
+	else if (addr == 0)		/* no huge pages: use regular pages */
+#endif
+		addr = get_unmapped_area(file, addr_save, len, pgoff, flags);
+
 	if (addr & ~PAGE_MASK)
 		return addr;

@@ -566,6 +579,9 @@ unsigned long do_mmap_pgoff(struct file
 		default:
 			return -EINVAL;
 		case MAP_PRIVATE:
+#ifdef CONFIG_HUGETLBFS
+		case (MAP_PRIVATE|MAP_HUGETLB):
+#endif
 			vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
 			/* fall through */
 		case MAP_SHARED:
@@ -650,10 +666,31 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
-	} else if (vm_flags & VM_SHARED) {
-		error = shmem_zero_setup(vma);
-		if (error)
-			goto free_vma;
+	} else if ((vm_flags & VM_SHARED) || (vm_flags & VM_HUGETLB)) {
+		if (!is_vm_hugetlb_page(vma)) {
+			error = shmem_zero_setup(vma);
+			if (error)
+				goto free_vma;
+		} else {
+			/*
+			 * Presumably hugetlb_zero_setup() acquires a
+			 * reference count for us. The difference
+			 * between this and the shmem_zero_setup()
+			 * case is that we can encounter an error
+			 * _after_ allocating the file. The error
+			 * path was adjusted slightly to fput() for us.
+			 */
+			struct file *new_file = hugetlb_zero_setup(len);
+			if (IS_ERR(new_file)) {
+				error = PTR_ERR(new_file);
+				goto free_vma;
+			} else {
+				vma->vm_file = new_file;
+				error = new_file->f_op->mmap(new_file, vma);
+				if (error)
+					goto unmap_and_free_vma;
+			}
+		}
 	}

 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
@@ -701,11 +738,21 @@ out:
 unmap_and_free_vma:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
-	vma->vm_file = NULL;
-	fput(file);

-	/* Undo any partial mapping done by a device driver. */
+	/*
+	 * Undo any partial mapping done by a device driver.
+	 * hugetlb wants to know the vma's file etc. so nuke
+	 * the file afterward.
+	 */
 	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+
+	/*
+	 * vma->vm_file may be different from file in the hugetlb case.
+	 */
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_file = NULL;
+
 free_vma:
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
diff -purN linux-2.6.0/mm/shmem.c linux-2.6.0-implicit/mm/shmem.c
--- linux-2.6.0/mm/shmem.c	2003-12-17 18:58:48.000000000 -0800
+++ linux-2.6.0-implicit/mm/shmem.c	2004-01-08 16:19:31.000000000 -0800
@@ -40,6 +40,29 @@
 #include <asm/uaccess.h>
 #include <asm/div64.h>

+int shm_use_hugepages;
+
+/*
+ * On 64-bit archs the vmalloc area is very large,
+ * so we allocate the array in vmalloc on 64-bit archs.
+ *
+ * Assuming 2M pages (x86 and x86-64), these default settings
+ * allow up to 128G of huge pages in a single file on
+ * 64-bit archs, and 64G on 32-bit archs using the max
+ * kmalloc size of 128k. So in practice tweaking is needed
+ * only to go past 128G of huge pages per file on 64-bit archs.
+ *
+ * This sysctl is in units of huge pages (see HPAGE_SHIFT).
+ */
+#ifdef CONFIG_HUGETLBFS
+#if BITS_PER_LONG == 64
+int shm_hugepages_per_file = 128UL << (30 - HPAGE_SHIFT);
+#else
+int shm_hugepages_per_file = 131072 / sizeof(struct page *);
+#endif
+#endif
+
+
 /* This magic number is used in glibc for posix shared memory */
 #define TMPFS_MAGIC	0x01021994

--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center


** Sent via the linuxppc64-dev mail list. See http://lists.linuxppc.org/





More information about the Linuxppc64-dev mailing list