[RFC] implicit hugetlb pages (hugetlb_implicit)

Mon Jan 12 15:19:18 EST 2004

On Fri, Jan 09, 2004 at 01:27:20PM -0800, Adam Litke wrote:
>
> hugetlb_implicit (2.6.0):
>    This patch includes the anonymous mmap work from Dave Gibson
> (right?)

I'm not sure what you're referring to here.  My patches for lbss
support also include support for copy-on-write of hugepages and
various other changes which can make them act kind of like anonymous
pages.

But I don't see much in this patch that looks familiar.

> as well as my shared mem support.  I have added safe fallback for
> implicit allocations.  This patch uses a fixed address space range of
> 80000000 - c0000000 for huge pages.

Some detailed comments below:

> diff -purN linux-2.6.0/fs/hugetlbfs/inode.c linux-2.6.0-implicit/fs/hugetlbfs/inode.c
> +++ linux-2.6.0-implicit/fs/hugetlbfs/inode.c	2004-01-08 16:19:31.000000000 -0800
> @@ -26,12 +26,17 @@
>  #include <linux/dnotify.h>
>  #include <linux/statfs.h>
>  #include <linux/security.h>
> +#include <linux/err.h>
>
>  #include <asm/uaccess.h>
> +#include <asm/mman.h>
>
>  /* some random number */
>  #define HUGETLBFS_MAGIC	0x958458f6
>
> +extern int mmap_use_hugepages;
> +extern int mmap_hugepages_map_sz;
> +
>  static struct super_operations hugetlbfs_ops;
>  static struct address_space_operations hugetlbfs_aops;
>  struct file_operations hugetlbfs_file_operations;
> @@ -82,7 +87,7 @@ static int hugetlbfs_file_mmap(struct fi
>  unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>  		unsigned long len, unsigned long pgoff, unsigned long flags);
>  #else
> -static unsigned long
> +unsigned long
>  hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>  		unsigned long len, unsigned long pgoff, unsigned long flags)
>  {
> @@ -115,6 +120,65 @@ hugetlb_get_unmapped_area(struct file *f
>  }
>  #endif
>
> +int mmap_hugetlb_implicit(unsigned long len)
> +{
> +	/* Are we enabled? */
> +	if (!mmap_use_hugepages)
> +		return 0;
> +	/* Must be HPAGE aligned */
> +	if (len & ~HPAGE_MASK)
> +		return 0;
> +	/* Are we under the minimum size? */
> +	if (mmap_hugepages_map_sz
> +	    && len < (mmap_hugepages_map_sz << 20))
> +		return 0;
> +	/* Do we have enough free huge pages? */
> +	if (!is_hugepage_mem_enough(len))
> +		return 0;

Is this test safe/necessary?  i.e. a) is there any potential race
which could cause the mmap() to fail because it's short of memory
despite succeeding the test here and b) can't we just let the mmap fail
and fall back then rather than checking beforehand?

Do we need/want any consideration of the given "hint" address here?

> +	return 1;
> +}
> +
> +unsigned long
> +try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> +		unsigned long len, unsigned long pgoff, unsigned long *flags)
> +{
> +	long pre_error = 0;
> +
> +	/* Check some prerequisites */
> +	if (!capable(CAP_IPC_LOCK))
> +		pre_error = -EPERM;
> +	else if (file)
> +		pre_error = -EINVAL;

We can't use the file argument, and the only caller passes NULL, so it
shouldn't be there at all.

> +	/* Explicit requests for huge pages are allowed to return errors */
> +	if (*flags & MAP_HUGETLB) {
> +		if (pre_error)
> +			return pre_error;
> +		return hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
> +	}
> +
> +	/*
> +	 * When implicit request fails, return 0 so we can
> +	 * retry later with regular pages.
> +	 */
> +	if (mmap_hugetlb_implicit(len)) {
> +		if (pre_error)
> +			goto out;
> +		addr = hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
> +		if (IS_ERR((void *)addr))
> +			goto out;
> +		else {
> +			*flags |= MAP_HUGETLB;
> +			return addr;
> +		}
> +	}
> +
> +out:
> +	*flags &= ~MAP_HUGETLB;
> +	return 0;
> +}

This does assume that 0 is never a valid address returned for a
hugepage range.  That's true now, but it makes me slightly
uncomfortable, since there's no inherent reason we couldn't make
segment zero a hugepage segment.

>  /*
>   * Read a page. Again trivial. If it didn't already exist
>   * in the page cache, it is zero-filled.
> diff -purN linux-2.6.0/include/asm-i386/mman.h linux-2.6.0-implicit/include/asm-i386/mman.h
> +++ linux-2.6.0-implicit/include/asm-i386/mman.h	2004-01-08 16:19:31.000000000 -0800
> @@ -11,6 +11,11 @@
>
>  #define MAP_SHARED	0x01		/* Share changes */
>  #define MAP_PRIVATE	0x02		/* Changes are private */
> +#ifdef CONFIG_HUGETLB_PAGE
> +#define MAP_HUGETLB	0x04		/* Use huge pages */
> +#else
> +#define MAP_HUGETLB	0x00
> +#endif
>  #define MAP_TYPE	0x0f		/* Mask for type of mapping */

I think MAP_HUGETLB should lie outside the MAP_TYPE bits.  It doesn't
specify a distinctly different mapping type like SHARED or PRIVATE, so
it belongs as a flag, not in the low bits.

Also, this is part of the ABI, so it shouldn't be conditional upon
CONFIG options.

>  #define MAP_FIXED	0x10		/* Interpret addr exactly */
>  #define MAP_ANONYMOUS	0x20		/* don't use a file */
> diff -purN linux-2.6.0/include/asm-ppc64/mman.h linux-2.6.0-implicit/include/asm-ppc64/mman.h
> +++ linux-2.6.0-implicit/include/asm-ppc64/mman.h	2004-01-08 16:19:31.000000000 -0800
> @@ -18,6 +18,11 @@
>
>  #define MAP_SHARED	0x01		/* Share changes */
>  #define MAP_PRIVATE	0x02		/* Changes are private */
> +#ifdef CONFIG_HUGETLB_PAGE
> +#define MAP_HUGETLB	0x04
> +#else
> +#define MAP_HUGETLB	0x0
> +#endif

Ditto.

>  #define MAP_TYPE	0x0f		/* Mask for type of mapping */
>  #define MAP_FIXED	0x10		/* Interpret addr exactly */
>  #define MAP_ANONYMOUS	0x20		/* don't use a file */
> diff -purN linux-2.6.0/include/linux/hugetlb.h linux-2.6.0-implicit/include/linux/hugetlb.h
> +++ linux-2.6.0-implicit/include/linux/hugetlb.h	2004-01-08 16:19:31.000000000 -0800
> @@ -118,4 +118,9 @@ static inline void set_file_hugepages(st
>
>  #endif /* !CONFIG_HUGETLBFS */
>
> +unsigned long
> +hugetlb_get_unmapped_area(struct file *, unsigned long, unsigned long,
> +								unsigned long, unsigned long);
> +
> +
>  #endif /* _LINUX_HUGETLB_H */
> diff -purN linux-2.6.0/include/linux/mman.h linux-2.6.0-implicit/include/linux/mman.h
> +++ linux-2.6.0-implicit/include/linux/mman.h	2004-01-08 16:19:31.000000000 -0800
> @@ -58,6 +58,9 @@ calc_vm_flag_bits(unsigned long flags)
>  	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
>  	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
>  	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
> +#ifdef CONFIG_HUGETLB_PAGE
> +               _calc_vm_trans(flags, MAP_HUGETLB,    VM_HUGETLB   ) |
> +#endif
>  	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
>  }
>
> diff -purN linux-2.6.0/include/linux/sysctl.h linux-2.6.0-implicit/include/linux/sysctl.h
> +++ linux-2.6.0-implicit/include/linux/sysctl.h	2004-01-08 16:19:31.000000000 -0800
> @@ -127,6 +127,10 @@ enum
>  	KERN_PANIC_ON_OOPS=57,  /* int: whether we will panic on an oops */
>  	KERN_HPPA_PWRSW=58,	/* int: hppa soft-power enable */
>  	KERN_HPPA_UNALIGNED=59,	/* int: hppa unaligned-trap enable */
> +	KERN_SHMUSEHUGEPAGES=60,	/* int: back shm with huge pages */
> +	KERN_MMAPUSEHUGEPAGES=61,	/* int: back anon mmap with huge pages */
> +	KERN_HPAGES_PER_FILE=62,	/* int: max bigpages per file */
> +	KERN_HPAGES_MAP_SZ=63,		/* int: min size (MB) of mapping */
>  };
>
>
> diff -purN linux-2.6.0/ipc/shm.c linux-2.6.0-implicit/ipc/shm.c
> +++ linux-2.6.0-implicit/ipc/shm.c	2004-01-08 16:19:31.000000000 -0800
> @@ -32,6 +32,9 @@
>
>  #define shm_flags	shm_perm.mode
>
> +extern int shm_use_hugepages;
> +extern int shm_hugepages_per_file;
> +
>  static struct file_operations shm_file_operations;
>  static struct vm_operations_struct shm_vm_ops;
>
> @@ -165,6 +168,31 @@ static struct vm_operations_struct shm_v
>  	.nopage	= shmem_nopage,
>  };
>
> +#ifdef CONFIG_HUGETLBFS
> +int shm_with_hugepages(int shmflag, size_t size)
> +{
> +	/* flag specified explicitly */
> +	if (shmflag & SHM_HUGETLB)
> +		return 1;
> +	/* Are we disabled? */
> +	if (!shm_use_hugepages)
> +		return 0;
> +	/* Must be HPAGE aligned */
> +	if (size & ~HPAGE_MASK)
> +		return 0;
> +	/* Are we under the max per file? */
> +	if ((size >> HPAGE_SHIFT) > shm_hugepages_per_file)
> +		return 0;

I don't really understand this per-file restriction.  More comments
below.

> +	/* Do we have enough free huge pages? */
> +	if (!is_hugepage_mem_enough(size))
> +		return 0;

Same concerns with this test as in the mmap case.

> +	return 1;
> +}
> +#else
> +int shm_with_hugepages(int shmflag, size_t size) { return 0; }
> +#endif
> +
>  static int newseg (key_t key, int shmflg, size_t size)
>  {
>  	int error;
> @@ -194,8 +222,10 @@ static int newseg (key_t key, int shmflg
>  		return error;
>  	}
>
> -	if (shmflg & SHM_HUGETLB)
> +	if (shm_with_hugepages(shmflg, size)) {
> +		shmflg |= SHM_HUGETLB;
>  		file = hugetlb_zero_setup(size);
> +	}
>  	else {
>  		sprintf (name, "SYSV%08x", key);
>  		file = shmem_file_setup(name, size, VM_ACCOUNT);
> diff -purN linux-2.6.0/kernel/sysctl.c linux-2.6.0-implicit/kernel/sysctl.c
> +++ linux-2.6.0-implicit/kernel/sysctl.c	2004-01-08 16:19:31.000000000 -0800
> @@ -60,6 +60,8 @@ extern int cad_pid;
>  extern int pid_max;
>  extern int sysctl_lower_zone_protection;
>  extern int min_free_kbytes;
> +extern int shm_use_hugepages, shm_hugepages_per_file;
> +extern int mmap_use_hugepages, mmap_hugepages_map_sz;
>
>  /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
>  static int maxolduid = 65535;
> @@ -579,6 +581,40 @@ static ctl_table kern_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= &proc_dointvec,
>  	},
> +#ifdef CONFIG_HUGETLBFS
> +	{
> +		.ctl_name	= KERN_SHMUSEHUGEPAGES,
> +		.procname	= "shm-use-hugepages",
> +		.data		= &shm_use_hugepages,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= &proc_dointvec,
> +	},
> +	{
> +		.ctl_name	= KERN_MMAPUSEHUGEPAGES,
> +		.procname	= "mmap-use-hugepages",
> +		.data		= &mmap_use_hugepages,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= &proc_dointvec,
> +	},
> +	{
> +		.ctl_name	= KERN_HPAGES_PER_FILE,
> +		.procname	= "shm-hugepages-per-file",
> +		.data		= &shm_hugepages_per_file,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= &proc_dointvec,
> +	},
> +	{
> +		.ctl_name	= KERN_HPAGES_MAP_SZ,
> +		.procname	= "mmap-hugepages-min-mapping",
> +		.data		= &mmap_hugepages_map_sz,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= &proc_dointvec,
> +	},
> +#endif
>  	{ .ctl_name = 0 }
>  };
>
> diff -purN linux-2.6.0/mm/mmap.c linux-2.6.0-implicit/mm/mmap.c
> +++ linux-2.6.0-implicit/mm/mmap.c	2004-01-08 16:20:10.000000000 -0800
> @@ -20,6 +20,7 @@
>  #include <linux/profile.h>
>  #include <linux/module.h>
>  #include <linux/mount.h>
> +#include <linux/err.h>
>
>  #include <asm/uaccess.h>
>  #include <asm/pgalloc.h>
> @@ -59,6 +60,9 @@ EXPORT_SYMBOL(sysctl_overcommit_memory);
>  EXPORT_SYMBOL(sysctl_overcommit_ratio);
>  EXPORT_SYMBOL(vm_committed_space);
>
> +int mmap_use_hugepages = 0;
> +int mmap_hugepages_map_sz = 256;
> +
>  /*
>   * Requires inode->i_mapping->i_shared_sem
>   */
> @@ -473,7 +477,7 @@ unsigned long do_mmap_pgoff(struct file
>  	int correct_wcount = 0;
>  	int error;
>  	struct rb_node ** rb_link, * rb_parent;
> -	unsigned long charged = 0;
> +	unsigned long charged = 0, addr_save = addr;
>
>  	if (file) {
>  		if (!file->f_op || !file->f_op->mmap)
> @@ -501,8 +505,17 @@ unsigned long do_mmap_pgoff(struct file
>
>  	/* Obtain the address to map to. we verify (or select) it and ensure
>  	 * that it represents a valid section of the address space.
> +	 * VM_HUGETLB will never appear in vm_flags when CONFIG_HUGETLB is
> +	 * unset.
>  	 */
> -	addr = get_unmapped_area(file, addr, len, pgoff, flags);
> +#ifdef CONFIG_HUGETLBFS
> +	addr = try_hugetlb_get_unmapped_area(NULL, addr, len, pgoff, &flags);
> +	if (IS_ERR((void *)addr))
> +		return addr;

This doesn't look right - we don't fall back if try_hugetlb...()
fails.  But it can fail if we don't have the right permissions, for
one thing, in which case we certainly do want to fall back.

> +	else if (addr == 0)
> +#endif
> +		addr = get_unmapped_area(file, addr_save, len, pgoff, flags);

Hmm... yes.  I think the logic would be simpler if try_hugetlb...()
always returned an error, rather than zero, and we fell back in all
cases.  That also lets us eliminate the ugly #ifdef by defining
try_hugetlb...() to -ENOSYS in the !CONFIG_HUGETLBFS case.

>  	if (addr & ~PAGE_MASK)
>  		return addr;
>
> @@ -566,6 +579,9 @@ unsigned long do_mmap_pgoff(struct file
>  		default:
>  			return -EINVAL;
>  		case MAP_PRIVATE:
> +#ifdef CONFIG_HUGETLBFS
> +		case (MAP_PRIVATE|MAP_HUGETLB):
> +#endif

This bit of ugliness wouldn't be necessary if MAP_HUGETLB were up in
the high bits like it should be.

Also note that without my hugepage COW patches, MAP_PRIVATE semantics
don't actually work on hugepages.

>  			vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
>  			/* fall through */
>  		case MAP_SHARED:
> @@ -650,10 +666,31 @@ munmap_back:
>  		error = file->f_op->mmap(file, vma);
>  		if (error)
>  			goto unmap_and_free_vma;
> -	} else if (vm_flags & VM_SHARED) {
> -		error = shmem_zero_setup(vma);
> -		if (error)
> -			goto free_vma;
> +	} else if ((vm_flags & VM_SHARED) || (vm_flags & VM_HUGETLB)) {
> +		if (!is_vm_hugetlb_page(vma)) {
> +			error = shmem_zero_setup(vma);
> +			if (error)
> +				goto free_vma;
> +		} else {
> +			/*
> +			 * Presumably hugetlb_zero_setup() acquires a
> +			 * reference count for us. The difference
> +			 * between this and the shmem_zero_setup()
> +			 * case is that we can encounter an error
> +			 * _after_ allocating the file. The error
> +			 * path was adjusted slightly to fput() for us.
> +			 */
> +			struct file *new_file = hugetlb_zero_setup(len);
> +			if (IS_ERR(new_file)) {
> +				error = PTR_ERR(new_file);
> +				goto free_vma;
> +			} else {
> +				vma->vm_file = new_file;
> +				error = new_file->f_op->mmap(new_file, vma);
> +				if (error)
> +					goto unmap_and_free_vma;
> +			}
> +		}
>  	}
>
>  	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
> @@ -701,11 +738,21 @@ out:
>  unmap_and_free_vma:
>  	if (correct_wcount)
>  		atomic_inc(&inode->i_writecount);
> -	vma->vm_file = NULL;
> -	fput(file);
>
> -	/* Undo any partial mapping done by a device driver. */
> +	/*
> +	 * Undo any partial mapping done by a device driver.
> +	 * hugetlb wants to know the vma's file etc. so nuke
> +	 * the file afterward.
> +	 */
>  	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
> +
> +	/*
> +	 * vma->vm_file may be different from file in the hugetlb case.
> +	 */
> +	if (vma->vm_file)
> +		fput(vma->vm_file);
> +	vma->vm_file = NULL;
> +
>  free_vma:
>  	kmem_cache_free(vm_area_cachep, vma);
>  unacct_error:
> diff -purN linux-2.6.0/mm/shmem.c linux-2.6.0-implicit/mm/shmem.c
> +++ linux-2.6.0-implicit/mm/shmem.c	2004-01-08 16:19:31.000000000 -0800
> @@ -40,6 +40,29 @@
>  #include <asm/uaccess.h>
>  #include <asm/div64.h>
>
> +int shm_use_hugepages;
> +
> +/*
> + * On 64bit archs the vmalloc area is very large,
> + * so we allocate the array in vmalloc on 64bit archs.
> + *
> + * Assuming 2M pages (x86 and x86-64) those default setting
> + * will allow up to 128G of bigpages in a single file on
> + * 64bit archs and 64G on 32bit archs using the max
> + * kmalloc size of 128k. So tweaking in practice is needed
> + * only to go past 128G of bigpages per file on 64bit archs.
> + *
> + * This sysctl is in page units (each page large BIGPAGE_SIZE).
> + */
> +#ifdef CONFIG_HUGETLBFS
> +#if BITS_PER_LONG == 64
> +int shm_hugepages_per_file = 128UL << (30 - HPAGE_SHIFT);
> +#else
> +int shm_hugepages_per_file = 131072 / sizeof(struct page *);
> +#endif
> +#endif

I'm not sure what array this is talking about.  I don't see why this
limit on the number of hugepages per file exists.

>  /* This magic number is used in glibc for posix shared memory */
>  #define TMPFS_MAGIC	0x01021994
>

--
David Gibson			| For every complex problem there is a
david AT gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.
http://www.ozlabs.org/people/dgibson

** Sent via the linuxppc64-dev mail list. See http://lists.linuxppc.org/