[PATCH v12 13/31] mm: cache some VMA fields in the vm_fault structure
Jerome Glisse
jglisse at redhat.com
Tue Apr 23 06:06:54 AEST 2019
On Tue, Apr 16, 2019 at 03:45:04PM +0200, Laurent Dufour wrote:
> When handling a speculative page fault, the vma->vm_flags and
> vma->vm_page_prot fields are read once the page table lock is released,
> so there is no longer any guarantee that these fields will not change
> behind our back. They are therefore saved in the vm_fault structure
> before the VMA is checked for changes.
>
> In detail: when we deal with a speculative page fault, the mmap_sem is
> not taken, so parallel VMA changes can occur. When a VMA change is made
> which impacts page fault processing, we assume the VMA sequence counter
> is bumped. During page fault processing, at the time the PTE is locked,
> we check the VMA sequence counter to detect changes made behind our
> back. If no change is detected we can continue. But this does not
> prevent the VMA from being changed behind our back while the PTE is
> locked, so the VMA fields used while the PTE is locked must be saved to
> ensure that we are using *static* values. This is important since the
> PTE changes will be made based on these VMA fields, and they need to be
> consistent. This concerns the vma->vm_flags and vma->vm_page_prot
> fields.
>
> This patch also sets these fields in hugetlb_no_page() and
> __collapse_huge_page_swapin() even if they are not needed by the callees.
>
> Signed-off-by: Laurent Dufour <ldufour at linux.ibm.com>
I am unsure about one thing (see below), so you might need to update
that, but it would not change the structure of the patch, thus:
Reviewed-by: Jérôme Glisse <jglisse at redhat.com>
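
For context, the pattern the commit message describes looks roughly like
the sketch below (untested and simplified; it assumes the VMA sequence
counter added earlier in this series is named vm_sequence, and it elides
the page table walk, page allocation and error handling):

	struct vm_fault vmf = {
		.vma		= vma,
		.address	= address & PAGE_MASK,
		.flags		= flags,
		/* snapshot, used instead of vma->vm_flags/vm_page_prot below */
		.vma_flags	= READ_ONCE(vma->vm_flags),
		.vma_page_prot	= READ_ONCE(vma->vm_page_prot),
	};
	unsigned int seq = read_seqcount_begin(&vma->vm_sequence);
	pte_t entry;

	/* ... walk the page tables (fills vmf.pmd) and allocate 'page' ... */

	vmf.pte = pte_offset_map_lock(vma->vm_mm, vmf.pmd, vmf.address,
				      &vmf.ptl);
	if (read_seqcount_retry(&vma->vm_sequence, seq)) {
		/* the VMA changed behind our back, fall back to mmap_sem */
		pte_unmap_unlock(vmf.pte, vmf.ptl);
		return VM_FAULT_RETRY;
	}
	/* from here on only the cached, consistent values are used */
	entry = mk_pte(page, vmf.vma_page_prot);
	entry = maybe_mkwrite(pte_mkdirty(entry), vmf.vma_flags);
	set_pte_at(vma->vm_mm, vmf.address, vmf.pte, entry);
	pte_unmap_unlock(vmf.pte, vmf.ptl);
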
> ---
> include/linux/mm.h | 10 +++++++--
> mm/huge_memory.c | 6 +++---
> mm/hugetlb.c | 2 ++
> mm/khugepaged.c | 2 ++
> mm/memory.c | 53 ++++++++++++++++++++++++----------------------
> mm/migrate.c | 2 +-
> 6 files changed, 44 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 5d45b7d8718d..f465bb2b049e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -439,6 +439,12 @@ struct vm_fault {
> * page table to avoid allocation from
> * atomic context.
> */
> + /*
> + * These entries are required when handling speculative page fault.
> + * This way the page handling is done using consistent field values.
> + */
> + unsigned long vma_flags;
> + pgprot_t vma_page_prot;
> };
>
> /* page entry size for vm->huge_fault() */
> @@ -781,9 +787,9 @@ void free_compound_page(struct page *page);
> * pte_mkwrite. But get_user_pages can cause write faults for mappings
> * that do not have writing enabled, when used by access_process_vm.
> */
> -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> +static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
> {
> - if (likely(vma->vm_flags & VM_WRITE))
> + if (likely(vma_flags & VM_WRITE))
> pte = pte_mkwrite(pte);
> return pte;
> }
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 823688414d27..865886a689ee 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1244,8 +1244,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
>
> for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> pte_t entry;
> - entry = mk_pte(pages[i], vma->vm_page_prot);
> - entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> + entry = mk_pte(pages[i], vmf->vma_page_prot);
> + entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
> memcg = (void *)page_private(pages[i]);
> set_page_private(pages[i], 0);
> page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> @@ -2228,7 +2228,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> entry = pte_swp_mksoft_dirty(entry);
> } else {
> entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
> - entry = maybe_mkwrite(entry, vma);
> + entry = maybe_mkwrite(entry, vma->vm_flags);
> if (!write)
> entry = pte_wrprotect(entry);
> if (!young)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 109f5de82910..13246da4bc50 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -3812,6 +3812,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> .vma = vma,
> .address = haddr,
> .flags = flags,
> + .vma_flags = vma->vm_flags,
> + .vma_page_prot = vma->vm_page_prot,
Shouldn't you use READ_ONCE()? I doubt the compiler will do something
creative with struct initialization, but since you are using WRITE_ONCE()
to update those fields, pairing the reads with READ_ONCE() where the
mmap_sem is not always held might make sense.
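
i.e. something along these lines (untested sketch, and assuming
READ_ONCE() is fine on pgprot_t, which is a single word on the
architectures this series targets):

	struct vm_fault vmf = {
		.vma		= vma,
		.address	= haddr,
		.flags		= flags,
		/* pair with the WRITE_ONCE() updates of these two fields */
		.vma_flags	= READ_ONCE(vma->vm_flags),
		.vma_page_prot	= READ_ONCE(vma->vm_page_prot),
	};
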
> /*
> * Hard to debug if it ends up being
> * used by a callee that assumes
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6a0cbca3885e..42469037240a 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -888,6 +888,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> .flags = FAULT_FLAG_ALLOW_RETRY,
> .pmd = pmd,
> .pgoff = linear_page_index(vma, address),
> + .vma_flags = vma->vm_flags,
> + .vma_page_prot = vma->vm_page_prot,
Same as above.
[...]
> return VM_FAULT_FALLBACK;
> @@ -3924,6 +3925,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> .flags = flags,
> .pgoff = linear_page_index(vma, address),
> .gfp_mask = __get_fault_gfp_mask(vma),
> + .vma_flags = vma->vm_flags,
> + .vma_page_prot = vma->vm_page_prot,
Same as above.
> };
> unsigned int dirty = flags & FAULT_FLAG_WRITE;
> struct mm_struct *mm = vma->vm_mm;
> diff --git a/mm/migrate.c b/mm/migrate.c
> index f2ecc2855a12..a9138093a8e2 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -240,7 +240,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
> */
> entry = pte_to_swp_entry(*pvmw.pte);
> if (is_write_migration_entry(entry))
> - pte = maybe_mkwrite(pte, vma);
> + pte = maybe_mkwrite(pte, vma->vm_flags);
>
> if (unlikely(is_zone_device_page(new))) {
> if (is_device_private_page(new)) {
> --
> 2.21.0
>