[RFC PATCH 1/1] powerpc/pgtable: Skip serialize_against_pte_lookup() when unmapping

Leonardo Bras leonardo at linux.ibm.com
Tue Jan 14 08:11:26 AEDT 2020


If a process (qemu) with a lot of CPUs (128) tries to munmap() a large
chunk of memory (496GB) mapped with THP, it takes an average of 275
seconds, which can cause a lot of problems for the workload (in the qemu
case, the guest will lock up for this time).

Trying to find the source of this bug, I found out that most of this time
is spent in serialize_against_pte_lookup(). This function takes a lot
of time in smp_call_function_many() if there are more than a couple of
CPUs running the user process. Since it has to happen for every THP
mapped, it takes a very long time for large amounts of memory.

According to the docs, serialize_against_pte_lookup() is needed to
prevent the pmd_t to pte_t casting inside find_current_mm_pte(), or any
lockless pagetable walk, from happening concurrently with THP
splitting/collapsing.

In this case, as the page is being munmapped, there is no need to call
serialize_against_pte_lookup(), given that the mapping will not be used
during or after munmap.

This patch does so by adding an option to skip serializing in
radix__pmdp_huge_get_and_clear(). This option is used by the proxy
__pmdp_huge_get_and_clear(), which is called with 'unmap == true' from
a (new) arch version of pmdp_huge_get_and_clear_full(), and with
'unmap == false' from pmdp_huge_get_and_clear(), which is used by
generic code.

pmdp_huge_get_and_clear_full() is only called in zap_huge_pmd(), so
it's safe to assume it will always be called on memory that will be
unmapped.

On my workload (qemu: 128vcpus + 500GB), I could see munmap's time
drop from 275 seconds to 39ms.

Signed-off-by: Leonardo Bras <leonardo at linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 25 +++++++++++++++++---
 arch/powerpc/include/asm/book3s/64/radix.h   |  3 ++-
 arch/powerpc/mm/book3s64/radix_pgtable.c     |  6 +++--
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b01624e5c467..5e3e7c48624a 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1243,14 +1243,21 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 				     unsigned long address, pmd_t *pmdp);
 
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
-static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
-					    unsigned long addr, pmd_t *pmdp)
+static inline pmd_t __pmdp_huge_get_and_clear(struct mm_struct *mm,
+					      unsigned long addr, pmd_t *pmdp,
+					      bool unmap)
 {
 	if (radix_enabled())
-		return radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
+		return radix__pmdp_huge_get_and_clear(mm, addr, pmdp, !unmap);
 	return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
 }
 
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+					    unsigned long addr, pmd_t *pmdp)
+{
+	return __pmdp_huge_get_and_clear(mm, addr, pmdp, false);
+}
+
 static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
 					unsigned long address, pmd_t *pmdp)
 {
@@ -1337,6 +1344,18 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
 void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
 			     pte_t *, pte_t, pte_t);
 
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+						 unsigned long address,
+						 pmd_t *pmdp,
+						 int full)
+{
+	/*
+	 * Called only on unmapping
+	 */
+	return __pmdp_huge_get_and_clear(mm, address, pmdp, true);
+}
+
 /*
  * Returns true for a R -> RW upgrade of pte
  */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..148874aa5260 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -253,7 +253,8 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 					pgtable_t pgtable);
 extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
-				      unsigned long addr, pmd_t *pmdp);
+					    unsigned long addr, pmd_t *pmdp,
+					    bool serialize);
 static inline int radix__has_transparent_hugepage(void)
 {
 	/* For radix 2M at PMD level means thp */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 974109bb85db..eac8409cd316 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1007,7 +1007,8 @@ pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 }
 
 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
-				     unsigned long addr, pmd_t *pmdp)
+				     unsigned long addr, pmd_t *pmdp,
+				     bool serialize)
 {
 	pmd_t old_pmd;
 	unsigned long old;
@@ -1024,7 +1025,8 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
 	 * different code paths. So make sure we wait for the parallel
 	 * find_current_mm_pte to finish.
 	 */
-	serialize_against_pte_lookup(mm);
+	if (serialize)
+		serialize_against_pte_lookup(mm);
 	return old_pmd;
 }
 
-- 
2.24.1



More information about the Linuxppc-dev mailing list