[RFC PATCH] powerpc/book3s64/radix: Upgrade va tlbie to PID tlbie if we cross PMD_SIZE

Nicholas Piggin npiggin at gmail.com
Wed Aug 4 16:39:44 AEST 2021


Excerpts from Nicholas Piggin's message of August 4, 2021 3:14 pm:
> Excerpts from Aneesh Kumar K.V's message of August 4, 2021 12:37 am:
>> With a shared mapping, even though we are unmapping a large range, the kernel
>> will force a TLB flush with the ptl lock held to avoid the race mentioned in
>> commit 1cf35d47712d ("mm: split 'tlb_flush_mmu()' into tlb flushing and memory freeing parts").
>> This results in the kernel issuing a high number of TLB flushes even for a large
>> range. This can be improved by making sure the kernel switches to a pid-based
>> flush if it is unmapping a 2M range.
> 
> It would be good to have a bit more description here.
> 
> In any patch that changes a heuristic like this, I would like to see 
> some justification or reasoning that could be refuted or used as a 
> supporting argument if we ever wanted to change the heuristic later.
> Ideally with some of the obvious downsides listed as well.
> 
> This "improves" things here, but what if it hurt things elsewhere, how 
> would we come in later and decide to change it back?
> 
> THP flushes, for example: I think they'll now do PID flushes (if they
> have to be broadcast, which they will tend to be when khugepaged does
> them). So that might increase jitter for THP and cause it to be a
> loss for more workloads.
> 
> So where do you notice this? What's the benefit?

For that matter, I wonder if we shouldn't do something like this
(untested) so the low-level batch flush has visibility into the
high-level flush range.

x86 could use this too AFAIKS; it just needs to pass the range a bit
further down. In practice I'm not sure it would ever really matter for
them, because a 2MB range already exceeds the single page flush ceiling
with 4K pages, unlike powerpc with 64K pages. But in corner cases where
the unmap crossed a bunch of small vmas, or the ceiling was increased,
it could in theory be of use.
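
To put rough numbers on that ceiling comparison, here is a standalone
sketch (not part of the patch; it assumes the radix default
tlb_single_page_flush_ceiling of 33 from the hunk below, and x86's
default may differ):

#include <stdio.h>

int main(void)
{
        /* default radix tlb_single_page_flush_ceiling (see hunk below) */
        unsigned long ceiling = 33;
        unsigned long range = 2UL << 20;        /* a 2MB unmap */

        /* 4K pages: 512 pages, already well over the ceiling */
        printf("4K pages:  %lu -> PID flush: %s\n", range >> 12,
               (range >> 12) > ceiling ? "yes" : "no");

        /* 64K pages: 32 pages, still under the ceiling */
        printf("64K pages: %lu -> PID flush: %s\n", range >> 16,
               (range >> 16) > ceiling ? "yes" : "no");

        return 0;
}

So with 4K pages a 2MB unmap already tips the heuristic to a full PID
flush on its own, while with 64K pages it stays just under the 33-page
ceiling unless the entire logical range is taken into account.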

Subject: [PATCH v1] mm/mmu_gather: provide archs with the entire range that is
 to be flushed, not just the particular gather

This allows archs to better tune their flushing heuristics when flush
operations force smaller flush batches. For example, an architecture may
choose per-page invalidation, which is more costly per operation, for
small ranges of pages, on the assumption that a full TLB flush would be
more expensive overall in terms of refills. However, if a very large
range is forced to be flushed as a series of small batches, the faster
full-process flush may have been the better choice.

---
 arch/powerpc/mm/book3s64/radix_tlb.c | 33 ++++++++++++++++------------
 fs/exec.c                            |  3 ++-
 include/asm-generic/tlb.h            |  9 ++++++++
 include/linux/mm_types.h             |  3 ++-
 mm/hugetlb.c                         |  2 +-
 mm/madvise.c                         |  6 ++---
 mm/memory.c                          |  4 ++--
 mm/mmap.c                            |  2 +-
 mm/mmu_gather.c                      | 10 ++++++---
 mm/oom_kill.c                        |  2 +-
 10 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index aefc100d79a7..e1072d85d72e 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -1110,12 +1110,13 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
 
 static inline void __radix__flush_tlb_range(struct mm_struct *mm,
-					    unsigned long start, unsigned long end)
+					    unsigned long start, unsigned long end,
+					    unsigned long entire_range)
 {
 	unsigned long pid;
 	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
 	unsigned long page_size = 1UL << page_shift;
-	unsigned long nr_pages = (end - start) >> page_shift;
+	unsigned long entire_nr_pages = entire_range >> page_shift;
 	bool fullmm = (end == TLB_FLUSH_ALL);
 	bool flush_pid, flush_pwc = false;
 	enum tlb_flush_type type;
@@ -1133,9 +1134,9 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 	if (fullmm)
 		flush_pid = true;
 	else if (type == FLUSH_TYPE_GLOBAL)
-		flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+		flush_pid = entire_nr_pages > tlb_single_page_flush_ceiling;
 	else
-		flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
+		flush_pid = entire_nr_pages > tlb_local_single_page_flush_ceiling;
 	/*
 	 * full pid flush already does the PWC flush. if it is not full pid
 	 * flush check the range is more than PMD and force a pwc flush
@@ -1220,7 +1221,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		return radix__flush_hugetlb_tlb_range(vma, start, end);
 #endif
 
-	__radix__flush_tlb_range(vma->vm_mm, start, end);
+	__radix__flush_tlb_range(vma->vm_mm, start, end, end - start);
 }
 EXPORT_SYMBOL(radix__flush_tlb_range);
 
@@ -1278,6 +1279,11 @@ void radix__flush_all_lpid_guest(unsigned int lpid)
 	_tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
 }
 
+static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				unsigned long entire_range,
+				int psize, bool also_pwc);
+
 void radix__tlb_flush(struct mmu_gather *tlb)
 {
 	int psize = 0;
@@ -1285,6 +1291,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	int page_size = tlb->page_size;
 	unsigned long start = tlb->start;
 	unsigned long end = tlb->end;
+	unsigned long entire_range = tlb->entire_end - tlb->entire_start;
 
 	/*
 	 * if page size is not something we understand, do a full mm flush
@@ -1301,21 +1308,19 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 		else
 			radix__flush_all_mm(mm);
 	} else {
-		if (!tlb->freed_tables)
-			radix__flush_tlb_range_psize(mm, start, end, psize);
-		else
-			radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+		__radix__flush_tlb_range_psize(mm, start, end, entire_range, psize, tlb->freed_tables);
 	}
 }
 
 static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 				unsigned long start, unsigned long end,
+				unsigned long entire_range,
 				int psize, bool also_pwc)
 {
 	unsigned long pid;
 	unsigned int page_shift = mmu_psize_defs[psize].shift;
 	unsigned long page_size = 1UL << page_shift;
-	unsigned long nr_pages = (end - start) >> page_shift;
+	unsigned long entire_nr_pages = entire_range >> page_shift;
 	bool fullmm = (end == TLB_FLUSH_ALL);
 	bool flush_pid;
 	enum tlb_flush_type type;
@@ -1335,9 +1340,9 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 	if (fullmm)
 		flush_pid = true;
 	else if (type == FLUSH_TYPE_GLOBAL)
-		flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+		flush_pid = entire_nr_pages > tlb_single_page_flush_ceiling;
 	else
-		flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
+		flush_pid = entire_nr_pages > tlb_local_single_page_flush_ceiling;
 
 	if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
 		unsigned long tgt = H_RPTI_TARGET_CMMU;
@@ -1381,13 +1386,13 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 				  unsigned long end, int psize)
 {
-	return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+	return __radix__flush_tlb_range_psize(mm, start, end, end - start, psize, false);
 }
 
 void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
 				      unsigned long end, int psize)
 {
-	__radix__flush_tlb_range_psize(mm, start, end, psize, true);
+	__radix__flush_tlb_range_psize(mm, start, end, end - start, psize, true);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/fs/exec.c b/fs/exec.c
index 38f63451b928..c769c12bdf56 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -705,11 +705,11 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		return -ENOMEM;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm);
 	if (new_end > old_start) {
 		/*
 		 * when the old and new regions overlap clear from new_end.
 		 */
+		tlb_gather_mmu(&tlb, mm, new_end, old_end);
 		free_pgd_range(&tlb, new_end, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
 	} else {
@@ -719,6 +719,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		 * have constraints on va-space that make this illegal (IA64) -
 		 * for the others its just a little faster.
 		 */
+		tlb_gather_mmu(&tlb, mm, old_start, old_end);
 		free_pgd_range(&tlb, old_start, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
 	}
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 2c68a545ffa7..857fd83af695 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -256,6 +256,15 @@ struct mmu_gather {
 	struct mmu_table_batch	*batch;
 #endif
 
+	/*
+	 * This is the range of the "entire" logical flush
+	 * operation being performed. It does not relate to
+	 * the current batch to flush, but it can inform
+	 * heuristics that choose the best flushing strategy.
+	 */
+	unsigned long		entire_start;
+	unsigned long		entire_end;
+
 	unsigned long		start;
 	unsigned long		end;
 	/*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 52bbd2b7cb46..9d2ff06b574c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -599,7 +599,8 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 }
 
 struct mmu_gather;
-extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
+extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+				unsigned long start, unsigned long end);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_finish_mmu(struct mmu_gather *tlb);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dfc940d5221d..e41106eb4df7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4458,7 +4458,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
 	struct mmu_gather tlb;
 
-	tlb_gather_mmu(&tlb, vma->vm_mm);
+	tlb_gather_mmu(&tlb, vma->vm_mm, start, end);
 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
 	tlb_finish_mmu(&tlb);
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 6d3d348b17f4..b3634672aeb9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -508,7 +508,7 @@ static long madvise_cold(struct vm_area_struct *vma,
 		return -EINVAL;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm);
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
 	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
 	tlb_finish_mmu(&tlb);
 
@@ -561,7 +561,7 @@ static long madvise_pageout(struct vm_area_struct *vma,
 		return 0;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm);
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
 	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
 	tlb_finish_mmu(&tlb);
 
@@ -726,7 +726,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 				range.start, range.end);
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm);
+	tlb_gather_mmu(&tlb, mm, range.start, range.end);
 	update_hiwater_rss(mm);
 
 	mmu_notifier_invalidate_range_start(&range);
diff --git a/mm/memory.c b/mm/memory.c
index 25fc46e87214..61c303e84baf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1647,7 +1647,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 	lru_add_drain();
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
 				start, start + size);
-	tlb_gather_mmu(&tlb, vma->vm_mm);
+	tlb_gather_mmu(&tlb, vma->vm_mm, range.start, range.end);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
 	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
@@ -1674,7 +1674,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
 	lru_add_drain();
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
 				address, address + size);
-	tlb_gather_mmu(&tlb, vma->vm_mm);
+	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
 	unmap_single_vma(&tlb, vma, address, range.end, details);
diff --git a/mm/mmap.c b/mm/mmap.c
index ca54d36d203a..f2808febde40 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2672,7 +2672,7 @@ static void unmap_region(struct mm_struct *mm,
 	struct mmu_gather tlb;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm);
+	tlb_gather_mmu(&tlb, mm, start, end);
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, vma, start, end);
 	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 1b9837419bf9..863a5bd7e650 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -250,9 +250,12 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
 }
 
 static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			     unsigned long start, unsigned long end,
 			     bool fullmm)
 {
 	tlb->mm = mm;
+	tlb->entire_start = start;
+	tlb->entire_end = end;
 	tlb->fullmm = fullmm;
 
 #ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -281,9 +284,10 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
  * Called to initialize an (on-stack) mmu_gather structure for page-table
  * tear-down from @mm.
  */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+		    unsigned long start, unsigned long end)
 {
-	__tlb_gather_mmu(tlb, mm, false);
+	__tlb_gather_mmu(tlb, mm, start, end, false);
 }
 
 /**
@@ -299,7 +303,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
  */
 void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
 {
-	__tlb_gather_mmu(tlb, mm, true);
+	__tlb_gather_mmu(tlb, mm, 0, ~0UL, true);
 }
 
 /**
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c729a4c4a1ac..bfcc9cbdfb20 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -545,7 +545,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 			mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
 						vma, mm, vma->vm_start,
 						vma->vm_end);
-			tlb_gather_mmu(&tlb, mm);
+			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
 			if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
 				tlb_finish_mmu(&tlb);
 				ret = false;
-- 
2.23.0


