Problems with THP in v4.5-rc4 on POWER

Paul Mackerras paulus at ozlabs.org
Sat Feb 20 12:39:42 AEDT 2016


It seems there's something wrong with our transparent hugepage
implementation on POWER server processors as of v4.5-rc4.  I have seen
the email thread on "[BUG] random kernel crashes after THP rework on
s390 (maybe also on PowerPC and ARM)", but this doesn't seem exactly
the same as that (though it may of course be related).

I have been testing v4.5-rc4 with Aneesh's patch "powerpc/mm/hash:
Clear the invalid slot information correctly" on top, on a KVM guest
with 160 vcpus (threads=8) and 32GB of memory backed by 16MB large
pages, running on a POWER8 machine running a 4.4.1 host kernel (20
cores * 8 threads, 128GB of RAM).  The guest kernel is compiled with
THP enabled and set to "always" (i.e. not "madvise").

On this setup, when doing something like a large kernel compile, I see
random segfaults happening (in gcc, cc1, sh, etc.).  I also see bursts
of messages like this on the host console:

[50957.570859] Harmless Hypervisor Maintenance interrupt [Recovered]
[50957.570864]  Error detail: Processor Recovery done
[50957.570869]  HMER: 2040000000000000

and once I saw an unrecoverable HMI that crashed the host.  I don't
see anything like this with a 4.3.0 kernel, and the same kernel
compile proceeds flawlessly.

One thing I discovered when debugging is that we are getting to
flush_hash_hugepage() with no slot array available
(i.e. get_hpte_slot_array(pmdp) returns NULL).  So I added code to do
an explicit search for any HPTEs that need to be flushed.  The patch
to do this is below.  For the pSeries LPAR case (which this is) I also
print a message with the number of HPTEs that were found.  When
running with this patch, I see the message printed out from time to
time, like this:

pSeries_lpar_hugepage_invalidate: no slot array, found 25 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 1 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 26 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 3 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 8 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 9 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 13 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 9 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 2 HPTEs
grep thp /proc/vmstat 
thp_fault_alloc 3253
thp_fault_fallback 0
thp_collapse_alloc 5
thp_collapse_alloc_failed 0
thp_split_page 0
thp_split_page_failed 0
thp_split_pmd 9
thp_zero_page_alloc 1
thp_zero_page_alloc_failed 0
[root at dyn386 ~]# pSeries_lpar_hugepage_invalidate: no slot array, found 8 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 29 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 196 HPTEs
gcc[58246]: unhandled signal 11 at 0000000000000008 nip 00003fff7a7d5a08 lr 00003fff7a7d57f4 code 30001
pSeries_lpar_hugepage_invalidate: no slot array, found 256 HPTEs

So in fact there are HPTEs to be invalidated in the cases where we
don't have the slot array - which could be an explanation for apparent
memory corruption.  However, doing the invalidations doesn't fix the
problems.  (Possibly there is a bug in my patch and we are still not
invalidating the HPTEs correctly.)

Interestingly, whenever the "no slot array" message happens, I
immediately see a burst of the HMI interrupt messages on the host
console.  So whatever we are doing wrong is creating a situation where
the CPU thinks it has encountered an error and needs to recover.

Thoughts? anyone?

Paul.
---
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 8eaac81..a433e16 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -339,22 +339,28 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 	struct hash_pte *hptep;
 	unsigned long hash;
 	unsigned long i;
-	long slot;
+	long slot, tries;
 	unsigned long want_v, hpte_v;
+	unsigned long mask;
 
 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
-	want_v = hpte_encode_avpn(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize) | HPTE_V_VALID;
+	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY | HPTE_V_VALID;
 
-	/* Bolted mappings are only ever in the primary group */
+	/* Search the primary group then the secondary */
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-	for (i = 0; i < HPTES_PER_GROUP; i++) {
-		hptep = htab_address + slot;
-		hpte_v = be64_to_cpu(hptep->v);
+	for (tries = 0; tries < 2; ++tries) {
+		for (i = 0; i < HPTES_PER_GROUP; i++) {
+			hptep = htab_address + slot;
+			hpte_v = be64_to_cpu(hptep->v);
 
-		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
-			/* HPTE matches */
-			return slot;
-		++slot;
+			if ((hpte_v & mask) == want_v)
+				/* HPTE matches */
+				return slot;
+			++slot;
+		}
+		want_v |= HPTE_V_SECONDARY;
+		slot = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
 	}
 
 	return -1;
@@ -448,20 +454,27 @@ static void native_hugepage_invalidate(unsigned long vsid,
 
 	local_irq_save(flags);
 	for (i = 0; i < max_hpte_count; i++) {
-		valid = hpte_valid(hpte_slot_array, i);
-		if (!valid)
-			continue;
-		hidx =  hpte_hash_index(hpte_slot_array, i);
-
 		/* get the vpn */
 		addr = s_addr + (i * (1ul << shift));
 		vpn = hpt_vpn(addr, vsid, ssize);
-		hash = hpt_hash(vpn, shift, ssize);
-		if (hidx & _PTEIDX_SECONDARY)
-			hash = ~hash;
+		if (hpte_slot_array) {
+			valid = hpte_valid(hpte_slot_array, i);
+			if (!valid)
+				continue;
+			hidx =  hpte_hash_index(hpte_slot_array, i);
 
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += hidx & _PTEIDX_GROUP_IX;
+			hash = hpt_hash(vpn, shift, ssize);
+			if (hidx & _PTEIDX_SECONDARY)
+				hash = ~hash;
+
+			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+			slot += hidx & _PTEIDX_GROUP_IX;
+		} else {
+			/* no slot array, will have to search for the HPTE */
+			slot = native_hpte_find(vpn, psize, ssize);
+			if (slot == (unsigned long)-1)
+				continue;
+		}
 
 		hptep = htab_address + slot;
 		want_v = hpte_encode_avpn(vpn, psize, ssize);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 47a0bc1..777ab6b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1348,6 +1348,13 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
 
 	s_addr = addr & HPAGE_PMD_MASK;
 	hpte_slot_array = get_hpte_slot_array(pmdp);
+
+	if (ppc_md.hugepage_invalidate) {
+		ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+					   psize, ssize, local);
+		goto tm_abort;
+	}
+
 	/*
 	 * IF we try to do a HUGE PTE update after a withdraw is done.
 	 * we will find the below NULL. This happens when we do
@@ -1356,13 +1363,8 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
 	if (!hpte_slot_array)
 		return;
 
-	if (ppc_md.hugepage_invalidate) {
-		ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
-					   psize, ssize, local);
-		goto tm_abort;
-	}
 	/*
-	 * No bluk hpte removal support, invalidate each entry
+	 * No bulk hpte removal support, invalidate each entry
 	 */
 	shift = mmu_psize_defs[psize].shift;
 	max_hpte_count = HPAGE_PMD_SIZE >> shift;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 477290a..142e669 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -323,7 +323,10 @@ static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_gr
 		unsigned long pteh;
 		unsigned long ptel;
 	} ptes[4];
+	unsigned long mask;
 
+	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY | HPTE_V_VALID;
+	want_v |= HPTE_V_VALID;
 	for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
 
 		lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
@@ -331,8 +334,7 @@ static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_gr
 			continue;
 
 		for (j = 0; j < 4; j++) {
-			if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
-			    (ptes[j].pteh & HPTE_V_VALID))
+			if ((ptes[j].pteh & mask) == want_v)
 				return i + j;
 		}
 	}
@@ -350,11 +352,17 @@ static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
 	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
-	/* Bolted entries are always in the primary group */
 	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
-	if (slot < 0)
-		return -1;
+	if (slot < 0) {
+		/* try the secondary group */
+		hash = ~hash;
+		want_v |= HPTE_V_SECONDARY;
+		hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
+		if (slot < 0)
+			return -1;
+	}
 	return hpte_group + slot;
 }
 
@@ -451,37 +459,46 @@ static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
 					     unsigned char *hpte_slot_array,
 					     int psize, int ssize, int local)
 {
-	int i, index = 0;
+	long i, index = 0;
 	unsigned long s_addr = addr;
 	unsigned int max_hpte_count, valid;
 	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
 	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
 	unsigned long shift, hidx, vpn = 0, hash, slot;
+	unsigned long count = 0;
 
 	shift = mmu_psize_defs[psize].shift;
 	max_hpte_count = 1U << (PMD_SHIFT - shift);
 
 	for (i = 0; i < max_hpte_count; i++) {
-		valid = hpte_valid(hpte_slot_array, i);
-		if (!valid)
-			continue;
-		hidx =  hpte_hash_index(hpte_slot_array, i);
-
 		/* get the vpn */
-		addr = s_addr + (i * (1ul << shift));
+		addr = s_addr + (i << shift);
 		vpn = hpt_vpn(addr, vsid, ssize);
-		hash = hpt_hash(vpn, shift, ssize);
-		if (hidx & _PTEIDX_SECONDARY)
-			hash = ~hash;
+		if (hpte_slot_array) {
+			valid = hpte_valid(hpte_slot_array, i);
+			if (!valid)
+				continue;
+			hidx =  hpte_hash_index(hpte_slot_array, i);
 
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += hidx & _PTEIDX_GROUP_IX;
+			hash = hpt_hash(vpn, shift, ssize);
+			if (hidx & _PTEIDX_SECONDARY)
+				hash = ~hash;
+
+			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+			slot += hidx & _PTEIDX_GROUP_IX;
+
+		} else {
+			slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
+			if (slot == (unsigned long)-1)
+				continue;
+			++count;
+		}
 
 		slot_array[index] = slot;
 		vpn_array[index] = vpn;
 		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
 			/*
-			 * Now do a bluk invalidate
+			 * Now do a bulk invalidate
 			 */
 			__pSeries_lpar_hugepage_invalidate(slot_array,
 							   vpn_array,
@@ -494,6 +511,8 @@ static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
 	if (index)
 		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
 						   index, psize, ssize);
+	if (!hpte_slot_array)
+		pr_info("pSeries_lpar_hugepage_invalidate: no slot array, found %ld HPTEs\n", count);
 }
 #else
 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,


More information about the Linuxppc-dev mailing list