Problems with THP in v4.5-rc4 on POWER
Paul Mackerras
paulus at ozlabs.org
Sat Feb 20 12:39:42 AEDT 2016
It seems there's something wrong with our transparent hugepage
implementation on POWER server processors as of v4.5-rc4. I have seen
the email thread on "[BUG] random kernel crashes after THP rework on
s390 (maybe also on PowerPC and ARM)", but this doesn't seem exactly
the same as that (though it may of course be related).
I have been testing v4.5-rc4 with Aneesh's patch "powerpc/mm/hash:
Clear the invalid slot information correctly" on top, on a KVM guest
with 160 vcpus (threads=8) and 32GB of memory backed by 16MB large
pages, running on a POWER8 machine running a 4.4.1 host kernel (20
cores * 8 threads, 128GB of RAM). The guest kernel is compiled with
THP enabled and set to "always" (i.e. not "madvise").
On this setup, when doing something like a large kernel compile, I see
random segfaults happening (in gcc, cc1, sh, etc.). I also see bursts
of messages like this on the host console:
[50957.570859] Harmless Hypervisor Maintenance interrupt [Recovered]
[50957.570864] Error detail: Processor Recovery done
[50957.570869] HMER: 2040000000000000
and once I saw an unrecoverable HMI that crashed the host. I don't
see anything like this with a 4.3.0 kernel, and the same kernel
compile proceeds flawlessly.
One thing I discovered when debugging is that we are getting to
flush_hash_hugepage() with no slot array available
(i.e. get_hpte_slot_array(pmdp) returns NULL). So I added code to do
an explicit search for any HPTEs that need to be flushed. The patch
to do this is below. For the pSeries LPAR case (which this is) I also
print a message with the number of HPTEs that were found. When
running with this patch, I see the message printed out from time to
time, like this:
pSeries_lpar_hugepage_invalidate: no slot array, found 25 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 1 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 26 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 3 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 8 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 9 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 13 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 9 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 2 HPTEs
grep thp /proc/vmstat
thp_fault_alloc 3253
thp_fault_fallback 0
thp_collapse_alloc 5
thp_collapse_alloc_failed 0
thp_split_page 0
thp_split_page_failed 0
thp_split_pmd 9
thp_zero_page_alloc 1
thp_zero_page_alloc_failed 0
[root at dyn386 ~]# pSeries_lpar_hugepage_invalidate: no slot array, found 8 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 29 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 196 HPTEs
gcc[58246]: unhandled signal 11 at 0000000000000008 nip 00003fff7a7d5a08 lr 00003fff7a7d57f4 code 30001
pSeries_lpar_hugepage_invalidate: no slot array, found 256 HPTEs
So in fact there are HPTEs to be invalidated in the cases where we
don't have the slot array - which could be an explanation for apparent
memory corruption. However, doing the invalidations doesn't fix the
problems. (Possibly there is a bug in my patch and we are still not
invalidating the HPTEs correctly.)
Interestingly, whenever the "no slot array" message happens, I
immediately see a burst of the HMI interrupt messages on the host
console. So whatever we are doing wrong is creating a situation where
the CPU thinks it has encountered an error and needs to recover.
Thoughts? anyone?
Paul.
---
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 8eaac81..a433e16 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -339,22 +339,28 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
struct hash_pte *hptep;
unsigned long hash;
unsigned long i;
- long slot;
+ long slot, tries;
unsigned long want_v, hpte_v;
+ unsigned long mask;
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_avpn(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize) | HPTE_V_VALID;
+ mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY | HPTE_V_VALID;
- /* Bolted mappings are only ever in the primary group */
+ /* Search the primary group then the secondary */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- for (i = 0; i < HPTES_PER_GROUP; i++) {
- hptep = htab_address + slot;
- hpte_v = be64_to_cpu(hptep->v);
+ for (tries = 0; tries < 2; ++tries) {
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ hptep = htab_address + slot;
+ hpte_v = be64_to_cpu(hptep->v);
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
- /* HPTE matches */
- return slot;
- ++slot;
+ if ((hpte_v & mask) == want_v)
+ /* HPTE matches */
+ return slot;
+ ++slot;
+ }
+ want_v |= HPTE_V_SECONDARY;
+ slot = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
}
return -1;
@@ -448,20 +454,27 @@ static void native_hugepage_invalidate(unsigned long vsid,
local_irq_save(flags);
for (i = 0; i < max_hpte_count; i++) {
- valid = hpte_valid(hpte_slot_array, i);
- if (!valid)
- continue;
- hidx = hpte_hash_index(hpte_slot_array, i);
-
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
vpn = hpt_vpn(addr, vsid, ssize);
- hash = hpt_hash(vpn, shift, ssize);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
+ if (hpte_slot_array) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ } else {
+ /* no slot array, will have to search for the HPTE */
+ slot = native_hpte_find(vpn, psize, ssize);
+ if (slot == (unsigned long)-1)
+ continue;
+ }
hptep = htab_address + slot;
want_v = hpte_encode_avpn(vpn, psize, ssize);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 47a0bc1..777ab6b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1348,6 +1348,13 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
s_addr = addr & HPAGE_PMD_MASK;
hpte_slot_array = get_hpte_slot_array(pmdp);
+
+ if (ppc_md.hugepage_invalidate) {
+ ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+ psize, ssize, local);
+ goto tm_abort;
+ }
+
/*
* IF we try to do a HUGE PTE update after a withdraw is done.
* we will find the below NULL. This happens when we do
@@ -1356,13 +1363,8 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
if (!hpte_slot_array)
return;
- if (ppc_md.hugepage_invalidate) {
- ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
- psize, ssize, local);
- goto tm_abort;
- }
/*
- * No bluk hpte removal support, invalidate each entry
+ * No bulk hpte removal support, invalidate each entry
*/
shift = mmu_psize_defs[psize].shift;
max_hpte_count = HPAGE_PMD_SIZE >> shift;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 477290a..142e669 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -323,7 +323,10 @@ static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_gr
unsigned long pteh;
unsigned long ptel;
} ptes[4];
+ unsigned long mask;
+ mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY | HPTE_V_VALID;
+ want_v |= HPTE_V_VALID;
for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
@@ -331,8 +334,7 @@ static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_gr
continue;
for (j = 0; j < 4; j++) {
- if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
- (ptes[j].pteh & HPTE_V_VALID))
+ if ((ptes[j].pteh & mask) == want_v)
return i + j;
}
}
@@ -350,11 +352,17 @@ static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
want_v = hpte_encode_avpn(vpn, psize, ssize);
- /* Bolted entries are always in the primary group */
hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
- if (slot < 0)
- return -1;
+ if (slot < 0) {
+ /* try the secondary group */
+ hash = ~hash;
+ want_v |= HPTE_V_SECONDARY;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
+ if (slot < 0)
+ return -1;
+ }
return hpte_group + slot;
}
@@ -451,37 +459,46 @@ static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
unsigned char *hpte_slot_array,
int psize, int ssize, int local)
{
- int i, index = 0;
+ long i, index = 0;
unsigned long s_addr = addr;
unsigned int max_hpte_count, valid;
unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
unsigned long shift, hidx, vpn = 0, hash, slot;
+ unsigned long count = 0;
shift = mmu_psize_defs[psize].shift;
max_hpte_count = 1U << (PMD_SHIFT - shift);
for (i = 0; i < max_hpte_count; i++) {
- valid = hpte_valid(hpte_slot_array, i);
- if (!valid)
- continue;
- hidx = hpte_hash_index(hpte_slot_array, i);
-
/* get the vpn */
- addr = s_addr + (i * (1ul << shift));
+ addr = s_addr + (i << shift);
vpn = hpt_vpn(addr, vsid, ssize);
- hash = hpt_hash(vpn, shift, ssize);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
+ if (hpte_slot_array) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ } else {
+ slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
+ if (slot == (unsigned long)-1)
+ continue;
+ ++count;
+ }
slot_array[index] = slot;
vpn_array[index] = vpn;
if (index == PPC64_HUGE_HPTE_BATCH - 1) {
/*
- * Now do a bluk invalidate
+ * Now do a bulk invalidate
*/
__pSeries_lpar_hugepage_invalidate(slot_array,
vpn_array,
@@ -494,6 +511,8 @@ static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
if (index)
__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
index, psize, ssize);
+ if (!hpte_slot_array)
+ pr_info("pSeries_lpar_hugepage_invalidate: no slot array, found %ld HPTEs\n", count);
}
#else
static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
More information about the Linuxppc-dev
mailing list