It seems there's something wrong with our transparent hugepage
implementation on POWER server processors as of v4.5-rc4.  I have seen
the email thread on "[BUG] random kernel crashes after THP rework on
s390 (maybe also on PowerPC and ARM)", but this doesn't seem exactly
the same as that (though it may of course be related).

I have been testing v4.5-rc4 with Aneesh's patch "powerpc/mm/hash:
Clear the invalid slot information correctly" on top, on a KVM guest
with 160 vcpus (threads=8) and 32GB of memory backed by 16MB large
pages, running on a POWER8 machine running a 4.4.1 host kernel (20
cores * 8 threads, 128GB of RAM).  The guest kernel is compiled with
THP enabled and set to "always" (i.e. not "madvise").

On this setup, when doing something like a large kernel compile, I see
random segfaults happening (in gcc, cc1, sh, etc.).  I also see bursts
of messages like this on the host console:

[50957.570859] Harmless Hypervisor Maintenance interrupt [Recovered]
[50957.570864]  Error detail: Processor Recovery done
[50957.570869]  HMER: 2040000000000000

and once I saw an unrecoverable HMI that crashed the host.  I don't
see anything like this with a 4.3.0 kernel, and the same kernel
compile proceeds flawlessly.

One thing I discovered when debugging is that we are getting to
flush_hash_hugepage() with no slot array available
(i.e. get_hpte_slot_array(pmdp) returns NULL).  So I added code to do
an explicit search for any HPTEs that need to be flushed.  The patch
to do this is below.  For the pSeries LPAR case (which this is) I also
print a message with the number of HPTEs that were found.  When
running with this patch, I see the message printed out from time to
time, like this:

pSeries_lpar_hugepage_invalidate: no slot array, found 25 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 1 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 26 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 3 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 8 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 9 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 13 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 9 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 2 HPTEs
grep thp /proc/vmstat 
thp_fault_alloc 3253
thp_fault_fallback 0
thp_collapse_alloc 5
thp_collapse_alloc_failed 0
thp_split_page 0
thp_split_page_failed 0
thp_split_pmd 9
thp_zero_page_alloc 1
thp_zero_page_alloc_failed 0
[root@dyn386 ~]# pSeries_lpar_hugepage_invalidate: no slot array, found 8 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 29 HPTEs
pSeries_lpar_hugepage_invalidate: no slot array, found 196 HPTEs
gcc[58246]: unhandled signal 11 at 0000000000000008 nip 00003fff7a7d5a08 lr 
00003fff7a7d57f4 code 30001
pSeries_lpar_hugepage_invalidate: no slot array, found 256 HPTEs

So in fact there are HPTEs to be invalidated in the cases where we
don't have the slot array - which could be an explanation for apparent
memory corruption.  However, doing the invalidations doesn't fix the
problems.  (Possibly there is a bug in my patch and we are still not
invalidating the HPTEs correctly.)

Interestingly, whenever the "no slot array" message happens, I
immediately see a burst of the HMI interrupt messages on the host
console.  So whatever we are doing wrong is creating a situation where
the CPU thinks it has encountered an error and needs to recover.

Thoughts? anyone?

Paul.
---
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 8eaac81..a433e16 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -339,22 +339,28 @@ static long native_hpte_find(unsigned long vpn, int 
psize, int ssize)
        struct hash_pte *hptep;
        unsigned long hash;
        unsigned long i;
-       long slot;
+       long slot, tries;
        unsigned long want_v, hpte_v;
+       unsigned long mask;
 
        hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
-       want_v = hpte_encode_avpn(vpn, psize, ssize);
+       want_v = hpte_encode_avpn(vpn, psize, ssize) | HPTE_V_VALID;
+       mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY | HPTE_V_VALID;
 
-       /* Bolted mappings are only ever in the primary group */
+       /* Search the primary group then the secondary */
        slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-       for (i = 0; i < HPTES_PER_GROUP; i++) {
-               hptep = htab_address + slot;
-               hpte_v = be64_to_cpu(hptep->v);
+       for (tries = 0; tries < 2; ++tries) {
+               for (i = 0; i < HPTES_PER_GROUP; i++) {
+                       hptep = htab_address + slot;
+                       hpte_v = be64_to_cpu(hptep->v);
 
-               if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
-                       /* HPTE matches */
-                       return slot;
-               ++slot;
+                       if ((hpte_v & mask) == want_v)
+                               /* HPTE matches */
+                               return slot;
+                       ++slot;
+               }
+               want_v |= HPTE_V_SECONDARY;
+               slot = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
        }
 
        return -1;
@@ -448,20 +454,27 @@ static void native_hugepage_invalidate(unsigned long vsid,
 
        local_irq_save(flags);
        for (i = 0; i < max_hpte_count; i++) {
-               valid = hpte_valid(hpte_slot_array, i);
-               if (!valid)
-                       continue;
-               hidx =  hpte_hash_index(hpte_slot_array, i);
-
                /* get the vpn */
                addr = s_addr + (i * (1ul << shift));
                vpn = hpt_vpn(addr, vsid, ssize);
-               hash = hpt_hash(vpn, shift, ssize);
-               if (hidx & _PTEIDX_SECONDARY)
-                       hash = ~hash;
+               if (hpte_slot_array) {
+                       valid = hpte_valid(hpte_slot_array, i);
+                       if (!valid)
+                               continue;
+                       hidx =  hpte_hash_index(hpte_slot_array, i);
 
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += hidx & _PTEIDX_GROUP_IX;
+                       hash = hpt_hash(vpn, shift, ssize);
+                       if (hidx & _PTEIDX_SECONDARY)
+                               hash = ~hash;
+
+                       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+                       slot += hidx & _PTEIDX_GROUP_IX;
+               } else {
+                       /* no slot array, will have to search for the HPTE */
+                       slot = native_hpte_find(vpn, psize, ssize);
+                       if (slot == (unsigned long)-1)
+                               continue;
+               }
 
                hptep = htab_address + slot;
                want_v = hpte_encode_avpn(vpn, psize, ssize);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 47a0bc1..777ab6b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1348,6 +1348,13 @@ void flush_hash_hugepage(unsigned long vsid, unsigned 
long addr,
 
        s_addr = addr & HPAGE_PMD_MASK;
        hpte_slot_array = get_hpte_slot_array(pmdp);
+
+       if (ppc_md.hugepage_invalidate) {
+               ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+                                          psize, ssize, local);
+               goto tm_abort;
+       }
+
        /*
         * IF we try to do a HUGE PTE update after a withdraw is done.
         * we will find the below NULL. This happens when we do
@@ -1356,13 +1363,8 @@ void flush_hash_hugepage(unsigned long vsid, unsigned 
long addr,
        if (!hpte_slot_array)
                return;
 
-       if (ppc_md.hugepage_invalidate) {
-               ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
-                                          psize, ssize, local);
-               goto tm_abort;
-       }
        /*
-        * No bluk hpte removal support, invalidate each entry
+        * No bulk hpte removal support, invalidate each entry
         */
        shift = mmu_psize_defs[psize].shift;
        max_hpte_count = HPAGE_PMD_SIZE >> shift;
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index 477290a..142e669 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -323,7 +323,10 @@ static long __pSeries_lpar_hpte_find(unsigned long want_v, 
unsigned long hpte_gr
                unsigned long pteh;
                unsigned long ptel;
        } ptes[4];
+       unsigned long mask;
 
+       mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY | HPTE_V_VALID;
+       want_v |= HPTE_V_VALID;
        for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
 
                lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
@@ -331,8 +334,7 @@ static long __pSeries_lpar_hpte_find(unsigned long want_v, 
unsigned long hpte_gr
                        continue;
 
                for (j = 0; j < 4; j++) {
-                       if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
-                           (ptes[j].pteh & HPTE_V_VALID))
+                       if ((ptes[j].pteh & mask) == want_v)
                                return i + j;
                }
        }
@@ -350,11 +352,17 @@ static long pSeries_lpar_hpte_find(unsigned long vpn, int 
psize, int ssize)
        hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
        want_v = hpte_encode_avpn(vpn, psize, ssize);
 
-       /* Bolted entries are always in the primary group */
        hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
        slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
-       if (slot < 0)
-               return -1;
+       if (slot < 0) {
+               /* try the secondary group */
+               hash = ~hash;
+               want_v |= HPTE_V_SECONDARY;
+               hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
+               if (slot < 0)
+                       return -1;
+       }
        return hpte_group + slot;
 }
 
@@ -451,37 +459,46 @@ static void pSeries_lpar_hugepage_invalidate(unsigned 
long vsid,
                                             unsigned char *hpte_slot_array,
                                             int psize, int ssize, int local)
 {
-       int i, index = 0;
+       long i, index = 0;
        unsigned long s_addr = addr;
        unsigned int max_hpte_count, valid;
        unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
        unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
        unsigned long shift, hidx, vpn = 0, hash, slot;
+       unsigned long count = 0;
 
        shift = mmu_psize_defs[psize].shift;
        max_hpte_count = 1U << (PMD_SHIFT - shift);
 
        for (i = 0; i < max_hpte_count; i++) {
-               valid = hpte_valid(hpte_slot_array, i);
-               if (!valid)
-                       continue;
-               hidx =  hpte_hash_index(hpte_slot_array, i);
-
                /* get the vpn */
-               addr = s_addr + (i * (1ul << shift));
+               addr = s_addr + (i << shift);
                vpn = hpt_vpn(addr, vsid, ssize);
-               hash = hpt_hash(vpn, shift, ssize);
-               if (hidx & _PTEIDX_SECONDARY)
-                       hash = ~hash;
+               if (hpte_slot_array) {
+                       valid = hpte_valid(hpte_slot_array, i);
+                       if (!valid)
+                               continue;
+                       hidx =  hpte_hash_index(hpte_slot_array, i);
 
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += hidx & _PTEIDX_GROUP_IX;
+                       hash = hpt_hash(vpn, shift, ssize);
+                       if (hidx & _PTEIDX_SECONDARY)
+                               hash = ~hash;
+
+                       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+                       slot += hidx & _PTEIDX_GROUP_IX;
+
+               } else {
+                       slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
+                       if (slot == (unsigned long)-1)
+                               continue;
+                       ++count;
+               }
 
                slot_array[index] = slot;
                vpn_array[index] = vpn;
                if (index == PPC64_HUGE_HPTE_BATCH - 1) {
                        /*
-                        * Now do a bluk invalidate
+                        * Now do a bulk invalidate
                         */
                        __pSeries_lpar_hugepage_invalidate(slot_array,
                                                           vpn_array,
@@ -494,6 +511,8 @@ static void pSeries_lpar_hugepage_invalidate(unsigned long 
vsid,
        if (index)
                __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
                                                   index, psize, ssize);
+       if (!hpte_slot_array)
+               pr_info("pSeries_lpar_hugepage_invalidate: no slot array, found 
%ld HPTEs\n", count);
 }
 #else
 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to