Author: kib
Date: Tue Oct  4 17:01:24 2016
New Revision: 306680
URL: https://svnweb.freebsd.org/changeset/base/306680

Log:
  Re-apply r306516 (by cem):
  
  Reduce the cost of TLB invalidation on x86 by using per-CPU completion flags
  
  Reduce contention during TLB invalidation operations by using a per-CPU
  completion flag, rather than a single atomically-updated variable.
  
  On a Westmere system (2 sockets x 4 cores x 1 threads), dtrace measurements
  show that smp_tlb_shootdown is about 50% faster with this patch; observations
  with VTune show that the percentage of time spent in invlrng_single_page on an
  interrupt (actually doing invalidation, rather than synchronization) increases
  from 31% with the old mechanism to 71% with the new one.  (Running a basic
  file server workload.)
  
  Submitted by: Anton Rang <rang at acm.org>
  Reviewed by:  cem (earlier version)
  Sponsored by: Dell EMC Isilon
  Differential Revision:        https://reviews.freebsd.org/D8041
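
The pattern the patch introduces can be modelled outside the kernel.  Below is
a minimal, standalone user-space sketch (not part of the commit; it assumes a
POSIX-threads harness, and NCPU, shootdown() and cpu_thread() are illustrative
names, while smp_tlb_generation and pc_smp_tlb_done mirror the variables in the
diff): the initiator bumps a global generation number, each responder publishes
that generation in its own per-CPU slot after doing its local work, and the
initiator spins on each per-CPU slot instead of on a single counter that every
responder must atomically increment.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NCPU 4

static _Atomic uint32_t smp_tlb_generation;     /* bumped by the initiator */
static _Atomic uint32_t pc_smp_tlb_done[NCPU];  /* per-"CPU" completion flag */
static _Atomic int stop;

/*
 * Stand-in for the IPI handler: do the local work, then publish the
 * generation that was current when the work was requested.
 */
static void *
cpu_thread(void *arg)
{
        int cpu = (int)(intptr_t)arg;
        uint32_t gen, seen = 0;

        while (!atomic_load(&stop)) {
                gen = atomic_load(&smp_tlb_generation);
                if (gen != seen) {
                        /* The real handler invalidates the TLB here. */
                        seen = gen;
                        atomic_store(&pc_smp_tlb_done[cpu], gen);
                }
        }
        return (NULL);
}

/*
 * Stand-in for smp_targeted_tlb_shootdown(): advance the generation and
 * wait for every target to acknowledge it in its own flag.
 */
static void
shootdown(void)
{
        uint32_t gen = atomic_fetch_add(&smp_tlb_generation, 1) + 1;
        int cpu;

        for (cpu = 0; cpu < NCPU; cpu++)
                while (atomic_load(&pc_smp_tlb_done[cpu]) != gen)
                        ;       /* the kernel spins with ia32_pause() here */
}

int
main(void)
{
        pthread_t tid[NCPU];
        int i;

        for (i = 0; i < NCPU; i++)
                pthread_create(&tid[i], NULL, cpu_thread,
                    (void *)(intptr_t)i);
        for (i = 0; i < 10; i++)
                shootdown();
        printf("10 shootdowns done, generation=%u\n",
            (unsigned)atomic_load(&smp_tlb_generation));
        atomic_store(&stop, 1);
        for (i = 0; i < NCPU; i++)
                pthread_join(tid[i], NULL);
        return (0);
}

Build with "cc -std=c11 -pthread".  The point of the design, visible in the
sketch as in the diff below, is that responders only write their own cache
line, so the completion phase no longer serializes on one contended word.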

Modified:
  head/sys/amd64/amd64/mp_machdep.c
  head/sys/amd64/include/pcpu.h
  head/sys/i386/include/pcpu.h
  head/sys/x86/include/x86_smp.h
  head/sys/x86/x86/mp_x86.c

Modified: head/sys/amd64/amd64/mp_machdep.c
==============================================================================
--- head/sys/amd64/amd64/mp_machdep.c   Tue Oct  4 16:44:40 2016        (r306679)
+++ head/sys/amd64/amd64/mp_machdep.c   Tue Oct  4 17:01:24 2016        (r306680)
@@ -409,6 +409,7 @@ void
 invltlb_invpcid_handler(void)
 {
        struct invpcid_descr d;
+       uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
        xhits_gbl[PCPU_GET(cpuid)]++;
@@ -417,17 +418,20 @@ invltlb_invpcid_handler(void)
        (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
+       generation = smp_tlb_generation;
        d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
        d.pad = 0;
        d.addr = 0;
        invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
            INVPCID_CTX);
-       atomic_add_int(&smp_tlb_wait, 1);
+       PCPU_SET(smp_tlb_done, generation);
 }
 
 void
 invltlb_pcid_handler(void)
 {
+       uint32_t generation;
+
 #ifdef COUNT_XINVLTLB_HITS
        xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
@@ -435,6 +439,7 @@ invltlb_pcid_handler(void)
        (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
+       generation = smp_tlb_generation;        /* Overlap with serialization */
        if (smp_tlb_pmap == kernel_pmap) {
                invltlb_glob();
        } else {
@@ -450,5 +455,5 @@ invltlb_pcid_handler(void)
                            smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
                }
        }
-       atomic_add_int(&smp_tlb_wait, 1);
+       PCPU_SET(smp_tlb_done, generation);
 }

Modified: head/sys/amd64/include/pcpu.h
==============================================================================
--- head/sys/amd64/include/pcpu.h       Tue Oct  4 16:44:40 2016        (r306679)
+++ head/sys/amd64/include/pcpu.h       Tue Oct  4 17:01:24 2016        (r306680)
@@ -65,7 +65,8 @@
        u_int   pc_vcpu_id;             /* Xen vCPU ID */               \
        uint32_t pc_pcid_next;                                          \
        uint32_t pc_pcid_gen;                                           \
-       char    __pad[149]              /* be divisor of PAGE_SIZE      \
+       uint32_t pc_smp_tlb_done;       /* TLB op acknowledgement */    \
+       char    __pad[145]              /* be divisor of PAGE_SIZE      \
                                           after cache alignment */
 
 #define        PC_DBREG_CMD_NONE       0

Modified: head/sys/i386/include/pcpu.h
==============================================================================
--- head/sys/i386/include/pcpu.h        Tue Oct  4 16:44:40 2016        (r306679)
+++ head/sys/i386/include/pcpu.h        Tue Oct  4 17:01:24 2016        (r306680)
@@ -59,7 +59,8 @@
        u_int   pc_cmci_mask;           /* MCx banks for CMCI */        \
        u_int   pc_vcpu_id;             /* Xen vCPU ID */               \
        vm_offset_t pc_qmap_addr;       /* KVA for temporary mappings */\
-       char    __pad[229]
+       uint32_t pc_smp_tlb_done;       /* TLB op acknowledgement */    \
+       char    __pad[225]
 
 #ifdef _KERNEL
 

Modified: head/sys/x86/include/x86_smp.h
==============================================================================
--- head/sys/x86/include/x86_smp.h      Tue Oct  4 16:44:40 2016        (r306679)
+++ head/sys/x86/include/x86_smp.h      Tue Oct  4 17:01:24 2016        (r306680)
@@ -35,7 +35,7 @@ extern volatile int aps_ready;
 extern struct mtx ap_boot_mtx;
 extern int cpu_logical;
 extern int cpu_cores;
-extern volatile int smp_tlb_wait;
+extern volatile uint32_t smp_tlb_generation;
 extern struct pmap *smp_tlb_pmap;
 extern u_int xhits_gbl[];
 extern u_int xhits_pg[];

Modified: head/sys/x86/x86/mp_x86.c
==============================================================================
--- head/sys/x86/x86/mp_x86.c   Tue Oct  4 16:44:40 2016        (r306679)
+++ head/sys/x86/x86/mp_x86.c   Tue Oct  4 17:01:24 2016        (r306680)
@@ -1304,12 +1304,22 @@ cpususpend_handler(void)
 void
 invlcache_handler(void)
 {
+       uint32_t generation;
+
 #ifdef COUNT_IPIS
        (*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
+       /*
+        * Reading the generation here allows greater parallelism
+        * since wbinvd is a serializing instruction.  Without the
+        * temporary, we'd wait for wbinvd to complete, then the read
+        * would execute, then the dependent write, which must then
+        * complete before return from interrupt.
+        */
+       generation = smp_tlb_generation;
        wbinvd();
-       atomic_add_int(&smp_tlb_wait, 1);
+       PCPU_SET(smp_tlb_done, generation);
 }
 
 /*
@@ -1367,7 +1377,7 @@ SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_
 /* Variables needed for SMP tlb shootdown. */
 static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 pmap_t smp_tlb_pmap;
-volatile int smp_tlb_wait;
+volatile uint32_t smp_tlb_generation;
 
 #ifdef __amd64__
 #define        read_eflags() read_rflags()
@@ -1377,15 +1387,16 @@ static void
 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
     vm_offset_t addr1, vm_offset_t addr2)
 {
-       int cpu, ncpu, othercpus;
-
-       othercpus = mp_ncpus - 1;       /* does not shootdown self */
+       cpuset_t other_cpus;
+       volatile uint32_t *p_cpudone;
+       uint32_t generation;
+       int cpu;
 
        /*
         * Check for other cpus.  Return if none.
         */
        if (CPU_ISFULLSET(&mask)) {
-               if (othercpus < 1)
+               if (mp_ncpus <= 1)
                        return;
        } else {
                CPU_CLR(PCPU_GET(cpuid), &mask);
@@ -1399,23 +1410,28 @@ smp_targeted_tlb_shootdown(cpuset_t mask
        smp_tlb_addr1 = addr1;
        smp_tlb_addr2 = addr2;
        smp_tlb_pmap = pmap;
-       smp_tlb_wait =  0;
+       generation = ++smp_tlb_generation;
        if (CPU_ISFULLSET(&mask)) {
-               ncpu = othercpus;
                ipi_all_but_self(vector);
+               other_cpus = all_cpus;
+               CPU_CLR(PCPU_GET(cpuid), &other_cpus);
        } else {
-               ncpu = 0;
+               other_cpus = mask;
                while ((cpu = CPU_FFS(&mask)) != 0) {
                        cpu--;
                        CPU_CLR(cpu, &mask);
                        CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
                            cpu, vector);
                        ipi_send_cpu(cpu, vector);
-                       ncpu++;
                }
        }
-       while (smp_tlb_wait < ncpu)
-               ia32_pause();
+       while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+               cpu--;
+               CPU_CLR(cpu, &other_cpus);
+               p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
+               while (*p_cpudone != generation)
+                       ia32_pause();
+       }
        mtx_unlock_spin(&smp_ipi_mtx);
 }
 
@@ -1473,6 +1489,8 @@ smp_cache_flush(void)
 void
 invltlb_handler(void)
 {
+       uint32_t generation;
+
 #ifdef COUNT_XINVLTLB_HITS
        xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
@@ -1480,16 +1498,23 @@ invltlb_handler(void)
        (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
+       /*
+        * Reading the generation here allows greater parallelism
+        * since invalidating the TLB is a serializing operation.
+        */
+       generation = smp_tlb_generation;
        if (smp_tlb_pmap == kernel_pmap)
                invltlb_glob();
        else
                invltlb();
-       atomic_add_int(&smp_tlb_wait, 1);
+       PCPU_SET(smp_tlb_done, generation);
 }
 
 void
 invlpg_handler(void)
 {
+       uint32_t generation;
+
 #ifdef COUNT_XINVLTLB_HITS
        xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
@@ -1497,14 +1522,16 @@ invlpg_handler(void)
        (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
+       generation = smp_tlb_generation;        /* Overlap with serialization */
        invlpg(smp_tlb_addr1);
-       atomic_add_int(&smp_tlb_wait, 1);
+       PCPU_SET(smp_tlb_done, generation);
 }
 
 void
 invlrng_handler(void)
 {
-       vm_offset_t addr;
+       vm_offset_t addr, addr2;
+       uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
        xhits_rng[PCPU_GET(cpuid)]++;
@@ -1514,10 +1541,12 @@ invlrng_handler(void)
 #endif /* COUNT_IPIS */
 
        addr = smp_tlb_addr1;
+       addr2 = smp_tlb_addr2;
+       generation = smp_tlb_generation;        /* Overlap with serialization */
        do {
                invlpg(addr);
                addr += PAGE_SIZE;
-       } while (addr < smp_tlb_addr2);
+       } while (addr < addr2);
 
-       atomic_add_int(&smp_tlb_wait, 1);
+       PCPU_SET(smp_tlb_done, generation);
 }