[PATCH/AARCH64] Enable software prefetching (-fprefetch-loop-arrays) for ThunderX 88xxx

Andrew Pinski Thu, 26 Jan 2017 12:56:54 -0800

Hi,
  This patch enables -fprefetch-loop-arrays for -mcpu=thunderxt88 and
-mcpu=thunderxt88p1.  I filled out the tuning structures for both
thunderx and thunderx2t99.  No other core current enables software
prefetching so I set them to 0 which does not change the default
parameters.


OK?  Bootstrapped and tested on both ThunderX2 CN99xx and ThunderX
CN88xx with no regressions.  I got a 2x improvement for 462.libquantum
on CN88xx, overall a 10% improvement on SPEC INT on CN88xx at -Ofast.
CN99xx's SPEC did not change.

Thanks,
Andrew Pinski

ChangeLog:
* config/aarch64/aarch64-protos.h (struct tune_params): Add
prefetch_latency, simultaneous_prefetches, l1_cache_size, and
l2_cache_size fields.
(enum aarch64_autoprefetch_model): Add AUTOPREFETCHER_SW.
* config/aarch64/aarch64.c (generic_tunings): Update to include
prefetch_latency, simultaneous_prefetches, l1_cache_size, and
l2_cache_size fields to 0.
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(cortexa73_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Fill out some of the new fields.
(thunderxt88_tunings): New variable.
(xgene1_tunings): Update to include prefetch_latency,
simultaneous_prefetches, l1_cache_size, and l2_cache_size fields to 0.
(qdf24xx_tunings): Likewise.
(thunderx2t99_tunings): Fill out some of the new fields.
(aarch64_override_options_internal): Consider AUTOPREFETCHER_SW like
AUTOPREFETCHER_OFF.
Set param values if the fields are non-zero.  Turn on
prefetch-loop-arrays if AUTOPREFETCHER_SW and optimize level is at
least 3 or profile feed usage is enabled.
* config/aarch64/aarch64-cores.def (thunderxt88p1): Use thunderxt88 tuning.
(thunderxt88): Likewise.

Index: config/aarch64/aarch64-cores.def
===================================================================
--- config/aarch64/aarch64-cores.def    (revision 244917)
+++ config/aarch64/aarch64-cores.def    (working copy)
@@ -63,8 +63,8 @@ AARCH64_CORE("qdf24xx",     qdf24xx,   c
 AARCH64_CORE("thunderx",      thunderx,      thunderx,  8A,    
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, 
thunderx,  0x43, 0x0a0, -1)
 /* Do not swap around "thunderxt88p1" and "thunderxt88",
    this order is required to handle variant correctly. */
-AARCH64_CORE("thunderxt88p1", thunderxt88p1, thunderx,  8A,    
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO,                 
thunderx,  0x43, 0x0a1, 0)
-AARCH64_CORE("thunderxt88",   thunderxt88,   thunderx,  8A,    
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, 
thunderx,  0x43, 0x0a1, -1)
+AARCH64_CORE("thunderxt88p1", thunderxt88p1, thunderx,  8A,    
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO,                 
thunderxt88,  0x43, 0x0a1, 0)
+AARCH64_CORE("thunderxt88",   thunderxt88,   thunderx,  8A,    
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, 
thunderxt88,  0x43, 0x0a1, -1)
 AARCH64_CORE("thunderxt81",   thunderxt81,   thunderx,  8_1A,  
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, 
thunderx,  0x43, 0x0a2, -1)
 AARCH64_CORE("thunderxt83",   thunderxt83,   thunderx,  8_1A,  
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, 
thunderx,  0x43, 0x0a3, -1)
 
Index: config/aarch64/aarch64-protos.h
===================================================================
--- config/aarch64/aarch64-protos.h     (revision 244917)
+++ config/aarch64/aarch64-protos.h     (working copy)
@@ -220,10 +220,19 @@ struct tune_params
   unsigned int max_case_values;
   /* Value for PARAM_L1_CACHE_LINE_SIZE; or 0 to use the default.  */
   unsigned int cache_line_size;
+  /* Value for PARAM_PREFETCH_LATENCY; or 0 to use the default.  */
+  unsigned int prefetch_latency;
+  /* Value for PARAM_SIMULTANEOUS_PREFETCHES; or 0 to use the default.  */
+  unsigned int simultaneous_prefetches;
+  /* Value for PARAM_L1_CACHE_SIZE; or 0 to use the default.  */
+  unsigned int l1_cache_size;
+  /* Value for PARAM_L2_CACHE_SIZE; or 0 to use the default.  */
+  unsigned int l2_cache_size;
 
 /* An enum specifying how to take into account CPU autoprefetch capabilities
    during instruction scheduling:
    - AUTOPREFETCHER_OFF: Do not take autoprefetch capabilities into account.
+   - AUTOPREFETCHER_SW: Turn on software based prefetching.
    - AUTOPREFETCHER_WEAK: Attempt to sort sequences of loads/store in order of
    offsets but allow the pipeline hazard recognizer to alter that order to
    maximize multi-issue opportunities.
@@ -233,6 +242,7 @@ struct tune_params
   enum aarch64_autoprefetch_model
   {
     AUTOPREFETCHER_OFF,
+    AUTOPREFETCHER_SW,
     AUTOPREFETCHER_WEAK,
     AUTOPREFETCHER_STRONG
   } autoprefetcher_model;
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c    (revision 244917)
+++ config/aarch64/aarch64.c    (working copy)
@@ -535,6 +535,10 @@ static const struct tune_params generic_
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   0,   /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
 };
@@ -561,6 +565,10 @@ static const struct tune_params cortexa3
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   0,   /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
 };
@@ -587,6 +595,10 @@ static const struct tune_params cortexa5
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   0,   /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
 };
@@ -613,6 +625,10 @@ static const struct tune_params cortexa5
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   0,   /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags.  */
 };
@@ -639,6 +655,10 @@ static const struct tune_params cortexa7
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   0,   /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
 };
@@ -665,6 +685,10 @@ static const struct tune_params cortexa7
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   0,   /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
 };
@@ -690,6 +714,10 @@ static const struct tune_params exynosm1
   2,   /* min_div_recip_mul_df.  */
   48,  /* max_case_values.  */
   64,  /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
 };
@@ -714,11 +742,45 @@ static const struct tune_params thunderx
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
-  0,   /* cache_line_size.  */
+  128, /* cache_line_size.  */
+  0,    /* prefetch_latency. */
+  8,    /* simultaneous_prefetches. */
+  32,   /* l1_cache_size. */
+  0,     /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)     /* tune_flags.  */
 };
 
+/* Tunings for ThunderX CN88xx */
+static const struct tune_params thunderxt88_tunings =
+{
+  &thunderx_extra_costs,
+  &generic_addrcost_table,
+  &thunderx_regmove_cost,
+  &thunderx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  6, /* memmov_cost  */
+  2, /* issue_rate  */
+  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
+  8,    /* function_align.  */
+  8,    /* jump_align.  */
+  8,    /* loop_align.  */
+  2,    /* int_reassoc_width.  */
+  4,    /* fp_reassoc_width.  */
+  1,    /* vec_reassoc_width.  */
+  2,    /* min_div_recip_mul_sf.  */
+  2,    /* min_div_recip_mul_df.  */
+  0,    /* max_case_values.  */
+  128,    /* cache_line_size.  */
+  0,    /* prefetch_latency. */
+  8,    /* simultaneous_prefetches. */
+  32,   /* l1_cache_size. */
+  16*1024,     /* l2_cache_size. */
+  tune_params::AUTOPREFETCHER_SW,      /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)      /* tune_flags.  */
+};
+
 static const struct tune_params xgene1_tunings =
 {
   &xgene1_extra_costs,
@@ -740,6 +802,10 @@ static const struct tune_params xgene1_t
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   0,   /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
 };
@@ -766,6 +832,10 @@ static const struct tune_params qdf24xx_
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   64,  /* cache_line_size.  */
+  0,   /* prefetch_latency. */
+  0,   /* simultaneous_prefetches. */
+  0,   /* l1_cache_size. */
+  0,   /* l2_cache_size. */
   tune_params::AUTOPREFETCHER_STRONG,  /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)            /* tune_flags.  */
 };
@@ -791,7 +861,11 @@ static const struct tune_params thunderx
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   64,  /* cache_line_size.  */
-  tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
+  0,   /* prefetch_latency. */
+  8,   /* simultaneous_prefetches. */
+  32,  /* l1_cache_size. */
+  256, /* l2_cache_size. */
+  tune_params::AUTOPREFETCHER_NONE,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
 };
 
@@ -8646,6 +8720,7 @@ aarch64_override_options_internal (struc
   switch (aarch64_tune_params.autoprefetcher_model)
     {
       case tune_params::AUTOPREFETCHER_OFF:
+      case tune_params::AUTOPREFETCHER_SW:
        queue_depth = -1;
        break;
       case tune_params::AUTOPREFETCHER_WEAK:
@@ -8672,6 +8747,42 @@ aarch64_override_options_internal (struc
                           opts->x_param_values,
                           global_options_set.x_param_values);
 
+  /* Set the prefetch latncy.  */
+  if (selected_cpu->tune->prefetch_latency != 0)
+    maybe_set_param_value (PARAM_PREFETCH_LATENCY,
+                          selected_cpu->tune->prefetch_latency,
+                          opts->x_param_values,
+                          global_options_set.x_param_values);
+
+  /* Set the simultaneous prefetches.  */
+  if (selected_cpu->tune->simultaneous_prefetches != 0)
+    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+                          selected_cpu->tune->simultaneous_prefetches,
+                          opts->x_param_values,
+                          global_options_set.x_param_values);
+
+  /* Set the l1 cache size.  */
+  if (selected_cpu->tune->l1_cache_size != 0)
+    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+                          selected_cpu->tune->l1_cache_size,
+                          opts->x_param_values,
+                          global_options_set.x_param_values);
+
+  /* Set the l2 cache size.  */
+  if (selected_cpu->tune->l2_cache_size != 0)
+    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+                          selected_cpu->tune->l2_cache_size,
+                          opts->x_param_values,
+                          global_options_set.x_param_values);
+
+
+  /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful.  */
+  if (opts->x_flag_prefetch_loop_arrays < 0
+      && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
+      && !opts->x_optimize_size
+      && aarch64_tune_params.autoprefetcher_model == 
tune_params::AUTOPREFETCHER_SW)
+    opts->x_flag_prefetch_loop_arrays = 1;
+
   aarch64_override_options_after_change_1 (opts);
 }

[PATCH/AARCH64] Enable software prefetching (-fprefetch-loop-arrays) for ThunderX 88xxx

Reply via email to