Hi, This patch enables -fprefetch-loop-arrays for -mcpu=thunderxt88 and -mcpu=thunderxt88p1. I filled out the tuning structures for both thunderx and thunderx2t99. No other core current enables software prefetching so I set them to 0 which does not change the default parameters.
OK? Bootstrapped and tested on both ThunderX2 CN99xx and ThunderX CN88xx with no regressions. I got a 2x improvement for 462.libquantum on CN88xx, overall a 10% improvement on SPEC INT on CN88xx at -Ofast. CN99xx's SPEC did not change. Thanks, Andrew Pinski ChangeLog: * config/aarch64/aarch64-protos.h (struct tune_params): Add prefetch_latency, simultaneous_prefetches, l1_cache_size, and l2_cache_size fields. (enum aarch64_autoprefetch_model): Add AUTOPREFETCHER_SW. * config/aarch64/aarch64.c (generic_tunings): Update to include prefetch_latency, simultaneous_prefetches, l1_cache_size, and l2_cache_size fields to 0. (cortexa35_tunings): Likewise. (cortexa53_tunings): Likewise. (cortexa57_tunings): Likewise. (cortexa72_tunings): Likewise. (cortexa73_tunings): Likewise. (exynosm1_tunings): Likewise. (thunderx_tunings): Fill out some of the new fields. (thunderxt88_tunings): New variable. (xgene1_tunings): Update to include prefetch_latency, simultaneous_prefetches, l1_cache_size, and l2_cache_size fields to 0. (qdf24xx_tunings): Likewise. (thunderx2t99_tunings): Fill out some of the new fields. (aarch64_override_options_internal): Consider AUTOPREFETCHER_SW like AUTOPREFETCHER_OFF. Set param values if the fields are non-zero. Turn on prefetch-loop-arrays if AUTOPREFETCHER_SW and optimize level is at least 3 or profile feed usage is enabled. * config/aarch64/aarch64-cores.def (thunderxt88p1): Use thunderxt88 tuning. (thunderxt88): Likewise.
Index: config/aarch64/aarch64-cores.def =================================================================== --- config/aarch64/aarch64-cores.def (revision 244917) +++ config/aarch64/aarch64-cores.def (working copy) @@ -63,8 +63,8 @@ AARCH64_CORE("qdf24xx", qdf24xx, c AARCH64_CORE("thunderx", thunderx, thunderx, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderx, 0x43, 0x0a0, -1) /* Do not swap around "thunderxt88p1" and "thunderxt88", this order is required to handle variant correctly. */ -AARCH64_CORE("thunderxt88p1", thunderxt88p1, thunderx, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx, 0x43, 0x0a1, 0) -AARCH64_CORE("thunderxt88", thunderxt88, thunderx, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderx, 0x43, 0x0a1, -1) +AARCH64_CORE("thunderxt88p1", thunderxt88p1, thunderx, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderxt88, 0x43, 0x0a1, 0) +AARCH64_CORE("thunderxt88", thunderxt88, thunderx, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderxt88, 0x43, 0x0a1, -1) AARCH64_CORE("thunderxt81", thunderxt81, thunderx, 8_1A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderx, 0x43, 0x0a2, -1) AARCH64_CORE("thunderxt83", thunderxt83, thunderx, 8_1A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderx, 0x43, 0x0a3, -1) Index: config/aarch64/aarch64-protos.h =================================================================== --- config/aarch64/aarch64-protos.h (revision 244917) +++ config/aarch64/aarch64-protos.h (working copy) @@ -220,10 +220,19 @@ struct tune_params unsigned int max_case_values; /* Value for PARAM_L1_CACHE_LINE_SIZE; or 0 to use the default. */ unsigned int cache_line_size; + /* Value for PARAM_PREFETCH_LATENCY; or 0 to use the default. */ + unsigned int prefetch_latency; + /* Value for PARAM_SIMULTANEOUS_PREFETCHES; or 0 to use the default. */ + unsigned int simultaneous_prefetches; + /* Value for PARAM_L1_CACHE_SIZE; or 0 to use the default. */ + unsigned int l1_cache_size; + /* Value for PARAM_L2_CACHE_SIZE; or 0 to use the default. */ + unsigned int l2_cache_size; /* An enum specifying how to take into account CPU autoprefetch capabilities during instruction scheduling: - AUTOPREFETCHER_OFF: Do not take autoprefetch capabilities into account. + - AUTOPREFETCHER_SW: Turn on software based prefetching. - AUTOPREFETCHER_WEAK: Attempt to sort sequences of loads/store in order of offsets but allow the pipeline hazard recognizer to alter that order to maximize multi-issue opportunities. @@ -233,6 +242,7 @@ struct tune_params enum aarch64_autoprefetch_model { AUTOPREFETCHER_OFF, + AUTOPREFETCHER_SW, AUTOPREFETCHER_WEAK, AUTOPREFETCHER_STRONG } autoprefetcher_model; Index: config/aarch64/aarch64.c =================================================================== --- config/aarch64/aarch64.c (revision 244917) +++ config/aarch64/aarch64.c (working copy) @@ -535,6 +535,10 @@ static const struct tune_params generic_ 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 0, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -561,6 +565,10 @@ static const struct tune_params cortexa3 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 0, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -587,6 +595,10 @@ static const struct tune_params cortexa5 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 0, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -613,6 +625,10 @@ static const struct tune_params cortexa5 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 0, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */ }; @@ -639,6 +655,10 @@ static const struct tune_params cortexa7 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 0, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -665,6 +685,10 @@ static const struct tune_params cortexa7 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 0, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -690,6 +714,10 @@ static const struct tune_params exynosm1 2, /* min_div_recip_mul_df. */ 48, /* max_case_values. */ 64, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -714,11 +742,45 @@ static const struct tune_params thunderx 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ - 0, /* cache_line_size. */ + 128, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 8, /* simultaneous_prefetches. */ + 32, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */ }; +/* Tunings for ThunderX CN88xx */ +static const struct tune_params thunderxt88_tunings = +{ + &thunderx_extra_costs, + &generic_addrcost_table, + &thunderx_regmove_cost, + &thunderx_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + 6, /* memmov_cost */ + 2, /* issue_rate */ + AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */ + 8, /* function_align. */ + 8, /* jump_align. */ + 8, /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 1, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + 128, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 8, /* simultaneous_prefetches. */ + 32, /* l1_cache_size. */ + 16*1024, /* l2_cache_size. */ + tune_params::AUTOPREFETCHER_SW, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */ +}; + static const struct tune_params xgene1_tunings = { &xgene1_extra_costs, @@ -740,6 +802,10 @@ static const struct tune_params xgene1_t 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 0, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -766,6 +832,10 @@ static const struct tune_params qdf24xx_ 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 64, /* cache_line_size. */ + 0, /* prefetch_latency. */ + 0, /* simultaneous_prefetches. */ + 0, /* l1_cache_size. */ + 0, /* l2_cache_size. */ tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -791,7 +861,11 @@ static const struct tune_params thunderx 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ 64, /* cache_line_size. */ - tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ + 0, /* prefetch_latency. */ + 8, /* simultaneous_prefetches. */ + 32, /* l1_cache_size. */ + 256, /* l2_cache_size. */ + tune_params::AUTOPREFETCHER_NONE, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ }; @@ -8646,6 +8720,7 @@ aarch64_override_options_internal (struc switch (aarch64_tune_params.autoprefetcher_model) { case tune_params::AUTOPREFETCHER_OFF: + case tune_params::AUTOPREFETCHER_SW: queue_depth = -1; break; case tune_params::AUTOPREFETCHER_WEAK: @@ -8672,6 +8747,42 @@ aarch64_override_options_internal (struc opts->x_param_values, global_options_set.x_param_values); + /* Set the prefetch latncy. */ + if (selected_cpu->tune->prefetch_latency != 0) + maybe_set_param_value (PARAM_PREFETCH_LATENCY, + selected_cpu->tune->prefetch_latency, + opts->x_param_values, + global_options_set.x_param_values); + + /* Set the simultaneous prefetches. */ + if (selected_cpu->tune->simultaneous_prefetches != 0) + maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, + selected_cpu->tune->simultaneous_prefetches, + opts->x_param_values, + global_options_set.x_param_values); + + /* Set the l1 cache size. */ + if (selected_cpu->tune->l1_cache_size != 0) + maybe_set_param_value (PARAM_L1_CACHE_SIZE, + selected_cpu->tune->l1_cache_size, + opts->x_param_values, + global_options_set.x_param_values); + + /* Set the l2 cache size. */ + if (selected_cpu->tune->l2_cache_size != 0) + maybe_set_param_value (PARAM_L2_CACHE_SIZE, + selected_cpu->tune->l2_cache_size, + opts->x_param_values, + global_options_set.x_param_values); + + + /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ + if (opts->x_flag_prefetch_loop_arrays < 0 + && (opts->x_optimize >= 3 || opts->x_flag_profile_use) + && !opts->x_optimize_size + && aarch64_tune_params.autoprefetcher_model == tune_params::AUTOPREFETCHER_SW) + opts->x_flag_prefetch_loop_arrays = 1; + aarch64_override_options_after_change_1 (opts); }