This patch adds support for TARGET_RTX_COSTS to the nvptx backend. Currently, nvptx uses GCC's default instruction timing estimates, but this patch provides (slightly) more accurate timings. The most significant difference is that integer division is much slower (relatively) than other instructions, so the compiler should be making more use of the middle-end's expand_divmod, which can replace a division by a constant with a cheaper multiply-high and shift sequence.
For an example of the benefit consider:

int foo(unsigned int x) { return x/10; }

currently with -O2 we generate:

.visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0)
{
        .reg.u32 %value;
        .reg.u32 %ar0;
        ld.param.u32 %ar0, [%in_ar0];
        .reg.u32 %r24;
        mov.u32 %r24, %ar0;
        div.u32 %value, %r24, 10;
        st.param.u32 [%value_out], %value;
        ret;
}

but with this patch, we now generate:

.visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0)
{
        .reg.u32 %value;
        .reg.u32 %ar0;
        ld.param.u32 %ar0, [%in_ar0];
        .reg.u32 %r24;
        .reg.u32 %r26;
        mov.u32 %r24, %ar0;
        mul.hi.u32 %r26, %r24, -858993459;
        shr.u32 %value, %r26, 3;
        st.param.u32 [%value_out], %value;
        ret;
}

The performance benefits can be seen/measured by the attached microbenchmark, bench.c, when run with nvptx-none-run-single.

Before:
result = 266546680000
19004366269 ticks
15.203493 seconds

After:
result = 266546680000
5153988012 ticks
4.123190 seconds

So about a 3.7x performance improvement.

This patch has been tested with make and make -k check for nvptx-none hosted on x86_64-pc-linux-gnu with no new failures. Ok for mainline?

2024-07-11  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
        * config/nvptx/nvptx.cc (nvptx_rtx_size_costs): New function to
        estimate the size of an RTX expression (in ptxas instructions).
        (nvptx_rtx_costs): Implementation of rtx_costs target hook.
        (TARGET_RTX_COSTS): Define to nvptx_rtx_costs.

gcc/testsuite/ChangeLog
        * gcc.target/nvptx/div10.c: New test case.

Thanks in advance,
Roger
--
diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
index 2a8f713..5ae2a76 100644
--- a/gcc/config/nvptx/nvptx.cc
+++ b/gcc/config/nvptx/nvptx.cc
@@ -7511,6 +7511,132 @@ nvptx_goacc_expand_var_decl (tree var)
   return NULL_RTX;
 }
 
+/* Helper function of nvptx_rtx_costs for the !speed_p (size) case.
+   Assume each ptxas instruction has the same size.  Always returns false.  */
+
+static bool
+nvptx_rtx_size_costs (rtx x, machine_mode mode, int outer_code, int *total)
+{
+  int code = GET_CODE (x);
+
+  switch (code)
+    {
+    case PLUS:
+    case MINUS:
+    case MULT:
+    case DIV:
+    case MOD:
+    case FMA:
+      if (mode == SImode || mode == DImode
+          || mode == HImode || mode == QImode
+          || mode == SFmode || mode == DFmode)
+        *total = COSTS_N_INSNS (1);
+      break;
+    case ASHIFT:
+    case ASHIFTRT:
+    case LSHIFTRT:
+    case ZERO_EXTEND:
+    case UDIV:
+    case UMOD:
+    case ABS:
+    case POPCOUNT:
+      if (mode == SImode || mode == DImode
+          || mode == HImode || mode == QImode)
+        *total = COSTS_N_INSNS (1);
+      break;
+    case SUBREG:
+    case TRUNCATE:
+      *total = COSTS_N_INSNS (1);
+      break;
+    case REG:
+    case CONST_INT:
+    case CONST_DOUBLE:
+      if (outer_code == SET)  /* Only when directly inside a SET.  */
+        *total = COSTS_N_INSNS (1);
+      break;
+    }
+  return false;
+}
+
+/* Implement TARGET_RTX_COSTS.  Size costs are delegated when !speed_p.  */
+
+static bool
+nvptx_rtx_costs (rtx x, machine_mode mode, int outer_code,
+                 int opno ATTRIBUTE_UNUSED, int *total, bool speed_p)
+{
+  if (! speed_p)
+    return nvptx_rtx_size_costs (x, mode, outer_code, total);
+
+  int code = GET_CODE (x);
+
+  switch (code)
+    {
+    case PLUS:
+    case MINUS:
+    case NEG:
+    case ASHIFT:
+    case ASHIFTRT:
+    case LSHIFTRT:
+    case ZERO_EXTEND:
+      if (mode == SImode || mode == DImode
+          || mode == HImode || mode == QImode
+          || mode == SFmode || mode == DFmode)
+        *total = COSTS_N_INSNS (1);
+      break;
+    case MULT:
+    case FMA:
+    case UMUL_HIGHPART:
+      if (mode == SImode || mode == DImode
+          || mode == HImode || mode == QImode
+          || mode == SFmode || mode == DFmode)
+        *total = COSTS_N_INSNS (2);  /* Twice the cost of PLUS.  */
+      break;
+    case DIV:
+    case MOD:
+      if (mode == SImode || mode == DImode
+          || mode == HImode || mode == QImode)
+        *total = COSTS_N_INSNS (25);  /* Integer division is slow.  */
+      else if (mode == SFmode)
+        *total = COSTS_N_INSNS (64);
+      else if (mode == DFmode)
+        *total = COSTS_N_INSNS (90);
+      break;
+    case UDIV:
+    case UMOD:
+      if (mode == SImode || mode == DImode
+          || mode == HImode || mode == QImode)
+        *total = COSTS_N_INSNS (24);  /* Compare with 2 for MULT.  */
+      break;
+    case ABS:
+      if (mode == SImode || mode == DImode
+          || mode == HImode || mode == QImode)
+        *total = COSTS_N_INSNS (2);
+      break;
+    case POPCOUNT:
+      if (mode == SImode || mode == DImode
+          || mode == HImode || mode == QImode)
+        *total = COSTS_N_INSNS (4);
+      break;
+    case SUBREG:
+    case TRUNCATE:
+      *total = COSTS_N_INSNS (1);
+      break;
+    case REG:
+    case CONST_INT:
+    case CONST_DOUBLE:
+      *total = (outer_code == SET) ? COSTS_N_INSNS (1) : 0;
+      return true;
+    case UNSPEC:
+      if (XINT (x, 1) == UNSPEC_ARG_REG)  /* Costed like REG.  */
+        {
+          *total = (outer_code == SET) ? COSTS_N_INSNS (1) : 0;
+          return true;
+        }
+      break;
+    }
+  return false;
+}
+
 static GTY(()) tree nvptx_previous_fndecl;
 
 static void
@@ -7786,6 +7912,9 @@ nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
 #undef TARGET_GOACC_EXPAND_VAR_DECL
 #define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl
 
+#undef TARGET_RTX_COSTS
+#define TARGET_RTX_COSTS nvptx_rtx_costs
+
 #undef TARGET_SET_CURRENT_FUNCTION
 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
 
diff --git a/gcc/testsuite/gcc.target/nvptx/div10.c b/gcc/testsuite/gcc.target/nvptx/div10.c
new file mode 100644
index 0000000..fce61b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/div10.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+int foo(unsigned int x)
+{
+  return x/10;
+}
+
+/* { dg-final { scan-assembler "mul.hi.u32" } } */
+/* { dg-final { scan-assembler "shr.u32" } } */
+/* { dg-final { scan-assembler-not "div.u32" } } */
/* bench.c: microbenchmark for unsigned division by a constant on nvptx.
   Times one call to bench() with the %clock64 counter (read via inline
   PTX) and prints the result, the elapsed ticks and the elapsed seconds.  */

#include <stdio.h>

/* Return the sum of j/5 over all pairs 0 <= j < i < 20000.
   The inner loop exercises unsigned division by the constant 5.  */
unsigned long bench(void)
{
  unsigned long total = 0;
  for (unsigned int i = 0; i < 20000; i++)
    for (unsigned int j = 0; j < i; j++)
      total += j / 5;
  return total;
}

/* Clock rate, in Hz, used to convert ticks to seconds.  */
// Nvidia Quadro P400
// #define NVPTX_HZ 1170e6
#define NVPTX_HZ 1250e6

/* Return the current value of %clock64.
   Declared static: a plain C99/C11 `inline' definition provides no
   external definition, so the call would be unresolved at link time
   whenever the compiler chooses not to inline it (e.g. at -O0).
   __asm__ is used instead of asm so the file also compiles in strict
   ISO modes such as -std=c11.  */
static inline unsigned long ticks(void)
{
  unsigned long now;
  __asm__ volatile("mov.u64 %0, %%clock64;" : "=r"(now));
  return now;
}

/* Time a single run of bench() and report the result and elapsed time.  */
int main(void)
{
  unsigned long beg = ticks();
  unsigned long result = bench();
  unsigned long end = ticks();
  unsigned long delta = end - beg;

  printf("result = %lu\n", result);
  printf("%lu ticks\n", delta);
  /* NVPTX_HZ is a double constant, so this division is done in double.  */
  printf("%f seconds\n", delta / NVPTX_HZ);
  return 0;
}