Tamar Christina <tamar.christ...@arm.com> writes:
> Hi,
>
> Following the discussion below here's a revised patch.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
Looks good functionally; I just have some comments about the implementation.

> @@ -14006,8 +14007,52 @@ cost_plus:
>  							  mode, MULT, 1, speed);
>            return true;
>          }
> +      break;
> +    case CONST_VECTOR:
> +	{
> +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> +	  /* Not a valid const vector.  */
> +	  if (!gen_insn)
> +	    break;
>
> -      /* Fall through.  */
> +	  switch (GET_CODE (gen_insn))
> +	  {
> +	  case CONST_VECTOR:
> +	    /* Load using MOVI/MVNI.  */
> +	    if (aarch64_simd_valid_immediate (x, NULL))
> +	      *cost += extra_cost->vect.movi;
> +	    else /* Load using constant pool.  */
> +	      *cost += extra_cost->ldst.load;
> +	    break;
> +	  /* Load using a DUP.  */
> +	  case VEC_DUPLICATE:
> +	    gcc_unreachable ();
> +	    break;
> +	  default:
> +	    *cost += extra_cost->ldst.load;
> +	    break;
> +	  }
> +	  return true;
> +	}

This might be a problem (if it is a problem) with some of the existing
cases too, but: is using += rather than = the right behaviour here?
It means that we add our cost on top of whatever the target-independent
rtx_costs thought was a good default choice, whereas it looks like these
table entries specify the correct full cost.  If it's not clear-cut,
then I think using = would be better.

Also, going back to an earlier part of the thread, I think the “inner”
CONST_VECTOR case is now a correct replacement for the “outer”
CONST_VECTOR case, meaning we don't need the aarch64_simd_make_constant
bits.  I.e. I think we can make the top-level case:

    case CONST_VECTOR:
      /* Load using MOVI/MVNI.  */
      if (aarch64_simd_valid_immediate (x, NULL))
	*cost = extra_cost->vect.movi;
      else /* Load using constant pool.  */
	*cost = extra_cost->ldst.load;
      break;

> +    case VEC_CONCAT:
> +	/* depending on the operation, either DUP or INS.
> +	   For now, keep default costing.  */
> +	break;
> +    case VEC_DUPLICATE:
> +	*cost += extra_cost->vect.dup;
> +	return true;

For this I think we should do:

	*cost = extra_cost->vect.dup;
	return false;

so that we cost the operand of the vec_duplicate as well.  This will
have no effect if the operand is a REG, but would affect more complex
expressions.

> +    case VEC_SELECT:
> +	{

Here I think we should recurse on operand 0:

	rtx op0 = XEXP (x, 0);
	*cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);

(A combined sketch of these suggested cases is given at the end of this
message.)

> +	  /* cost subreg of 0 as free, otherwise as DUP */
> +	  rtx op1 = XEXP (x, 1);
> +	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> +	    ;
> +	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> +	    *cost += extra_cost->vect.dup;
> +	  else
> +	    *cost += extra_cost->vect.extract;
> +	  return true;
> +	}
>      default:
>        break;
>      }
> @@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
>
>  /* If VALS is a vector constant that can be loaded into a register
>     using DUP, generate instructions to do so and return an RTX to
> -   assign to the register.  Otherwise return NULL_RTX.  */
> +   assign to the register.  Otherwise return NULL_RTX.
> +
> +   If CHECK then the resulting instruction may not be used in
> +   codegen but can be used for costing.  */
>  static rtx
> -aarch64_simd_dup_constant (rtx vals)
> +aarch64_simd_dup_constant (rtx vals, bool check = false)
>  {
>    machine_mode mode = GET_MODE (vals);
>    machine_mode inner_mode = GET_MODE_INNER (mode);
> @@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals)
>    /* We can load this constant by using DUP and a constant in a
>       single ARM register.  This will be cheaper than a vector
>       load.  */
> -  x = copy_to_mode_reg (inner_mode, x);
> +  if (!check)
> +    x = copy_to_mode_reg (inner_mode, x);
>    return gen_vec_duplicate (mode, x);
>  }
>
> @@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals)
>  /* Generate code to load VALS, which is a PARALLEL containing only
>     constants (for vec_init) or CONST_VECTOR, efficiently into a
>     register.  Returns an RTX to copy into the register, or NULL_RTX
> -   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
> +   for a PARALLEL that cannot be converted into a CONST_VECTOR.
> +
> +   If CHECK then the resulting instruction may not be used in
> +   codegen but can be used for costing.  */
>  static rtx
> -aarch64_simd_make_constant (rtx vals)
> +aarch64_simd_make_constant (rtx vals, bool check = false)
>  {
>    machine_mode mode = GET_MODE (vals);
>    rtx const_dup;
> @@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals)
>        && aarch64_simd_valid_immediate (const_vec, NULL))
>      /* Load using MOVI/MVNI.  */
>      return const_vec;
> -  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
> +  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
>      /* Loaded using DUP.  */
>      return const_dup;
>    else if (const_vec != NULL_RTX)

With the inner CONST_VECTOR case replacing the outer one, I think we can
drop the aarch64_simd_dup_constant and aarch64_simd_make_constant bits.

> diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
> index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
> --- a/gcc/config/arm/aarch-common-protos.h
> +++ b/gcc/config/arm/aarch-common-protos.h
> @@ -133,6 +133,9 @@ struct vector_cost_table
>  {
>    const int alu;
>    const int mult;
> +  const int movi;
> +  const int dup;
> +  const int extract;
>  };
>
>  struct cpu_cost_table
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
> index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (0),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (2),  /* alu.  */
> -    COSTS_N_INSNS (8)   /* mult.  */
> +    COSTS_N_INSNS (8),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */

Could you try this with -mabi=ilp32?  It looks like it might fail.
Skipping it is OK if so.

OK with those changes, if they work.

Thanks,
Richard

> +
> +#include <arm_neon.h>
> +
> +/*
> +**test1:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	str	q[0-9]+, \[x[0-9]+\]
> +**	fmov	x[0-9]+, d[0-9]+
> +**	orr	x[0-9]+, x[0-9]+, x[0-9]+
> +**	ret
> +*/
> +
> +uint64_t
> +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
> +  uint64_t res = a | arr[0];
> +  uint64x2_t val = vld1q_u64 (arr);
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test2:
> +**	adrp	x[0-9]+, .LC[0-1]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	str	q[0-9]+, \[x[0-9]+\]
> +**	fmov	x[0-9]+, d[0-9]+
> +**	orr	x[0-9]+, x[0-9]+, x[0-9]+
> +**	ret
> +*/
> +
> +uint64_t
> +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
> +  uint64_t arr = vgetq_lane_u64 (val, 0);
> +  uint64_t res = a | arr;
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test3:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**	str	q[0-9]+, \[x1\]
> +**	fmov	w[0-9]+, s[0-9]+
> +**	orr	w[0-9]+, w[0-9]+, w[0-9]+
> +**	ret
> +*/
> +
> +uint32_t
> +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
> +{
> +  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
> +  uint32_t res = a | arr[0];
> +  uint32x4_t val = vld1q_u32 (arr);
> +  *rt = vaddq_u32 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test4:
> +**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
> +**	mov	x[0-9]+, 16512
> +**	movk	x[0-9]+, 0x1020, lsl 16
> +**	movk	x[0-9]+, 0x408, lsl 32
> +**	movk	x[0-9]+, 0x102, lsl 48
> +**	fmov	d[0-9]+, x[0-9]+
> +**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
> +**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
> +**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
> +**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**	umov	w[0-9]+, v[0-9]+.h\[3\]
> +**	ret
> +*/
> +
> +uint64_t
> +test4 (uint8x16_t input)
> +{
> +  uint8x16_t bool_input = vshrq_n_u8(input, 7);
> +  poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
> +  poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> +			     vgetq_lane_p64(mask, 0));
> +  poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
> +  uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
> +  return vget_lane_u16((uint16x4_t)res, 3);
> +}
> +
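
For reference, putting the suggestions above together, the reworked
vector cases might look roughly like this.  This is just a sketch: the
enclosing switch in aarch64_rtx_costs and its locals (x, mode, speed,
extra_cost), plus the new vect.movi/dup/extract fields from the patch,
are assumed from context, and the VEC_CONCAT handling is left as in the
posted patch.

    case CONST_VECTOR:
      /* Load using MOVI/MVNI.  */
      if (aarch64_simd_valid_immediate (x, NULL))
	*cost = extra_cost->vect.movi;
      else /* Load using constant pool.  */
	*cost = extra_cost->ldst.load;
      break;
    case VEC_CONCAT:
      /* depending on the operation, either DUP or INS.
	 For now, keep default costing.  */
      break;
    case VEC_DUPLICATE:
      /* Set (rather than add) the cost and return false so that the
	 operand of the vec_duplicate is costed as well.  */
      *cost = extra_cost->vect.dup;
      return false;
    case VEC_SELECT:
      {
	rtx op0 = XEXP (x, 0);
	rtx op1 = XEXP (x, 1);
	/* Recurse on operand 0 so that its cost is included too.  */
	*cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
	/* cost subreg of 0 as free, otherwise as DUP */
	if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
	  ;
	else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
	  *cost += extra_cost->vect.dup;
	else
	  *cost += extra_cost->vect.extract;
	return true;
      }

The differences from the posted patch are: use = rather than += for the
table-driven costs, drop the aarch64_simd_make_constant call in favour
of costing CONST_VECTOR directly, return false for VEC_DUPLICATE so its
operand is costed, and recurse on operand 0 of the VEC_SELECT.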