> On 23 Oct 2024, at 16:40, Richard Sandiford <richard.sandif...@arm.com> wrote:
>
> External email: Use caution opening links or attachments
>
>
> Jennifer Schmitz <jschm...@nvidia.com> writes:
>> Because a neg instruction has lower latency and higher throughput than
>> sdiv and mul, svdiv and svmul by -1 can be folded to svneg. For svdiv,
>> this is already implemented on the RTL level; for svmul, the
>> optimization was still missing.
>> This patch implements folding to svneg for both operations using the
>> gimple_folder. For svdiv, the transform is applied if the divisor is -1.
>> Svmul is folded if either of the operands is -1. A case distinction of
>> the predication is made to account for the fact that svneg_m has 3 arguments
>> (argument 0 holds the values for the inactive lanes), while svneg_x and
>> svneg_z have only 2 arguments.
>> Tests were added or adjusted to check the produced assembly and runtime
>> tests were added to check correctness.
>>
>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
>> OK for mainline?
>>
>> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> Sorry for the slow review.
>
>> [...]
>> @@ -2033,12 +2054,37 @@ public:
>> if (integer_zerop (op1) || integer_zerop (op2))
>> return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
>>
>> + /* If one of the operands is all integer -1, fold to svneg. */
>> + tree pg = gimple_call_arg (f.call, 0);
>> + tree negated_op = NULL;
>> + if (integer_minus_onep (op2))
>> + negated_op = op1;
>> + else if (integer_minus_onep (op1))
>> + negated_op = op2;
>> + if (!f.type_suffix (0).unsigned_p && negated_op)
>
> This is definitely ok, but it would be nice to handle the unsigned_p
> case too at some point. This would mean bit-casting to the equivalent
> signed type, doing the negation, and casting back. It would be good
> to have a helper for doing that (maybe with a lambda callback that
> emits the actual call) since I can imagine it will be needed elsewhere
> too.
>
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
>> index 13009d88619..1d605dbdd8d 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
>> @@ -183,14 +183,25 @@ TEST_UNIFORM_Z (mul_3_s32_m_untied, svint32_t,
>>
>> /*
>> ** mul_m1_s32_m:
>> -** mov (z[0-9]+)\.b, #-1
>> -** mul z0\.s, p0/m, z0\.s, \1\.s
>> +** neg z0\.s, p0/m, z0\.s
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s32_m, svint32_t,
>> z0 = svmul_n_s32_m (p0, z0, -1),
>> z0 = svmul_m (p0, z0, -1))
>>
>> +/*
>> +** mul_m1r_s32_m:
>> +** mov (z[0-9]+)\.b, #-1
>> +** mov (z[0-9]+)\.d, z0\.d
>> +** movprfx z0, \1
>> +** neg z0\.s, p0/m, \2\.s
>> +** ret
>> +*/
>> +TEST_UNIFORM_Z (mul_m1r_s32_m, svint32_t,
>> + z0 = svmul_s32_m (p0, svdup_s32 (-1), z0),
>> + z0 = svmul_m (p0, svdup_s32 (-1), z0))
>
> Maybe it would be better to test the untied case instead, by passing
> z1 rather than z0 as the final argument. Hopefully that would leave
> us with just the first and last instructions. (I think the existing
> tests already cover the awkward tied2 case well enough.)
>
> Same for the later similar tests.
>
> OK with that change, thanks.

Thanks for the review. I adjusted the 3 test cases (mul_m1r_s32_m/_x/_z) to
cover the untied version. For mul_m1r_s32_m this indeed results in MOV + NEG.
I also made a note to address the unsigned_p case in a separate patch.
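
For reference, a rough source-level sketch of the reinterpret/negate/reinterpret
idea for the unsigned case (an illustration with ACLE intrinsics and a made-up
function name; the actual fold would build the equivalent gimple rather than
call these intrinsics):

#include <arm_sve.h>

/* Illustration only: negate the active lanes of an unsigned vector by
   bit-casting to the equivalent signed type, negating, and casting back,
   which is what a fold of svmul_n_u32_x (pg, x, -1) would have to do.  */
svuint32_t
neg_u32_via_s32 (svbool_t pg, svuint32_t x)
{
  svint32_t sx = svreinterpret_s32_u32 (x);   /* bit-cast to signed */
  svint32_t nx = svneg_s32_x (pg, sx);        /* negate active lanes */
  return svreinterpret_u32_s32 (nx);          /* bit-cast back to unsigned */
}
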
The patch with the updated test cases was pushed to trunk: fc40202c1ac5d585bb236cdaf3a3968927e970a0

Thanks, Jennifer

>
> Richard
>
>> +
>> /*
>> ** mul_s32_z_tied1:
>> ** movprfx z0\.s, p0/z, z0\.s
>> @@ -597,13 +608,44 @@ TEST_UNIFORM_Z (mul_255_s32_x, svint32_t,
>>
>> /*
>> ** mul_m1_s32_x:
>> -** mul z0\.s, z0\.s, #-1
>> +** neg z0\.s, p0/m, z0\.s
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s32_x, svint32_t,
>> z0 = svmul_n_s32_x (p0, z0, -1),
>> z0 = svmul_x (p0, z0, -1))
>>
>> +/*
>> +** mul_m1r_s32_x:
>> +** neg z0\.s, p0/m, z0\.s
>> +** ret
>> +*/
>> +TEST_UNIFORM_Z (mul_m1r_s32_x, svint32_t,
>> + z0 = svmul_s32_x (p0, svdup_s32 (-1), z0),
>> + z0 = svmul_x (p0, svdup_s32 (-1), z0))
>> +
>> +/*
>> +** mul_m1_s32_z:
>> +** mov (z[0-9]+)\.d, z0\.d
>> +** movprfx z0\.s, p0/z, \1\.s
>> +** neg z0\.s, p0/m, \1\.s
>> +** ret
>> +*/
>> +TEST_UNIFORM_Z (mul_m1_s32_z, svint32_t,
>> + z0 = svmul_n_s32_z (p0, z0, -1),
>> + z0 = svmul_z (p0, z0, -1))
>> +
>> +/*
>> +** mul_m1r_s32_z:
>> +** mov (z[0-9]+)\.d, z0\.d
>> +** movprfx z0\.s, p0/z, \1\.s
>> +** neg z0\.s, p0/m, \1\.s
>> +** ret
>> +*/
>> +TEST_UNIFORM_Z (mul_m1r_s32_z, svint32_t,
>> + z0 = svmul_s32_z (p0, svdup_s32 (-1), z0),
>> + z0 = svmul_z (p0, svdup_s32 (-1), z0))
>> +
>> /*
>> ** mul_m127_s32_x:
>> ** mul z0\.s, z0\.s, #-127
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
>> index 530d9fc84a5..c05d184f2fe 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
>> @@ -192,8 +192,7 @@ TEST_UNIFORM_Z (mul_3_s64_m_untied, svint64_t,
>>
>> /*
>> ** mul_m1_s64_m:
>> -** mov (z[0-9]+)\.b, #-1
>> -** mul z0\.d, p0/m, z0\.d, \1\.d
>> +** neg z0\.d, p0/m, z0\.d
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s64_m, svint64_t,
>> @@ -625,7 +624,7 @@ TEST_UNIFORM_Z (mul_255_s64_x, svint64_t,
>>
>> /*
>> ** mul_m1_s64_x:
>> -** mul z0\.d, z0\.d, #-1
>> +** neg z0\.d, p0/m, z0\.d
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s64_x, svint64_t,
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
>> index 0c90a8bb832..efc952e3a52 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
>> @@ -183,8 +183,7 @@ TEST_UNIFORM_Z (mul_3_s8_m_untied, svint8_t,
>>
>> /*
>> ** mul_m1_s8_m:
>> -** mov (z[0-9]+)\.b, #-1
>> -** mul z0\.b, p0/m, z0\.b, \1\.b
>> +** neg z0\.b, p0/m, z0\.b
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s8_m, svint8_t,
>> @@ -587,7 +586,7 @@ TEST_UNIFORM_Z (mul_128_s8_x, svint8_t,
>>
>> /*
>> ** mul_255_s8_x:
>> -** mul z0\.b, z0\.b, #-1
>> +** neg z0\.b, p0/m, z0\.b
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_255_s8_x, svint8_t,
>> @@ -596,7 +595,7 @@ TEST_UNIFORM_Z (mul_255_s8_x, svint8_t,
>>
>> /*
>> ** mul_m1_s8_x:
>> -** mul z0\.b, z0\.b, #-1
>> +** neg z0\.b, p0/m, z0\.b
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s8_x, svint8_t,
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>> index c96bb2763dc..60cf8345d6a 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>> @@ -42,7 +42,9 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> TEST_TYPES_1 (uint64, u64)
>>
>> #define TEST_VALUES_S_1(B, OP1, OP2) \
>> - F (int##B, s##B, x, OP1, OP2)
>> + F (int##B, s##B, x, OP1, OP2) \
>> + F (int##B, s##B, z, OP1, OP2) \
>> + F (int##B, s##B, m, OP1, OP2)
>>
>> #define TEST_VALUES_S \
>> TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN) \
>> @@ -60,7 +62,11 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> TEST_VALUES_S_1 (32, INT32_MAX, -5) \
>> TEST_VALUES_S_1 (64, INT64_MAX, -5) \
>> TEST_VALUES_S_1 (32, INT32_MIN, -4) \
>> - TEST_VALUES_S_1 (64, INT64_MIN, -4)
>> + TEST_VALUES_S_1 (64, INT64_MIN, -4) \
>> + TEST_VALUES_S_1 (32, INT32_MAX, -1) \
>> + TEST_VALUES_S_1 (32, -7, -1) \
>> + TEST_VALUES_S_1 (64, INT64_MIN, -1) \
>> + TEST_VALUES_S_1 (64, 16, -1)
>>
>> #define TEST_VALUES_U_1(B, OP1, OP2) \
>> F (uint##B, u##B, x, OP1, OP2)
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
>> index c369d5be167..eb897d622fc 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
>> @@ -44,7 +44,9 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> TEST_TYPES_1 (uint64, u64)
>>
>> #define TEST_VALUES_S_1(B, OP1, OP2) \
>> - F (int##B, s##B, x, OP1, OP2)
>> + F (int##B, s##B, x, OP1, OP2) \
>> + F (int##B, s##B, m, OP1, OP2) \
>> + F (int##B, s##B, z, OP1, OP2)
>>
>> #define TEST_VALUES_S \
>> TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN) \
>> @@ -70,7 +72,11 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> TEST_VALUES_S_1 (32, INT32_MAX, -5) \
>> TEST_VALUES_S_1 (64, INT64_MAX, -5) \
>> TEST_VALUES_S_1 (32, INT32_MIN, -4) \
>> - TEST_VALUES_S_1 (64, INT64_MIN, -4)
>> + TEST_VALUES_S_1 (64, INT64_MIN, -4) \
>> + TEST_VALUES_S_1 (32, INT32_MAX, -1) \
>> + TEST_VALUES_S_1 (32, -7, -1) \
>> + TEST_VALUES_S_1 (64, INT64_MIN, -1) \
>> + TEST_VALUES_S_1 (64, 16, -1)
>>
>> #define TEST_VALUES_U_1(B, OP1, OP2) \
>> F (uint##B, u##B, x, OP1, OP2)
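
As a side note, not part of the patch: at the source level the adjusted test
expectations simply reflect that a multiplication by -1 is now treated like a
negation. For example, the two functions below (made-up names, shown only for
illustration) should compile to the same single predicated NEG under the fold.

#include <arm_sve.h>

/* Hypothetical example: with the fold, mul_by_m1 is expected to generate the
   same "neg z0.s, p0/m, z0.s" as explicit_neg.  */
svint32_t
mul_by_m1 (svbool_t pg, svint32_t x)
{
  return svmul_n_s32_x (pg, x, -1);
}

svint32_t
explicit_neg (svbool_t pg, svint32_t x)
{
  return svneg_s32_x (pg, x);
}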