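This patch adds support for scatter stores of partial vectors, where the vector base or offset elements can be wider than the elements being stored. For example, 8-bit and 16-bit elements can now be scattered using 32-bit offsets (st1b/st1h on .s containers), and 16-bit and 32-bit elements can be scattered using 64-bit vector bases (st1h/st1w on .d containers).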
Tested on aarch64-linux-gnu and applied as r278347. Richard 2019-11-16 Richard Sandiford <richard.sandif...@arm.com> gcc/ * config/aarch64/aarch64-sve.md (scatter_store<SVE_FULL_SD:mode><v_int_equiv>): Extend to... (scatter_store<SVE_24:mode><v_int_container>): ...this. (mask_scatter_store<SVE_FULL_S:mode><v_int_equiv>): Extend to... (mask_scatter_store<SVE_4:mode><v_int_equiv>): ...this. (mask_scatter_store<SVE_FULL_D:mode><v_int_equiv>): Extend to... (mask_scatter_store<SVE_2:mode><v_int_equiv>): ...this. (*mask_scatter_store<mode><v_int_container>_<su>xtw_unpacked): New pattern. (*mask_scatter_store<SVE_FULL_D:mode><v_int_equiv>_sxtw): Extend to... (*mask_scatter_store<SVE_2:mode><v_int_equiv>_sxtw): ...this. (*mask_scatter_store<SVE_FULL_D:mode><v_int_equiv>_uxtw): Extend to... (*mask_scatter_store<SVE_2:mode><v_int_equiv>_uxtw): ...this. gcc/testsuite/ * gcc.target/aarch64/sve/scatter_store_1.c (TEST_LOOP): Start at 0. (TEST_ALL): Add tests for 8-bit and 16-bit elements. * gcc.target/aarch64/sve/scatter_store_2.c: Update accordingly. * gcc.target/aarch64/sve/scatter_store_3.c (TEST_LOOP): Start at 0. (TEST_ALL): Add tests for 8-bit and 16-bit elements. * gcc.target/aarch64/sve/scatter_store_4.c: Update accordingly. * gcc.target/aarch64/sve/scatter_store_5.c (TEST_LOOP): Start at 0. (TEST_ALL): Add tests for 8-bit, 16-bit and 32-bit elements. * gcc.target/aarch64/sve/scatter_store_8.c: New test. * gcc.target/aarch64/sve/scatter_store_9.c: Likewise. Index: gcc/config/aarch64/aarch64-sve.md =================================================================== --- gcc/config/aarch64/aarch64-sve.md 2019-11-16 11:26:06.895163107 +0000 +++ gcc/config/aarch64/aarch64-sve.md 2019-11-16 11:28:29.386158694 +0000 @@ -2135,15 +2135,15 @@ (define_insn "@aarch64_stnt1<mode>" ;; ------------------------------------------------------------------------- ;; Unpredicated scatter stores. 
-(define_expand "scatter_store<mode><v_int_equiv>" +(define_expand "scatter_store<mode><v_int_container>" [(set (mem:BLK (scratch)) (unspec:BLK [(match_dup 5) (match_operand:DI 0 "aarch64_sve_gather_offset_<Vesize>") - (match_operand:<V_INT_EQUIV> 1 "register_operand") + (match_operand:<V_INT_CONTAINER> 1 "register_operand") (match_operand:DI 2 "const_int_operand") (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>") - (match_operand:SVE_FULL_SD 4 "register_operand")] + (match_operand:SVE_24 4 "register_operand")] UNSPEC_ST1_SCATTER))] "TARGET_SVE" { @@ -2153,48 +2153,74 @@ (define_expand "scatter_store<mode><v_in ;; Predicated scatter stores for 32-bit elements. Operand 2 is true for ;; unsigned extension and false for signed extension. -(define_insn "mask_scatter_store<mode><v_int_equiv>" +(define_insn "mask_scatter_store<mode><v_int_container>" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") - (match_operand:DI 0 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk") + (match_operand:DI 0 "aarch64_sve_gather_offset_<Vesize>" "Z, vgw, rk, rk, rk, rk") (match_operand:VNx4SI 1 "register_operand" "w, w, w, w, w, w") (match_operand:DI 2 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") - (match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i") - (match_operand:SVE_FULL_S 4 "register_operand" "w, w, w, w, w, w")] + (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i") + (match_operand:SVE_4 4 "register_operand" "w, w, w, w, w, w")] UNSPEC_ST1_SCATTER))] "TARGET_SVE" "@ - st1w\t%4.s, %5, [%1.s] - st1w\t%4.s, %5, [%1.s, #%0] - st1w\t%4.s, %5, [%0, %1.s, sxtw] - st1w\t%4.s, %5, [%0, %1.s, uxtw] - st1w\t%4.s, %5, [%0, %1.s, sxtw %p3] - st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]" + st1<Vesize>\t%4.s, %5, [%1.s] + st1<Vesize>\t%4.s, %5, [%1.s, #%0] + st1<Vesize>\t%4.s, %5, [%0, %1.s, sxtw] + st1<Vesize>\t%4.s, %5, [%0, %1.s, uxtw] + st1<Vesize>\t%4.s, 
%5, [%0, %1.s, sxtw %p3] + st1<Vesize>\t%4.s, %5, [%0, %1.s, uxtw %p3]" ) ;; Predicated scatter stores for 64-bit elements. The value of operand 2 ;; doesn't matter in this case. -(define_insn "mask_scatter_store<mode><v_int_equiv>" +(define_insn "mask_scatter_store<mode><v_int_container>" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") - (match_operand:DI 0 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk") + (match_operand:DI 0 "aarch64_sve_gather_offset_<Vesize>" "Z, vgd, rk, rk") (match_operand:VNx2DI 1 "register_operand" "w, w, w, w") (match_operand:DI 2 "const_int_operand") - (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i") - (match_operand:SVE_FULL_D 4 "register_operand" "w, w, w, w")] + (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>" "Ui1, Ui1, Ui1, i") + (match_operand:SVE_2 4 "register_operand" "w, w, w, w")] UNSPEC_ST1_SCATTER))] "TARGET_SVE" "@ - st1d\t%4.d, %5, [%1.d] - st1d\t%4.d, %5, [%1.d, #%0] - st1d\t%4.d, %5, [%0, %1.d] - st1d\t%4.d, %5, [%0, %1.d, lsl %p3]" + st1<Vesize>\t%4.d, %5, [%1.d] + st1<Vesize>\t%4.d, %5, [%1.d, #%0] + st1<Vesize>\t%4.d, %5, [%0, %1.d] + st1<Vesize>\t%4.d, %5, [%0, %1.d, lsl %p3]" ) -;; Likewise, but with the offset being sign-extended from 32 bits. -(define_insn_and_rewrite "*mask_scatter_store<mode><v_int_equiv>_sxtw" +;; Likewise, but with the offset being extended from 32 bits. 
+(define_insn_and_rewrite "*mask_scatter_store<mode><v_int_container>_<su>xtw_unpacked" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 0 "register_operand" "rk, rk") + (unspec:VNx2DI + [(match_operand 6) + (ANY_EXTEND:VNx2DI + (match_operand:VNx2SI 1 "register_operand" "w, w"))] + UNSPEC_PRED_X) + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i") + (match_operand:SVE_2 4 "register_operand" "w, w")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1<Vesize>\t%4.d, %5, [%0, %1.d, <su>xtw] + st1<Vesize>\t%4.d, %5, [%0, %1.d, <su>xtw %p3]" + "&& !CONSTANT_P (operands[6])" + { + operands[6] = CONSTM1_RTX (<VPRED>mode); + } +) + +;; Likewise, but with the offset being truncated to 32 bits and then +;; sign-extended. +(define_insn_and_rewrite "*mask_scatter_store<mode><v_int_container>_sxtw" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") @@ -2206,21 +2232,22 @@ (define_insn_and_rewrite "*mask_scatter_ (match_operand:VNx2DI 1 "register_operand" "w, w")))] UNSPEC_PRED_X) (match_operand:DI 2 "const_int_operand") - (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, i") - (match_operand:SVE_FULL_D 4 "register_operand" "w, w")] + (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i") + (match_operand:SVE_2 4 "register_operand" "w, w")] UNSPEC_ST1_SCATTER))] "TARGET_SVE" "@ - st1d\t%4.d, %5, [%0, %1.d, sxtw] - st1d\t%4.d, %5, [%0, %1.d, sxtw %p3]" - "&& !rtx_equal_p (operands[5], operands[6])" + st1<Vesize>\t%4.d, %5, [%0, %1.d, sxtw] + st1<Vesize>\t%4.d, %5, [%0, %1.d, sxtw %p3]" + "&& !CONSTANT_P (operands[6])" { - operands[6] = copy_rtx (operands[5]); + operands[6] = CONSTM1_RTX (<VPRED>mode); } ) -;; Likewise, but with the offset being zero-extended from 32 bits. 
-(define_insn "*mask_scatter_store<mode><v_int_equiv>_uxtw" +;; Likewise, but with the offset being truncated to 32 bits and then +;; zero-extended. +(define_insn "*mask_scatter_store<mode><v_int_container>_uxtw" [(set (mem:BLK (scratch)) (unspec:BLK [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") @@ -2229,13 +2256,13 @@ (define_insn "*mask_scatter_store<mode>< (match_operand:VNx2DI 1 "register_operand" "w, w") (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) (match_operand:DI 2 "const_int_operand") - (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, i") - (match_operand:SVE_FULL_D 4 "register_operand" "w, w")] + (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i") + (match_operand:SVE_2 4 "register_operand" "w, w")] UNSPEC_ST1_SCATTER))] "TARGET_SVE" "@ - st1d\t%4.d, %5, [%0, %1.d, uxtw] - st1d\t%4.d, %5, [%0, %1.d, uxtw %p3]" + st1<Vesize>\t%4.d, %5, [%0, %1.d, uxtw] + st1<Vesize>\t%4.d, %5, [%0, %1.d, uxtw %p3]" ) ;; ------------------------------------------------------------------------- Index: gcc/testsuite/gcc.target/aarch64/sve/scatter_store_1.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/scatter_store_1.c 2019-03-08 18:14:29.792994691 +0000 +++ gcc/testsuite/gcc.target/aarch64/sve/scatter_store_1.c 2019-11-16 11:28:29.386158694 +0000 @@ -13,11 +13,15 @@ #define TEST_LOOP(DATA_TYPE, BITS) \ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ INDEX##BITS *indices, int n) \ { \ - for (int i = 9; i < n; ++i) \ + for (int i = 0; i < n; ++i) \ dest[indices[i]] = src[i] + 1; \ } #define TEST_ALL(T) \ + T (int8_t, 32) \ + T (uint8_t, 32) \ + T (int16_t, 32) \ + T (uint16_t, 32) \ T (int32_t, 32) \ T (uint32_t, 32) \ T (float, 32) \ @@ -27,5 +31,7 @@ #define TEST_ALL(T) \ TEST_ALL (TEST_LOOP) +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */ +/* { dg-final { scan-assembler-times 
{\tst1h\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 1\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/scatter_store_2.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/scatter_store_2.c 2019-03-08 18:14:29.764994797 +0000 +++ gcc/testsuite/gcc.target/aarch64/sve/scatter_store_2.c 2019-11-16 11:28:29.386158694 +0000 @@ -6,5 +6,7 @@ #define INDEX64 uint64_t #include "scatter_store_1.c" +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 1\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/scatter_store_3.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/scatter_store_3.c 2019-03-08 18:14:29.768994780 +0000 +++ gcc/testsuite/gcc.target/aarch64/sve/scatter_store_3.c 2019-11-16 11:28:29.386158694 +0000 @@ -8,17 +8,20 @@ #define INDEX32 int32_t #define INDEX64 int64_t #endif -/* Invoked 18 times for each data size. 
*/ #define TEST_LOOP(DATA_TYPE, BITS) \ void __attribute__ ((noinline, noclone)) \ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ INDEX##BITS *indices, int n) \ { \ - for (int i = 9; i < n; ++i) \ + for (int i = 0; i < n; ++i) \ *(DATA_TYPE *) ((char *) dest + indices[i]) = src[i] + 1; \ } #define TEST_ALL(T) \ + T (int8_t, 32) \ + T (uint8_t, 32) \ + T (int16_t, 32) \ + T (uint16_t, 32) \ T (int32_t, 32) \ T (uint32_t, 32) \ T (float, 32) \ @@ -28,5 +31,7 @@ #define TEST_ALL(T) \ TEST_ALL (TEST_LOOP) +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/scatter_store_4.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/scatter_store_4.c 2019-03-08 18:14:29.772994767 +0000 +++ gcc/testsuite/gcc.target/aarch64/sve/scatter_store_4.c 2019-11-16 11:28:29.386158694 +0000 @@ -6,5 +6,7 @@ #define INDEX64 uint64_t #include "scatter_store_3.c" +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 3 } } */ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/scatter_store_5.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/scatter_store_5.c 2019-03-08 18:14:29.776994751 +0000 +++ 
gcc/testsuite/gcc.target/aarch64/sve/scatter_store_5.c 2019-11-16 11:28:29.386158694 +0000 @@ -3,21 +3,29 @@ #include <stdint.h> -/* Invoked 18 times for each data size. */ #define TEST_LOOP(DATA_TYPE) \ void __attribute__ ((noinline, noclone)) \ f_##DATA_TYPE (DATA_TYPE *restrict *dest, DATA_TYPE *restrict src, \ int n) \ { \ - for (int i = 9; i < n; ++i) \ + for (int i = 0; i < n; ++i) \ *dest[i] = src[i] + 1; \ } #define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ T (int64_t) \ T (uint64_t) \ T (double) TEST_ALL (TEST_LOOP) +/* We assume this isn't profitable for bytes. */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.d, p[0-7], \[z[0-9]+.d\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d, p[0-7], \[z[0-9]+.d\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[z[0-9]+.d\]\n} 3 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/scatter_store_8.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/scatter_store_8.c 2019-11-16 11:28:29.386158694 +0000 @@ -0,0 +1,46 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */ + +#include <stdint.h> + +#ifndef INDEX32 +#define INDEX16 int16_t +#define INDEX32 int32_t +#endif + +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + INDEX##BITS *indices, INDEX##BITS mask, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + dest[(INDEX##BITS) (indices[i] + mask)] = src[i]; \ + } + +#define TEST_ALL(T) \ + T (int8_t, 16) \ + T (uint8_t, 16) \ + T (int16_t, 16) \ + T (uint16_t, 16) \ + T (_Float16, 16) \ + T (int32_t, 16) \ + T (uint32_t, 16) \ + T (float, 16) \ + T (int64_t, 32) \ + T (uint64_t, 32) \ + T (double, 32) + +TEST_ALL 
(TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 1\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, sxtw 3\]\n} 3 } } */ + +/* { dg-final { scan-assembler-times {\tsxt.\tz} 8 } } */ +/* { dg-final { scan-assembler-times {\tsxth\tz[0-9]+\.s,} 8 } } */ + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/scatter_store_9.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/scatter_store_9.c 2019-11-16 11:28:29.386158694 +0000 @@ -0,0 +1,20 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */ + +#define INDEX16 uint16_t +#define INDEX32 uint32_t + +#include "scatter_store_8.c" + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 1\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, uxtw 3\]\n} 3 } } */ + +/* { dg-final { scan-assembler-times {\tuxt.\tz} 8 } } */ +/* { dg-final { scan-assembler-times {\tuxth\tz[0-9]+\.s,} 8 } } */ + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } 
*/ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */