Claudio Bantaloukas <claudio.bantalou...@arm.com> writes: > [...] > @@ -231,12 +231,12 @@ CONSTEXPR const group_suffix_info group_suffixes[] = { > #define TYPES_all_arith(S, D) \ > TYPES_all_float (S, D), TYPES_all_integer (S, D) > > -/* _bf16 > +/* _mf8 _bf16 > _f16 _f32 _f64 > _s8 _s16 _s32 _s64 > _u8 _u16 _u32 _u64. */ > #define TYPES_all_data(S, D) \ > - S (bf16), TYPES_all_arith (S, D) > + S(mf8), S (bf16), TYPES_all_arith (S, D)
Sorry for the clash, but I've since pushed the SVE2p1 patches, which redefine all_data in terms of separate b_data, h_data, s_data, and d_data. This would now be part of b_data, and we should now get things like svluti2_lane_zt_mf8 for free. We should probably also add mf8 to things like za_bhsd_data, but that can be a separate follow-on patch. > +/* > +** caller_0: > +** ... > +** mov (z[0-9]+\.b), w2 > +** ... > +** st1b \1, p[0-7], \[x1\] > +** ... > +** ret > +*/ > +void __attribute__((noipa)) > +caller_0 (mfloat8_t *ptr, mfloat8_t in) > +{ > + callee_0 (ptr, svdup_mf8 (in)); > +} w2 isn't a meaningful register here, since the data should be passed in via b0. I suppose for now we should match the move into w2 as well, with a note to say that this should be optimised away later. (Although the hard-coded w2 should be replaced with (w[0-9]+) for the move in and \1 for the move out.) Same for the other callers in this file. Looks good otherwise, thanks! Richard > + > [...] > +/* > +** caller_1: > +** ... > +** mov (z[0-9]+\.b), w3 > +** ... > +** st1b \1, p[0-7], \[x2\] > +** ... > +** ret > +*/ > +void __attribute__((noipa)) > +caller_1 (mfloat8_t *ptr, mfloat8_t in) > +{ > + callee_1 (ptr, 1, svdup_mf8 (in)); > +} > + > +/* > +** callee_7: > +** ... > +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x7\] > +** ... > +** st1b \1, p[0-7], \[x0\] > +** ... > +** ret > +*/ > +void __attribute__((noipa)) > +callee_7 (mfloat8_t *ptr, ...) > +{ > + va_list va; > + svmfloat8_t vec; > + > + va_start (va, ptr); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + vec = va_arg (va, svmfloat8_t); > + va_end (va); > + svst1 (svptrue_b8 (), ptr, vec); > +} > + > +/* > +** caller_7: > +** ... > +** mov (z[0-9]+\.b), w8 > +** ... > +** st1b \1, p[0-7], \[x7\] > +** ... 
> +** ret > +*/ > +void __attribute__((noipa)) > +caller_7 (mfloat8_t *ptr, mfloat8_t in) > +{ > + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_mf8 (in)); > +} > + > +/* FIXME: We should be able to get rid of the va_list object. */ > +/* > +** callee_8: > +** sub sp, sp, #([0-9]+) > +** ... > +** ldr (x[0-9]+), \[sp, \1\] > +** ... > +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[\2\] > +** ... > +** st1b \3, \4, \[x0\] > +** ... > +** ret > +*/ > +void __attribute__((noipa)) > +callee_8 (mfloat8_t *ptr, ...) > +{ > + va_list va; > + svmfloat8_t vec; > + > + va_start (va, ptr); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + va_arg (va, int); > + vec = va_arg (va, svmfloat8_t); > + va_end (va); > + svst1 (svptrue_b8 (), ptr, vec); > +} > + > +/* > +** caller_8: > +** ... > +** mov (z[0-9]+\.b), w1 > +** ... > +** st1b \1, p[0-7], \[(x[0-9]+)\] > +** ... > +** str \2, \[sp\] > +** ... > +** ret > +*/ > +void __attribute__((noipa)) > +caller_8 (mfloat8_t *ptr, mfloat8_t in) > +{ > + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_mf8 (in)); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbl2_mf8.c > b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbl2_mf8.c > new file mode 100644 > index 00000000000..19cc739e7ab > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbl2_mf8.c > @@ -0,0 +1,31 @@ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ > + > +#include "test_sve_acle.h" > + > +/* > +** tbl2_mf8_tied1: > +** tbl z0\.b, {z0\.b(?:, | - )z1\.b}, z4\.b > +** ret > +*/ > +TEST_TBL2 (tbl2_mf8_tied1, svmfloat8x2_t, svmfloat8_t, svuint8_t, > + z0_res = svtbl2_mf8 (z0, z4), > + z0_res = svtbl2 (z0, z4)) > + > +/* > +** tbl2_mf8_tied2: > +** tbl z0\.b, {z1\.b(?:, | - )z2\.b}, z0\.b > +** ret > +*/ > +TEST_TBL2_REV (tbl2_mf8_tied2, svmfloat8x2_t, svmfloat8_t, svuint8_t, > + z0_res = svtbl2_mf8 (z1, z0), > + z0_res = svtbl2 (z1, z0)) > + > +/* > +** 
tbl2_mf8_untied: > +** tbl z0\.b, {z2\.b(?:, | - )z3\.b}, z4\.b > +** ret > +*/ > +TEST_TBL2 (tbl2_mf8_untied, svmfloat8x2_t, svmfloat8_t, svuint8_t, > + z0_res = svtbl2_mf8 (z2, z4), > + z0_res = svtbl2 (z2, z4)) > + > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbx_mf8.c > b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbx_mf8.c > new file mode 100644 > index 00000000000..ba0fef3934b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbx_mf8.c > @@ -0,0 +1,37 @@ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ > + > +#include "test_sve_acle.h" > + > +/* > +** tbx_mf8_tied1: > +** tbx z0\.b, z1\.b, z4\.b > +** ret > +*/ > +TEST_DUAL_Z (tbx_mf8_tied1, svmfloat8_t, svuint8_t, > + z0 = svtbx_mf8 (z0, z1, z4), > + z0 = svtbx (z0, z1, z4)) > + > +/* Bad RA choice: no preferred output sequence. */ > +TEST_DUAL_Z (tbx_mf8_tied2, svmfloat8_t, svuint8_t, > + z0 = svtbx_mf8 (z1, z0, z4), > + z0 = svtbx (z1, z0, z4)) > + > +/* Bad RA choice: no preferred output sequence. */ > +TEST_DUAL_Z_REV (tbx_mf8_tied3, svmfloat8_t, svuint8_t, > + z0_res = svtbx_mf8 (z4, z5, z0), > + z0_res = svtbx (z4, z5, z0)) > + > +/* > +** tbx_mf8_untied: > +** ( > +** mov z0\.d, z1\.d > +** tbx z0\.b, z2\.b, z4\.b > +** | > +** tbx z1\.b, z2\.b, z4\.b > +** mov z0\.d, z1\.d > +** ) > +** ret > +*/ > +TEST_DUAL_Z (tbx_mf8_untied, svmfloat8_t, svuint8_t, > + z0 = svtbx_mf8 (z1, z2, z4), > + z0 = svtbx (z1, z2, z4)) > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilerw_mf8.c > b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilerw_mf8.c > new file mode 100644 > index 00000000000..12cf0d2c365 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilerw_mf8.c > @@ -0,0 +1,50 @@ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
> ilp32 } } } } */ > + > +#include "test_sve_acle.h" > + > +/* > +** whilerw_rr_mf8: > +** whilerw p0\.b, x0, x1 > +** ret > +*/ > +TEST_COMPARE_S (whilerw_rr_mf8, const mfloat8_t *, > + p0 = svwhilerw_mf8 (x0, x1), > + p0 = svwhilerw (x0, x1)) > + > +/* > +** whilerw_0r_mf8: > +** whilerw p0\.b, xzr, x1 > +** ret > +*/ > +TEST_COMPARE_S (whilerw_0r_mf8, const mfloat8_t *, > + p0 = svwhilerw_mf8 ((const mfloat8_t *) 0, x1), > + p0 = svwhilerw ((const mfloat8_t *) 0, x1)) > + > +/* > +** whilerw_cr_mf8: > +** mov (x[0-9]+), #?1073741824 > +** whilerw p0\.b, \1, x1 > +** ret > +*/ > +TEST_COMPARE_S (whilerw_cr_mf8, const mfloat8_t *, > + p0 = svwhilerw_mf8 ((const mfloat8_t *) 1073741824, x1), > + p0 = svwhilerw ((const mfloat8_t *) 1073741824, x1)) > + > +/* > +** whilerw_r0_mf8: > +** whilerw p0\.b, x0, xzr > +** ret > +*/ > +TEST_COMPARE_S (whilerw_r0_mf8, const mfloat8_t *, > + p0 = svwhilerw_mf8 (x0, (const mfloat8_t *) 0), > + p0 = svwhilerw (x0, (const mfloat8_t *) 0)) > + > +/* > +** whilerw_rc_mf8: > +** mov (x[0-9]+), #?1073741824 > +** whilerw p0\.b, x0, \1 > +** ret > +*/ > +TEST_COMPARE_S (whilerw_rc_mf8, const mfloat8_t *, > + p0 = svwhilerw_mf8 (x0, (const mfloat8_t *) 1073741824), > + p0 = svwhilerw (x0, (const mfloat8_t *) 1073741824)) > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilewr_mf8.c > b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilewr_mf8.c > new file mode 100644 > index 00000000000..c4023a2fbff > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilewr_mf8.c > @@ -0,0 +1,50 @@ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
> ilp32 } } } } */ > + > +#include "test_sve_acle.h" > + > +/* > +** whilewr_rr_mf8: > +** whilewr p0\.b, x0, x1 > +** ret > +*/ > +TEST_COMPARE_S (whilewr_rr_mf8, const mfloat8_t *, > + p0 = svwhilewr_mf8 (x0, x1), > + p0 = svwhilewr (x0, x1)) > + > +/* > +** whilewr_0r_mf8: > +** whilewr p0\.b, xzr, x1 > +** ret > +*/ > +TEST_COMPARE_S (whilewr_0r_mf8, const mfloat8_t *, > + p0 = svwhilewr_mf8 ((const mfloat8_t *) 0, x1), > + p0 = svwhilewr ((const mfloat8_t *) 0, x1)) > + > +/* > +** whilewr_cr_mf8: > +** mov (x[0-9]+), #?1073741824 > +** whilewr p0\.b, \1, x1 > +** ret > +*/ > +TEST_COMPARE_S (whilewr_cr_mf8, const mfloat8_t *, > + p0 = svwhilewr_mf8 ((const mfloat8_t *) 1073741824, x1), > + p0 = svwhilewr ((const mfloat8_t *) 1073741824, x1)) > + > +/* > +** whilewr_r0_mf8: > +** whilewr p0\.b, x0, xzr > +** ret > +*/ > +TEST_COMPARE_S (whilewr_r0_mf8, const mfloat8_t *, > + p0 = svwhilewr_mf8 (x0, (const mfloat8_t *) 0), > + p0 = svwhilewr (x0, (const mfloat8_t *) 0)) > + > +/* > +** whilewr_rc_mf8: > +** mov (x[0-9]+), #?1073741824 > +** whilewr p0\.b, x0, \1 > +** ret > +*/ > +TEST_COMPARE_S (whilewr_rc_mf8, const mfloat8_t *, > + p0 = svwhilewr_mf8 (x0, (const mfloat8_t *) 1073741824), > + p0 = svwhilewr (x0, (const mfloat8_t *) 1073741824))