Hi all, I'd like to submit the following patch implementing the bfloat16_t Neon-related copy intrinsics: vcopy_lane_bf16, vcopyq_lane_bf16, vcopyq_laneq_bf16, vcopy_laneq_bf16.
Please refer to: ACLE <https://developer.arm.com/docs/101028/latest> ISA <https://developer.arm.com/docs/ddi0596/latest> Regtested and bootstrapped. Regards Andrea
>From d1335c0f49df849b87ee522e9507023113051839 Mon Sep 17 00:00:00 2001 From: Andrea Corallo <andrea.cora...@arm.com> Date: Thu, 8 Oct 2020 12:29:00 +0200 Subject: [PATCH] aarch64: Add vcopy(q)__lane(q)_bf16 intrinsics gcc/ChangeLog 2020-10-20 Andrea Corallo <andrea.cora...@arm.com> * config/aarch64/arm_neon.h (vcopy_lane_bf16, vcopyq_lane_bf16) (vcopyq_laneq_bf16, vcopy_laneq_bf16): New intrinsics. gcc/testsuite/ChangeLog 2020-10-20 Andrea Corallo <andrea.cora...@arm.com> * gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c: New test. * gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c: Likewise. 
--- gcc/config/aarch64/arm_neon.h | 36 +++++++++++++++++++ .../bf16_vect_copy_lane_1.c | 32 +++++++++++++++++ .../vcopy_lane_bf16_indices_1.c | 18 ++++++++++ .../vcopy_lane_bf16_indices_2.c | 18 ++++++++++ .../vcopy_laneq_bf16_indices_1.c | 17 +++++++++ .../vcopy_laneq_bf16_indices_2.c | 17 +++++++++ .../vcopyq_lane_bf16_indices_1.c | 17 +++++++++ .../vcopyq_lane_bf16_indices_2.c | 17 +++++++++ .../vcopyq_laneq_bf16_indices_1.c | 17 +++++++++ .../vcopyq_laneq_bf16_indices_2.c | 17 +++++++++ 10 files changed, 206 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0088ea9896f..9c801661775 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -35155,6 +35155,42 @@ vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a) return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a); } +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_lane_bf16 (bfloat16x4_t __a, const int __lane1, + bfloat16x4_t __b, const int __lane2) +{ + return 
__aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_lane_bf16 (bfloat16x8_t __a, const int __lane1, + bfloat16x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopy_laneq_bf16 (bfloat16x4_t __a, const int __lane1, + bfloat16x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcopyq_laneq_bf16 (bfloat16x8_t __a, const int __lane1, + bfloat16x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + #pragma GCC pop_options /* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. 
*/ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c new file mode 100644 index 00000000000..d5aa215c21a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c @@ -0,0 +1,32 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-additional-options "-march=armv8.2-a+bf16 -O3 --save-temps -std=gnu90" } */ + +#include "arm_neon.h" + +bfloat16x4_t __attribute__((noinline,noclone)) +test_vcopy_lane_bf16 (bfloat16x4_t a, bfloat16x4_t b) +{ + return vcopy_lane_bf16 (a, 1, b, 2); +} + +bfloat16x8_t __attribute__((noinline,noclone)) +test_vcopyq_lane_bf16 (bfloat16x8_t a, bfloat16x4_t b) +{ + return vcopyq_lane_bf16 (a, 1, b, 2); +} + +bfloat16x4_t __attribute__((noinline,noclone)) +test_vcopy_laneq_bf16 (bfloat16x4_t a, bfloat16x8_t b) +{ + return vcopy_laneq_bf16 (a, 1, b, 2); +} + +bfloat16x8_t __attribute__((noinline,noclone)) +test_vcopyq_laneq_bf16 (bfloat16x8_t a, bfloat16x8_t b) +{ + return vcopyq_laneq_bf16 (a, 1, b, 2); +} + +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[1\\\], v1.h\\\[2\\\]" 2 } } */ +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[1\\\], v1.h\\\[0\\\]" 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c new file mode 100644 index 00000000000..4b9a3b210a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c @@ -0,0 +1,18 @@ +#include <arm_neon.h> + +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x4_t +test_vcopy_lane_bf16 (bfloat16x4_t a, bfloat16x4_t b) 
+{ + bfloat16x4_t res; + res = vcopy_lane_bf16 (a, 0, b, 4); + res = vcopy_lane_bf16 (a, 0, b, -1); + return res; +} + +/* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ +/* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c new file mode 100644 index 00000000000..659f0f210d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c @@ -0,0 +1,18 @@ +#include <arm_neon.h> + +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x4_t +test_vcopy_lane_bf16 (bfloat16x4_t a, bfloat16x4_t b) +{ + bfloat16x4_t res; + res = vcopy_lane_bf16 (a, -1, b, 2); + res = vcopy_lane_bf16 (a, 4, b, 2); + return res; +} + +/* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ +/* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c new file mode 100644 index 00000000000..33cc289dc6d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x4_t +test_vcopy_laneq_bf16 (bfloat16x4_t a, bfloat16x8_t b) +{ + bfloat16x4_t res; + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vcopy_laneq_bf16 (a, -1, b, 2); + /* { dg-error "lane 
4 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vcopy_laneq_bf16 (a, 4, b, 2); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c new file mode 100644 index 00000000000..503cd0f2e45 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x4_t +test_vcopy_laneq_bf16 (bfloat16x4_t a, bfloat16x8_t b) +{ + bfloat16x4_t res; + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vcopy_laneq_bf16 (a, 1, b, -1); + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vcopy_laneq_bf16 (a, 1, b, 8); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c new file mode 100644 index 00000000000..a46f54ab0ca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x8_t +test_vcopyq_lane_bf16 (bfloat16x8_t a, bfloat16x4_t b) +{ + bfloat16x8_t res; + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vcopyq_lane_bf16 (a, -1, b, 2); + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vcopyq_lane_bf16 (a, 8, b, 2); + return res; +} diff --git 
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c new file mode 100644 index 00000000000..100e5dd40ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x8_t +test_vcopyq_lane_bf16 (bfloat16x8_t a, bfloat16x4_t b) +{ + bfloat16x8_t res; + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vcopyq_lane_bf16 (a, 2, b, -1); + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + res = vcopyq_lane_bf16 (a, 2, b, 4); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c new file mode 100644 index 00000000000..914d7318f8d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x8_t +test_vcopyq_laneq_bf16 (bfloat16x8_t a, bfloat16x8_t b) +{ + bfloat16x8_t res; + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vcopyq_laneq_bf16 (a, -1, b, 2); + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vcopyq_laneq_bf16 (a, 8, b, 2); + return res; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c new file mode 100644 index 00000000000..244e6eb514e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c @@ -0,0 +1,17 @@ +#include <arm_neon.h> + +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +bfloat16x8_t +test_vcopyq_laneq_bf16 (bfloat16x8_t a, bfloat16x8_t b) +{ + bfloat16x8_t res; + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vcopyq_laneq_bf16 (a, 2, b, -1); + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + res = vcopyq_laneq_bf16 (a, 2, b, 8); + return res; +} -- 2.20.1