gcc/ChangeLog:

	* config/i386/avx10_2-512mediaintrin.h: Add new intrins.
	* config/i386/avx10_2mediaintrin.h: Ditto.
	* config/i386/i386-builtin.def: Add new builtins.
	* config/i386/i386-builtins.cc (def_builtin): Handle shared
	builtins between AVXVNNIINT16 and AVX10.2.
	* config/i386/i386-expand.cc (ix86_check_builtin_isa_match): Ditto.
	* config/i386/sse.md (unspec): Add UNSPEC_VDPPHPS.
	(<mask_codefor><sse4_1_avx2>_mpsadbw<mask_name>): New define_insn.
	(avx10_2_mpsadbw<mask_name>): Ditto.
	(vpdp<vpdpwprodtype>_<mode>): Add AVX10_2_256.
	(vpdp<vpdpwprodtype>_v16si): New define_insn.
	(vpdp<vpdpwprodtype>_<mode>_mask): Ditto.
	(*vpdp<vpdpwprodtype>_<mode>_maskz): Ditto.
	(vpdp<vpdpwprodtype>_<mode>_maskz): New expander.
	(vdpphps_<mode>): New define_insn.
	(vdpphps_<mode>_mask): Ditto.
	(vdpphps_<mode>_maskz_1): Ditto.
	(vdpphps_<mode>_maskz): New expander.
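For reference, a minimal usage sketch (not part of the patch) of a few of the new 512-bit intrinsics; it assumes a compiler with this patch applied and -mavx10.2-512:

#include <immintrin.h>

/* Illustrative only: word-sized dot products accumulated into 32-bit lanes.  */
__m512i
dot_accumulate (__m512i acc, __m512i a, __m512i b, __mmask16 m)
{
  /* Signed (a) x unsigned (b) 16-bit dot product added to acc.  */
  acc = _mm512_dpwsud_epi32 (acc, a, b);
  /* Masked variant: lanes whose mask bit is zero keep the previous acc value.  */
  return _mm512_mask_dpwsud_epi32 (acc, m, a, b);
}

/* FP16 pairs multiplied and accumulated into single-precision lanes.  */
__m512
fp16_dot (__m512 acc, __m512h a, __m512h b)
{
  return _mm512_dpph_ps (acc, a, b);
}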
gcc/testsuite/ChangeLog:

	* gcc.target/i386/avxvnniint16-1.c: Add new macro test.
	* gcc.target/i386/avx-1.c: Ditto.
	* gcc.target/i386/sse-13.c: Ditto.
	* gcc.target/i386/sse-14.c: Ditto.
	* gcc.target/i386/sse-22.c: Ditto.
	* gcc.target/i386/sse-23.c: Ditto.
	* gcc.target/i386/avx10_2-512-media-1.c: Add test.
	* gcc.target/i386/avx10_2-media-1.c: Ditto.
	* gcc.target/i386/avxvnniint16-builtin.c: New test.
	* gcc.target/i386/avx10_2-512-vdpphps-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vmpsadbw-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vpdpwsud-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vpdpwsuds-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vpdpwusd-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vpdpwusds-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vpdpwuud-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vpdpwuuds-2.c: Ditto.
	* gcc.target/i386/avx10_2-builtin-2.c: Ditto.
	* gcc.target/i386/avx10_2-vdpphps-2.c: Ditto.
	* gcc.target/i386/avx10_2-vmpsadbw-2.c: Ditto.
	* gcc.target/i386/avx10_2-vpdpwsud-2.c: Ditto.
	* gcc.target/i386/avx10_2-vpdpwsuds-2.c: Ditto.
	* gcc.target/i386/avx10_2-vpdpwusd-2.c: Ditto.
	* gcc.target/i386/avx10_2-vpdpwusds-2.c: Ditto.
	* gcc.target/i386/avx10_2-vpdpwuud-2.c: Ditto.
	* gcc.target/i386/avx10_2-vpdpwuuds-2.c: Ditto.

Co-authored-by: Hongyu Wang <hongyu.w...@intel.com>
---
 gcc/config/i386/avx10_2-512mediaintrin.h      | 280 +++++++++++
 gcc/config/i386/avx10_2mediaintrin.h          | 472 ++++++++++++++++++
 gcc/config/i386/i386-builtin.def              |  76 ++-
 gcc/config/i386/i386-builtins.cc              |  11 +-
 gcc/config/i386/i386-expand.cc                |   3 +
 gcc/config/i386/sse.md                        | 145 +++++-
 gcc/testsuite/gcc.target/i386/avx-1.c         |   8 +
 .../gcc.target/i386/avx10_2-512-media-1.c     |  60 +++
 .../gcc.target/i386/avx10_2-512-vdpphps-2.c   |  71 +++
 .../gcc.target/i386/avx10_2-512-vmpsadbw-2.c  |  93 ++++
 .../gcc.target/i386/avx10_2-512-vpdpwsud-2.c  |  71 +++
 .../gcc.target/i386/avx10_2-512-vpdpwsuds-2.c |  74 +++
 .../gcc.target/i386/avx10_2-512-vpdpwusd-2.c  |  71 +++
 .../gcc.target/i386/avx10_2-512-vpdpwusds-2.c |  74 +++
 .../gcc.target/i386/avx10_2-512-vpdpwuud-2.c  |  70 +++
 .../gcc.target/i386/avx10_2-512-vpdpwuuds-2.c |  73 +++
 .../gcc.target/i386/avx10_2-builtin-2.c       |   8 +
 .../gcc.target/i386/avx10_2-media-1.c         | 112 +++++
 .../gcc.target/i386/avx10_2-vdpphps-2.c       |  16 +
 .../gcc.target/i386/avx10_2-vmpsadbw-2.c      |  16 +
 .../gcc.target/i386/avx10_2-vpdpwsud-2.c      |  16 +
 .../gcc.target/i386/avx10_2-vpdpwsuds-2.c     |  16 +
 .../gcc.target/i386/avx10_2-vpdpwusd-2.c      |  16 +
 .../gcc.target/i386/avx10_2-vpdpwusds-2.c     |  16 +
 .../gcc.target/i386/avx10_2-vpdpwuud-2.c      |  16 +
 .../gcc.target/i386/avx10_2-vpdpwuuds-2.c     |  16 +
 .../gcc.target/i386/avxvnniint16-1.c          |  42 +-
 .../gcc.target/i386/avxvnniint16-builtin.c    |   8 +
 gcc/testsuite/gcc.target/i386/sse-13.c        |   8 +
 gcc/testsuite/gcc.target/i386/sse-14.c        |  11 +
 gcc/testsuite/gcc.target/i386/sse-22.c        |  11 +
 gcc/testsuite/gcc.target/i386/sse-23.c        |   8 +
 32 files changed, 1953 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vdpphps-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vmpsadbw-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwsud-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwsuds-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwusd-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwusds-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwuud-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwuuds-2.c
 create mode 100644
gcc/testsuite/gcc.target/i386/avx10_2-builtin-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vdpphps-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vmpsadbw-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsud-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsuds-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusd-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusds-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuud-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuuds-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avxvnniint16-builtin.c diff --git a/gcc/config/i386/avx10_2-512mediaintrin.h b/gcc/config/i386/avx10_2-512mediaintrin.h index 02d826b24cd..e471c83b1c4 100644 --- a/gcc/config/i386/avx10_2-512mediaintrin.h +++ b/gcc/config/i386/avx10_2-512mediaintrin.h @@ -226,6 +226,286 @@ _mm512_maskz_dpbuuds_epi32 (__mmask16 __U, __m512i __W, (__mmask16) __U); } +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwsud_epi32 (__m512i __W, __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwsud512 ((__v16si) __W, (__v16si) __A, (__v16si) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwsud_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwsud_v16si_mask ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwsud_epi32 (__mmask16 __U, __m512i __W, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwsud_v16si_maskz ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwsuds_epi32 (__m512i __W, __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwsuds512 ((__v16si) __W, (__v16si) __A, (__v16si) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwsuds_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwsuds_v16si_mask ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwsuds_epi32 (__mmask16 __U, __m512i __W, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwsuds_v16si_maskz ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwusd_epi32 (__m512i __W, __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwusd512 ((__v16si) __W, (__v16si) __A, (__v16si) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwusd_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwusd_v16si_mask ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwusd_epi32 (__mmask16 __U, __m512i __W, + __m512i __A, __m512i __B) +{ + return (__m512i) + 
__builtin_ia32_vpdpwusd_v16si_maskz ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwusds_epi32 (__m512i __W, __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwusds512 ((__v16si) __W, (__v16si) __A, (__v16si) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwusds_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwusds_v16si_mask ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwusds_epi32 (__mmask16 __U, __m512i __W, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwusds_v16si_maskz ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwuud_epi32 (__m512i __W, __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwuud512 ((__v16si) __W, (__v16si) __A, (__v16si) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwuud_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwuud_v16si_mask ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwuud_epi32 (__mmask16 __U, __m512i __W, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwuud_v16si_maskz ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwuuds_epi32 (__m512i __W, __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwuuds512 ((__v16si) __W, (__v16si) __A, (__v16si) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwuuds_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwuuds_v16si_mask ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwuuds_epi32 (__mmask16 __U, __m512i __W, + __m512i __A, __m512i __B) +{ + return (__m512i) + __builtin_ia32_vpdpwuuds_v16si_maskz ((__v16si) __W, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpph_ps (__m512 __W, __m512h __A, __m512h __B) +{ + return (__m512) + __builtin_ia32_vdpphps512_mask ((__v16sf) __W, + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpph_ps (__m512 __W, __mmask16 __U, __m512h __A, + __m512h __B) +{ + return (__m512) + __builtin_ia32_vdpphps512_mask ((__v16sf) __W, + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpph_ps (__mmask16 __U, __m512 __W, __m512h __A, + __m512h __B) +{ + return (__m512) + 
__builtin_ia32_vdpphps512_maskz ((__v16sf) __W, + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mpsadbw_epu8 (__m512i __X, __m512i __Y, const int __M) +{ + return (__m512i) __builtin_ia32_mpsadbw512 ((__v64qi) __X, + (__v64qi) __Y, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mpsadbw_epu8 (__m512i __W, __mmask32 __U, __m512i __X, + __m512i __Y, const int __M) +{ + return (__m512i) __builtin_ia32_mpsadbw512_mask ((__v64qi) __X, + (__v64qi) __Y, + __M, + (__v32hi) __W, + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mpsadbw_epu8 (__mmask32 __U, __m512i __X, + __m512i __Y, const int __M) +{ + return (__m512i) __builtin_ia32_mpsadbw512_mask ((__v64qi) __X, + (__v64qi) __Y, + __M, + (__v32hi) _mm512_setzero_epi32 (), + __U); +} +#else +#define _mm512_mpsadbw_epu8(X, Y, M) \ + (__m512i) __builtin_ia32_mpsadbw512 ((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), (int)(M)) + +#define _mm512_mask_mpsadbw_epu8(W, U, X, Y, M) \ + (__m512i) __builtin_ia32_mpsadbw512_mask ((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), \ + (int)(M), \ + (__v32hi)(__m512i)(W), \ + (__mmask32)(U)) + +#define _mm512_maskz_mpsadbw_epu8(U, X, Y, M) \ + (__m512i) __builtin_ia32_mpsadbw512_mask ((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), \ + (int)(M), \ + (__v32hi) _mm512_setzero_epi32 (), \ + (__mmask32)(U)) +#endif + #ifdef __DISABLE_AVX10_2_512__ #undef __DISABLE_AVX10_2_512__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx10_2mediaintrin.h b/gcc/config/i386/avx10_2mediaintrin.h index e668af62e36..5456c185284 100644 --- a/gcc/config/i386/avx10_2mediaintrin.h +++ b/gcc/config/i386/avx10_2mediaintrin.h @@ -70,6 +70,42 @@ #define _mm256_dpbuuds_epi32(W, A, B) \ (__m256i) __builtin_ia32_vpdpbuuds256 ((__v8si) (W), (__v8si) (A), (__v8si) (B)) +#define _mm_dpwsud_epi32(W, A, B) \ + (__m128i) __builtin_ia32_vpdpwsud128 ((__v4si) (W), (__v4si) (A), (__v4si) (B)) + +#define _mm_dpwsuds_epi32(W, A, B) \ + (__m128i) __builtin_ia32_vpdpwsuds128 ((__v4si) (W), (__v4si) (A), (__v4si) (B)) + +#define _mm_dpwusd_epi32(W, A, B) \ + (__m128i) __builtin_ia32_vpdpwusd128 ((__v4si) (W), (__v4si) (A), (__v4si) (B)) + +#define _mm_dpwusds_epi32(W, A, B) \ + (__m128i) __builtin_ia32_vpdpwusds128 ((__v4si) (W), (__v4si) (A), (__v4si) (B)) + +#define _mm_dpwuud_epi32(W, A, B) \ + (__m128i) __builtin_ia32_vpdpwuud128 ((__v4si) (W), (__v4si) (A), (__v4si) (B)) + +#define _mm_dpwuuds_epi32(W, A, B) \ + (__m128i) __builtin_ia32_vpdpwuuds128 ((__v4si) (W), (__v4si) (A), (__v4si) (B)) + +#define _mm256_dpwsud_epi32(W, A, B) \ + (__m256i) __builtin_ia32_vpdpwsud256 ((__v8si) (W), (__v8si) (A), (__v8si) (B)) + +#define _mm256_dpwsuds_epi32(W, A, B) \ + (__m256i) __builtin_ia32_vpdpwsuds256 ((__v8si) (W), (__v8si) (A), (__v8si) (B)) + +#define _mm256_dpwusd_epi32(W, A, B) \ + (__m256i) __builtin_ia32_vpdpwusd256 ((__v8si) (W), (__v8si) (A), (__v8si) (B)) + +#define _mm256_dpwusds_epi32(W, A, B) \ + (__m256i) __builtin_ia32_vpdpwusds256 ((__v8si) (W), (__v8si) (A), (__v8si) (B)) + +#define _mm256_dpwuud_epi32(W, A, B) \ + (__m256i) __builtin_ia32_vpdpwuud256 ((__v8si) (W), (__v8si) (A), (__v8si) (B)) + +#define _mm256_dpwuuds_epi32(W, A, B) \ + (__m256i) __builtin_ia32_vpdpwuuds256 ((__v8si) (W), (__v8si) (A), (__v8si) (B)) + extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_dpbssd_epi32 (__m128i __W, __mmask8 __U, @@ -358,6 +394,442 @@ _mm256_maskz_dpbuuds_epi32 (__mmask8 __U, __m256i __W, (__mmask8) __U); } +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwsud_epi32 (__m128i __W, __mmask8 __U, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwsud_v4si_mask ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwsud_epi32 (__mmask8 __U, __m128i __W, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwsud_v4si_maskz ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwsuds_epi32 (__m128i __W, __mmask8 __U, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwsuds_v4si_mask ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwsuds_epi32 (__mmask8 __U, __m128i __W, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwsuds_v4si_maskz ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwusd_epi32 (__m128i __W, __mmask8 __U, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwusd_v4si_mask ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwusd_epi32 (__mmask8 __U, __m128i __W, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwusd_v4si_maskz ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwusds_epi32 (__m128i __W, __mmask8 __U, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwusds_v4si_mask ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwusds_epi32 (__mmask8 __U, __m128i __W, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwusds_v4si_maskz ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwuud_epi32 (__m128i __W, __mmask8 __U, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwuud_v4si_mask ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwuud_epi32 (__mmask8 __U, __m128i __W, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwuud_v4si_maskz ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwuuds_epi32 (__m128i __W, __mmask8 __U, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwuuds_v4si_mask ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern 
__inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwuuds_epi32 (__mmask8 __U, __m128i __W, + __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpwuuds_v4si_maskz ((__v4si) __W, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwsud_epi32 (__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwsud_v8si_mask ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwsud_epi32 (__mmask8 __U, __m256i __W, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwsud_v8si_maskz ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwsuds_epi32 (__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwsuds_v8si_mask ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwsuds_epi32 (__mmask8 __U, __m256i __W, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwsuds_v8si_maskz ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwusd_epi32 (__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwusd_v8si_mask ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwusd_epi32 (__mmask8 __U, __m256i __W, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwusd_v8si_maskz ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwusds_epi32 (__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwusds_v8si_mask ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwusds_epi32 (__mmask8 __U, __m256i __W, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwusds_v8si_maskz ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwuud_epi32 (__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwuud_v8si_mask ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwuud_epi32 (__mmask8 __U, __m256i __W, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwuud_v8si_maskz ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwuuds_epi32 (__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) +{ + return (__m256i) + 
__builtin_ia32_vpdpwuuds_v8si_mask ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwuuds_epi32 (__mmask8 __U, __m256i __W, + __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpwuuds_v8si_maskz ((__v8si) __W, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpph_ps (__m256 __W, __m256h __A, __m256h __B) +{ + return (__m256) + __builtin_ia32_vdpphps256_mask ((__v8sf) __W, + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpph_ps (__m256 __W, __mmask8 __U, __m256h __A, + __m256h __B) +{ + return (__m256) + __builtin_ia32_vdpphps256_mask ((__v8sf) __W, + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpph_ps (__mmask8 __U, __m256 __W, __m256h __A, + __m256h __B) +{ + return (__m256) + __builtin_ia32_vdpphps256_maskz ((__v8sf) __W, + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpph_ps (__m128 __W, __m128h __A, __m128h __B) +{ + return (__m128) + __builtin_ia32_vdpphps128_mask ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpph_ps (__m128 __W, __mmask8 __U, __m128h __A, + __m128h __B) +{ + return (__m128) + __builtin_ia32_vdpphps128_mask ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpph_ps (__mmask8 __U, __m128 __W, __m128h __A, + __m128h __B) +{ + return (__m128) + __builtin_ia32_vdpphps128_maskz ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mpsadbw_epu8 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_mpsadbw128_mask ((__v16qi) __X, + (__v16qi) __Y, + __M, + (__v8hi) __W, + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mpsadbw_epu8 (__mmask8 __U, __m128i __X, + __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_mpsadbw128_mask ((__v16qi) __X, + (__v16qi) __Y, + __M, + (__v8hi) _mm_setzero_si128 (), + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mpsadbw_epu8 (__m256i __W, __mmask16 __U, __m256i __X, + __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_mpsadbw256_mask ((__v32qi) __X, + (__v32qi) __Y, + __M, + (__v16hi) __W, + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mpsadbw_epu8 (__mmask16 __U, __m256i __X, + __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_mpsadbw256_mask ((__v32qi) __X, + (__v32qi) __Y, + __M, + (__v16hi) _mm256_setzero_si256 (), + __U); +} +#else +#define _mm_mask_mpsadbw_epu8(W, U, X, Y, M) \ + (__m128i) __builtin_ia32_mpsadbw128_mask ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), 
\ + (int)(M), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U)) + +#define _mm_maskz_mpsadbw_epu8(U, X, Y, M) \ + (__m128i) __builtin_ia32_mpsadbw128_mask ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), \ + (int)(M), \ + (__v8hi) _mm_setzero_si128 (), \ + (__mmask8)(U)) + +#define _mm256_mask_mpsadbw_epu8(W, U, X, Y, M) \ + (__m256i) __builtin_ia32_mpsadbw256_mask ((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), \ + (int)(M), \ + (__v16hi)(__m256i)(W), \ + (__mmask16)(U)) + +#define _mm256_maskz_mpsadbw_epu8(U, X, Y, M) \ + (__m256i) __builtin_ia32_mpsadbw256_mask ((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), \ + (int)(M), \ + (__v16hi) _mm256_setzero_si256 (), \ + (__mmask16)(U)) + +#endif #ifdef __DISABLE_AVX10_2_256__ #undef __DISABLE_AVX10_2_256__ diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 5bd9aabdc52..cdf28cd261c 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2762,18 +2762,18 @@ BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT8 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_ BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT8 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpbuuds_v4si, "__builtin_ia32_vpdpbuuds128", IX86_BUILTIN_VPDPBUUDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) /* AVXVNNIINT16 */ -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwusd_v8si, "__builtin_ia32_vpdpwusd256", IX86_BUILTIN_VPDPWUSDV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwusds_v8si, "__builtin_ia32_vpdpwusds256", IX86_BUILTIN_VPDPWUSDSV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwsud_v8si, "__builtin_ia32_vpdpwsud256", IX86_BUILTIN_VPDPWSUDV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwsuds_v8si, "__builtin_ia32_vpdpwsuds256", IX86_BUILTIN_VPDPWSUDSV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwuud_v8si, "__builtin_ia32_vpdpwuud256", IX86_BUILTIN_VPDPWUUDV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwuuds_v8si, "__builtin_ia32_vpdpwuuds256", IX86_BUILTIN_VPDPWUUDSV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwusd_v4si, "__builtin_ia32_vpdpwusd128", IX86_BUILTIN_VPDPWUSDV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwusds_v4si, "__builtin_ia32_vpdpwusds128", IX86_BUILTIN_VPDPWUSDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwsud_v4si, "__builtin_ia32_vpdpwsud128", IX86_BUILTIN_VPDPWSUDV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwsuds_v4si, "__builtin_ia32_vpdpwsuds128", IX86_BUILTIN_VPDPWSUDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwuud_v4si, "__builtin_ia32_vpdpwuud128", IX86_BUILTIN_VPDPWUUDV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) -BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16, CODE_FOR_vpdpwuuds_v4si, "__builtin_ia32_vpdpwuuds128", IX86_BUILTIN_VPDPWUUDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusd_v8si, "__builtin_ia32_vpdpwusd256", IX86_BUILTIN_VPDPWUSDV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, 
CODE_FOR_vpdpwusds_v8si, "__builtin_ia32_vpdpwusds256", IX86_BUILTIN_VPDPWUSDSV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsud_v8si, "__builtin_ia32_vpdpwsud256", IX86_BUILTIN_VPDPWSUDV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsuds_v8si, "__builtin_ia32_vpdpwsuds256", IX86_BUILTIN_VPDPWSUDSV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuud_v8si, "__builtin_ia32_vpdpwuud256", IX86_BUILTIN_VPDPWUUDV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuuds_v8si, "__builtin_ia32_vpdpwuuds256", IX86_BUILTIN_VPDPWUUDSV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusd_v4si, "__builtin_ia32_vpdpwusd128", IX86_BUILTIN_VPDPWUSDV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusds_v4si, "__builtin_ia32_vpdpwusds128", IX86_BUILTIN_VPDPWUSDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsud_v4si, "__builtin_ia32_vpdpwsud128", IX86_BUILTIN_VPDPWSUDV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsuds_v4si, "__builtin_ia32_vpdpwsuds128", IX86_BUILTIN_VPDPWSUDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuud_v4si, "__builtin_ia32_vpdpwuud128", IX86_BUILTIN_VPDPWUUDV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +BDESC (0, OPTION_MASK_ISA2_AVXVNNIINT16 | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuuds_v4si, "__builtin_ia32_vpdpwuuds128", IX86_BUILTIN_VPDPWUUDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) /* VPCLMULQDQ */ BDESC (OPTION_MASK_ISA_VPCLMULQDQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpclmulqdq_v2di, "__builtin_ia32_vpclmulqdq_v2di", IX86_BUILTIN_VPCLMULQDQ2, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT) @@ -3063,6 +3063,58 @@ BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpbuud_v4si_mask, "__builtin_ BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpbuud_v4si_maskz, "__builtin_ia32_vpdpbuud_v4si_maskz", IX86_BUILTIN_VPDPBUUDV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpbuuds_v4si_mask, "__builtin_ia32_vpdpbuuds_v4si_mask", IX86_BUILTIN_VPDPBUUDSV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpbuuds_v4si_maskz, "__builtin_ia32_vpdpbuuds_v4si_maskz", IX86_BUILTIN_VPDPBUUDSV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwsud_v16si, "__builtin_ia32_vpdpwsud512", IX86_BUILTIN_VPDPWSUDV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwsuds_v16si, "__builtin_ia32_vpdpwsuds512", IX86_BUILTIN_VPDPWSUDSV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwusd_v16si, "__builtin_ia32_vpdpwusd512", IX86_BUILTIN_VPDPWUSDV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, 
CODE_FOR_vpdpwusds_v16si, "__builtin_ia32_vpdpwusds512", IX86_BUILTIN_VPDPWUSDSV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwuud_v16si, "__builtin_ia32_vpdpwuud512", IX86_BUILTIN_VPDPWUUDV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwuuds_v16si, "__builtin_ia32_vpdpwuuds512", IX86_BUILTIN_VPDPWUUDSV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwsud_v16si_mask, "__builtin_ia32_vpdpwsud_v16si_mask", IX86_BUILTIN_VPDPWSUDV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwsud_v16si_maskz, "__builtin_ia32_vpdpwsud_v16si_maskz", IX86_BUILTIN_VPDPWSUDV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwsuds_v16si_mask, "__builtin_ia32_vpdpwsuds_v16si_mask", IX86_BUILTIN_VPDPWSUDSV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwsuds_v16si_maskz, "__builtin_ia32_vpdpwsuds_v16si_maskz", IX86_BUILTIN_VPDPWSUDSV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwusd_v16si_mask, "__builtin_ia32_vpdpwusd_v16si_mask", IX86_BUILTIN_VPDPWUSDV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwusd_v16si_maskz, "__builtin_ia32_vpdpwusd_v16si_maskz", IX86_BUILTIN_VPDPWUSDV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwusds_v16si_mask, "__builtin_ia32_vpdpwusds_v16si_mask", IX86_BUILTIN_VPDPWUSDSV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwusds_v16si_maskz, "__builtin_ia32_vpdpwusds_v16si_maskz", IX86_BUILTIN_VPDPWUSDSV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwuud_v16si_mask, "__builtin_ia32_vpdpwuud_v16si_mask", IX86_BUILTIN_VPDPWUUDV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwuud_v16si_maskz, "__builtin_ia32_vpdpwuud_v16si_maskz", IX86_BUILTIN_VPDPWUUDV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwuuds_v16si_mask, "__builtin_ia32_vpdpwuuds_v16si_mask", IX86_BUILTIN_VPDPWUUDSV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vpdpwuuds_v16si_maskz, "__builtin_ia32_vpdpwuuds_v16si_maskz", IX86_BUILTIN_VPDPWUUDSV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsud_v8si_mask, "__builtin_ia32_vpdpwsud_v8si_mask", IX86_BUILTIN_VPDPWSUDV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsud_v8si_maskz, "__builtin_ia32_vpdpwsud_v8si_maskz", IX86_BUILTIN_VPDPWSUDV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsuds_v8si_mask, "__builtin_ia32_vpdpwsuds_v8si_mask", IX86_BUILTIN_VPDPWSUDSV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsuds_v8si_maskz, "__builtin_ia32_vpdpwsuds_v8si_maskz", IX86_BUILTIN_VPDPWSUDSV8SI_MASKZ, UNKNOWN, (int) 
V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusd_v8si_mask, "__builtin_ia32_vpdpwusd_v8si_mask", IX86_BUILTIN_VPDPWUSDV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusd_v8si_maskz, "__builtin_ia32_vpdpwusd_v8si_maskz", IX86_BUILTIN_VPDPWUSDV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusds_v8si_mask, "__builtin_ia32_vpdpwusds_v8si_mask", IX86_BUILTIN_VPDPWUSDSV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusds_v8si_maskz, "__builtin_ia32_vpdpwusds_v8si_maskz", IX86_BUILTIN_VPDPWUSDSV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuud_v8si_mask, "__builtin_ia32_vpdpwuud_v8si_mask", IX86_BUILTIN_VPDPWUUDV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuud_v8si_maskz, "__builtin_ia32_vpdpwuud_v8si_maskz", IX86_BUILTIN_VPDPWUUDV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuuds_v8si_mask, "__builtin_ia32_vpdpwuuds_v8si_mask", IX86_BUILTIN_VPDPWUUDSV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuuds_v8si_maskz, "__builtin_ia32_vpdpwuuds_v8si_maskz", IX86_BUILTIN_VPDPWUUDSV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsud_v4si_mask, "__builtin_ia32_vpdpwsud_v4si_mask", IX86_BUILTIN_VPDPWSUDV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsud_v4si_maskz, "__builtin_ia32_vpdpwsud_v4si_maskz", IX86_BUILTIN_VPDPWSUDV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsuds_v4si_mask, "__builtin_ia32_vpdpwsuds_v4si_mask", IX86_BUILTIN_VPDPWSUDSV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwsuds_v4si_maskz, "__builtin_ia32_vpdpwsuds_v4si_maskz", IX86_BUILTIN_VPDPWSUDSV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusd_v4si_mask, "__builtin_ia32_vpdpwusd_v4si_mask", IX86_BUILTIN_VPDPWUSDV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusd_v4si_maskz, "__builtin_ia32_vpdpwusd_v4si_maskz", IX86_BUILTIN_VPDPWUSDV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusds_v4si_mask, "__builtin_ia32_vpdpwusds_v4si_mask", IX86_BUILTIN_VPDPWUSDSV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwusds_v4si_maskz, "__builtin_ia32_vpdpwusds_v4si_maskz", IX86_BUILTIN_VPDPWUSDSV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuud_v4si_mask, "__builtin_ia32_vpdpwuud_v4si_mask", IX86_BUILTIN_VPDPWUUDV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuud_v4si_maskz, "__builtin_ia32_vpdpwuud_v4si_maskz", IX86_BUILTIN_VPDPWUUDV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuuds_v4si_mask, "__builtin_ia32_vpdpwuuds_v4si_mask", IX86_BUILTIN_VPDPWUUDSV4SI_MASK, 
UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vpdpwuuds_v4si_maskz, "__builtin_ia32_vpdpwuuds_v4si_maskz", IX86_BUILTIN_VPDPWUUDSV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vdpphps_v16sf_mask, "__builtin_ia32_vdpphps512_mask", IX86_BUILTIN_VDPPHPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vdpphps_v16sf_maskz, "__builtin_ia32_vdpphps512_maskz", IX86_BUILTIN_VDPPHPS512_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vdpphps_v8sf_mask, "__builtin_ia32_vdpphps256_mask", IX86_BUILTIN_VDPPHPS256_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vdpphps_v8sf_maskz, "__builtin_ia32_vdpphps256_maskz", IX86_BUILTIN_VDPPHPS256_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vdpphps_v4sf_mask, "__builtin_ia32_vdpphps128_mask", IX86_BUILTIN_VDPPHPS128_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_vdpphps_v4sf_maskz, "__builtin_ia32_vdpphps128_maskz", IX86_BUILTIN_VDPPHPS128_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_mpsadbw, "__builtin_ia32_mpsadbw512", IX86_BUILTIN_AVX10_2_MPSADBW, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_mpsadbw_mask, "__builtin_ia32_mpsadbw512_mask", IX86_BUILTIN_VMPSADBW_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx2_mpsadbw_mask, "__builtin_ia32_mpsadbw256_mask", IX86_BUILTIN_VMPSADBW_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_sse4_1_mpsadbw_mask, "__builtin_ia32_mpsadbw128_mask", IX86_BUILTIN_VMPSADBW_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI) /* Builtins with rounding support. */ BDESC_END (ARGS, ROUND_ARGS) diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc index 130ba853125..4286eeb80e6 100644 --- a/gcc/config/i386/i386-builtins.cc +++ b/gcc/config/i386/i386-builtins.cc @@ -280,17 +280,18 @@ def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0) && (mask == 0 || (mask & ix86_isa_flags) != 0)) || ((mask & OPTION_MASK_ISA_MMX) != 0 && TARGET_MMX_WITH_SSE) - /* "Unified" builtin used by either AVXVNNI/AVXIFMA/AES/AVXVNNIINT8 - intrinsics or AVX512VNNIVL/AVX512IFMAVL/VAESVL/AVX10.2 non-mask - intrinsics should be defined whenever avxvnni/avxifma/aes/ - avxvnniint8 or avx512vnni && avx512vl/avx512ifma && avx512vl/vaes - && avx512vl/avx10.2 exist. */ + /* "Unified" builtin used by either AVXVNNI/AVXIFMA/AES/ + AVXVNNIINT{8,16} intrinsics or AVX512VNNIVL/AVX512IFMAVL/VAESVL/ + AVX10.2 non-mask intrinsics should be defined whenever avxvnni/ + avxifma/aes/avxvnniint{8,16} or avx512vnni && avx512vl/avx512ifma + && avx512vl/vaes && avx512vl/avx10.2 exist. 
*/ || (mask2 == OPTION_MASK_ISA2_AVXVNNI) || (mask2 == OPTION_MASK_ISA2_AVXIFMA) || (mask2 == (OPTION_MASK_ISA2_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16)) || ((mask2 & OPTION_MASK_ISA2_VAES) != 0) || ((mask2 & OPTION_MASK_ISA2_AVXVNNIINT8) != 0) + || ((mask2 & OPTION_MASK_ISA2_AVXVNNIINT16) != 0) || (lang_hooks.builtin_function == lang_hooks.builtin_function_ext_scope)) { diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 200b768f5d9..f1e6bc11f86 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -13299,6 +13299,7 @@ ix86_check_builtin_isa_match (unsigned int fcode, OPTION_MASK_ISA2_AVXNECONVERT OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES) OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT8 + OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT16 where for each such pair it is sufficient if either of the ISAs is enabled, plus if it is ored with other options also those others. OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */ @@ -13326,6 +13327,8 @@ ix86_check_builtin_isa_match (unsigned int fcode, OPTION_MASK_ISA2_VAES); SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT8, 0, OPTION_MASK_ISA2_AVX10_2_256); + SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT16, 0, + OPTION_MASK_ISA2_AVX10_2_256); isa = tmp_isa; isa2 = tmp_isa2; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 41d448f57cb..6f76e8f50ad 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -214,6 +214,8 @@ UNSPEC_SM4KEY4 UNSPEC_SM4RNDS4 + ;; For AVX10.2 suppport + UNSPEC_VDPPHPS ]) (define_c_enum "unspecv" [ @@ -465,6 +467,9 @@ (define_mode_iterator VF1_AVX512VL [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")]) +(define_mode_iterator VF1_AVX10_2 + [(V16SF "TARGET_AVX10_2_512") V8SF V4SF]) + (define_mode_iterator VHFBF [(V32HF "TARGET_EVEX512") V16HF V8HF (V32BF "TARGET_EVEX512") V16BF V8BF]) @@ -23555,6 +23560,31 @@ (set_attr "znver1_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "avx10_2_mpsadbw<mask_name>" + [(set (match_operand:V64QI 0 "register_operand" "=v") + (unspec:V64QI + [(match_operand:V64QI 1 "register_operand" "v") + (match_operand:V64QI 2 "vector_operand" "vm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_MPSADBW))] + "TARGET_AVX10_2_512" + "vmpsadbw\t{%3, %2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2, %3}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex")]) + +(define_insn "<mask_codefor><sse4_1_avx2>_mpsadbw<mask_name>" + [(set (match_operand:VI1 0 "register_operand" "=v") + (unspec:VI1 + [(match_operand:VI1 1 "register_operand" "v") + (match_operand:VI1 2 "vector_operand" "vm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_MPSADBW))] + "TARGET_AVX10_2_256" + "vmpsadbw\t{%3, %2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2, %3}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "<sse4_1_avx2>_packusdw<mask_name>" [(set (match_operand:VI2_AVX2_AVX512BW 0 "register_operand" "=Yr,*x,<v_Yw>") (unspec:VI2_AVX2_AVX512BW @@ -31438,13 +31468,116 @@ }) (define_insn "vpdp<vpdpwprodtype>_<mode>" - [(set (match_operand:VI4_AVX 0 "register_operand" "=x") + [(set (match_operand:VI4_AVX 0 "register_operand" "=v") (unspec:VI4_AVX [(match_operand:VI4_AVX 1 "register_operand" "0") - (match_operand:VI4_AVX 2 "register_operand" "x") - (match_operand:VI4_AVX 3 "nonimmediate_operand" "xjm")] + 
(match_operand:VI4_AVX 2 "register_operand" "v") + (match_operand:VI4_AVX 3 "nonimmediate_operand" "vm")] VPDPWPROD))] - "TARGET_AVXVNNIINT16" + "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256" + "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "prefix" "maybe_evex")]) + +(define_insn "vpdp<vpdpwprodtype>_v16si" + [(set (match_operand:V16SI 0 "register_operand" "=v") + (unspec:V16SI + [(match_operand:V16SI 1 "register_operand" "0") + (match_operand:V16SI 2 "register_operand" "v") + (match_operand:V16SI 3 "nonimmediate_operand" "vm")] + VPDPWPROD))] + "TARGET_AVX10_2_512" "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}" - [(set_attr "prefix" "vex") - (set_attr "addr" "gpr16")]) + [(set_attr "prefix" "evex")]) + +(define_insn "vpdp<vpdpwprodtype>_<mode>_mask" + [(set (match_operand:VI4_AVX10_2 0 "register_operand" "=v") + (vec_merge:VI4_AVX10_2 + (unspec:VI4_AVX10_2 + [(match_operand:VI4_AVX10_2 1 "register_operand" "0") + (match_operand:VI4_AVX10_2 2 "register_operand" "v") + (match_operand:VI4_AVX10_2 3 "nonimmediate_operand" "vm")] + VPDPWPROD) + (match_dup 1) + (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] + "TARGET_AVX10_2_256" + "vpdp<vpdpwprodtype>\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}" + [(set_attr "prefix" "evex")]) + +(define_expand "vpdp<vpdpwprodtype>_<mode>_maskz" + [(set (match_operand:VI4_AVX10_2 0 "register_operand") + (vec_merge:VI4_AVX10_2 + (unspec:VI4_AVX10_2 + [(match_operand:VI4_AVX10_2 1 "register_operand") + (match_operand:VI4_AVX10_2 2 "register_operand") + (match_operand:VI4_AVX10_2 3 "nonimmediate_operand")] + VPDPWPROD) + (match_dup 5) + (match_operand:<avx512fmaskmode> 4 "register_operand")))] + "TARGET_AVX10_2_256" + "operands[5] = CONST0_RTX (<MODE>mode);") + +(define_insn "*vpdp<vpdpwprodtype>_<mode>_maskz" + [(set (match_operand:VI4_AVX10_2 0 "register_operand" "=v") + (vec_merge:VI4_AVX10_2 + (unspec:VI4_AVX10_2 + [(match_operand:VI4_AVX10_2 1 "register_operand" "0") + (match_operand:VI4_AVX10_2 2 "register_operand" "v") + (match_operand:VI4_AVX10_2 3 "nonimmediate_operand" "vm")] + VPDPWPROD) + (match_operand:VI4_AVX10_2 5 "const0_operand" "C") + (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] + "TARGET_AVX10_2_256" + "vpdp<vpdpwprodtype>\t{%3, %2, %0%{%4%}%N5|%0%{%4%}%N5, %2, %3}" + [(set_attr "prefix" "evex")]) + +(define_insn "vdpphps_<mode>" + [(set (match_operand:VF1_AVX10_2 0 "register_operand" "=v") + (unspec:VF1_AVX10_2 + [(match_operand:VF1_AVX10_2 1 "register_operand" "0") + (match_operand:VF1_AVX10_2 2 "register_operand" "v") + (match_operand:VF1_AVX10_2 3 "nonimmediate_operand" "vm")] + UNSPEC_VDPPHPS))] + "TARGET_AVX10_2_256" + "vdpphps\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "prefix" "evex")]) + +(define_insn "vdpphps_<mode>_mask" + [(set (match_operand:VF1_AVX10_2 0 "register_operand" "=v") + (vec_merge:VF1_AVX10_2 + (unspec:VF1_AVX10_2 + [(match_operand:VF1_AVX10_2 1 "register_operand" "0") + (match_operand:VF1_AVX10_2 2 "register_operand" "v") + (match_operand:VF1_AVX10_2 3 "nonimmediate_operand" "vm")] + UNSPEC_VDPPHPS) + (match_dup 1) + (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] + "TARGET_AVX10_2_256" + "vdpphps\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}" + [(set_attr "prefix" "evex")]) + +(define_expand "vdpphps_<mode>_maskz" + [(match_operand:VF1_AVX10_2 0 "register_operand") + (match_operand:VF1_AVX10_2 1 "register_operand") + (match_operand:VF1_AVX10_2 2 "register_operand") + (match_operand:VF1_AVX10_2 3 "nonimmediate_operand") + (match_operand:<avx512fmaskmode> 4 "register_operand")] + 
"TARGET_AVX10_2_256" +{ + emit_insn (gen_vdpphps_<mode>_maskz_1 (operands[0], operands[1], + operands[2], operands[3], CONST0_RTX(<MODE>mode), operands[4])); + DONE; +}) + +(define_insn "vdpphps_<mode>_maskz_1" + [(set (match_operand:VF1_AVX10_2 0 "register_operand" "=v") + (vec_merge:VF1_AVX10_2 + (unspec:VF1_AVX10_2 + [(match_operand:VF1_AVX10_2 1 "register_operand" "0") + (match_operand:VF1_AVX10_2 2 "register_operand" "v") + (match_operand:VF1_AVX10_2 3 "nonimmediate_operand" "vm")] + UNSPEC_VDPPHPS) + (match_operand:VF1_AVX10_2 4 "const0_operand" "C") + (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] + "TARGET_AVX10_2_256" + "vdpphps\t{%3, %2, %0%{%5%}%N4|%0%{%5%}%N4, %2, %3}" + [(set_attr "prefix" "evex")]) diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c index f64d0c88264..5fc84234b57 100644 --- a/gcc/testsuite/gcc.target/i386/avx-1.c +++ b/gcc/testsuite/gcc.target/i386/avx-1.c @@ -1002,6 +1002,14 @@ #define __builtin_ia32_subph256_mask_round(A, B, C, D, E) __builtin_ia32_subph256_mask_round(A, B, C, D, 8) #define __builtin_ia32_subps256_mask_round(A, B, C, D, E) __builtin_ia32_subps256_mask_round(A, B, C, D, 8) +/* avx10_2-512mediaintrin.h */ +#define __builtin_ia32_mpsadbw512(A, B, C) __builtin_ia32_mpsadbw512 (A, B, 1) +#define __builtin_ia32_mpsadbw512_mask(A, B, C, D, E) __builtin_ia32_mpsadbw512_mask (A, B, 1, D, E) + +/* avx10_2mediaintrin.h */ +#define __builtin_ia32_mpsadbw128_mask(A, B, C, D, E) __builtin_ia32_mpsadbw128_mask (A, B, 1, D, E) +#define __builtin_ia32_mpsadbw256_mask(A, B, C, D, E) __builtin_ia32_mpsadbw256_mask (A, B, 1, D, E) + #include <wmmintrin.h> #include <immintrin.h> #include <mm3dnow.h> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-media-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-media-1.c index d4145c41a99..00df32194e5 100644 --- a/gcc/testsuite/gcc.target/i386/avx10_2-512-media-1.c +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-media-1.c @@ -18,11 +18,39 @@ /* { dg-final { scan-assembler-times "vpdpbuuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vpdpbuuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vpdpbuuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ 
\\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmpsadbw\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmpsadbw\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmpsadbw\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\\n\\r]*%zmm\[0-9\]+\[^\\n\\r\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ + #include <immintrin.h> +volatile __m512 a; +volatile __m512h b,c; volatile __m512i x,y,z,z1; volatile __mmask16 m16; +volatile __mmask32 m32; void avx10_2_512_test (void) { @@ -49,4 +77,36 @@ void avx10_2_512_test (void) x = _mm512_dpbuuds_epi32 (x, y, z); x = _mm512_mask_dpbuuds_epi32 (x, m16, y, z); x = _mm512_maskz_dpbuuds_epi32 (m16, x, y, z); + + x = _mm512_dpwsud_epi32 (x, y, z); + x = _mm512_mask_dpwsud_epi32 (x, m16, y, z); + x = _mm512_maskz_dpwsud_epi32 (m16, x, y, z); + + x = _mm512_dpwsuds_epi32 (x, y, z); + x = _mm512_mask_dpwsuds_epi32 (x, m16, y, z); + x = _mm512_maskz_dpwsuds_epi32 (m16, x, y, z); + + x = _mm512_dpwusd_epi32 (x, y, z); + x = 
_mm512_mask_dpwusd_epi32 (x, m16, y, z); + x = _mm512_maskz_dpwusd_epi32 (m16, x, y, z); + + x = _mm512_dpwusds_epi32 (x, y, z); + x = _mm512_mask_dpwusds_epi32 (x, m16, y, z); + x = _mm512_maskz_dpwusds_epi32 (m16, x, y, z); + + x = _mm512_dpwuud_epi32 (x, y, z); + x = _mm512_mask_dpwuud_epi32 (x, m16, y, z); + x = _mm512_maskz_dpwuud_epi32 (m16, x, y, z); + + x = _mm512_dpwuuds_epi32 (x, y, z); + x = _mm512_mask_dpwuuds_epi32 (x, m16, y, z); + x = _mm512_maskz_dpwuuds_epi32 (m16, x, y, z); + + a = _mm512_dpph_ps (a, b, c); + a = _mm512_mask_dpph_ps (a, m16, b, c); + a = _mm512_maskz_dpph_ps (m16, a, b, c); + + x = _mm512_mpsadbw_epu8 (x, y, 1); + x = _mm512_mask_mpsadbw_epu8 (x, m32, y, z, 1); + x = _mm512_maskz_mpsadbw_epu8 (m32, x, y, 1); } diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vdpphps-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vdpphps-2.c new file mode 100644 index 00000000000..9b73a298fb9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vdpphps-2.c @@ -0,0 +1,71 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif + +#include "avx10-helper.h" + +#define SRC_SIZE AVX512F_LEN / 16 +#define SIZE AVX512F_LEN / 32 + +static void +CALC (float *dest, _Float16 *src1, _Float16 *src2) +{ + int i; + + for (i = 0; i < SIZE; i++) + { + dest[i] += (float) src1[2 * i + 1] * (float) src2[2 * i + 1]; + dest[i] += (float) src1[2 * i] * (float) src2[2 * i]; + } +} + +void +TEST(void) +{ + UNION_TYPE (AVX512F_LEN, h) src1, src2; + UNION_TYPE (AVX512F_LEN,) res1, res2, res3; + MASK_TYPE mask = MASK_VALUE; + float res_ref[SIZE], res_ref2[SIZE], res_ref3[SIZE]; + + for (int i = 0; i < SRC_SIZE; i++) + { + src1.a[i] = (_Float16) (i * 4) + 1.25f16; + src2.a[i] = (_Float16) (i * 2) + 2.5f16; + } + + for (int i = 0; i < SIZE; i++) + { + res1.a[i] = 3.125f + 2 * i; + res_ref[i] = 3.125f + 2 * i; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + res_ref2[i] = DEFAULT_VALUE; + res_ref3[i] = DEFAULT_VALUE; + } + + res1.x = INTRINSIC (_dpph_ps) (res1.x, src1.x, src2.x); + res2.x = INTRINSIC (_mask_dpph_ps) (res2.x, mask, src1.x, src2.x); + res3.x = INTRINSIC (_maskz_dpph_ps) (mask, res3.x, src1.x, src2.x); + + CALC(res_ref, src1.a, src2.a); + CALC(res_ref2, src1.a, src2.a); + CALC(res_ref3, src1.a, src2.a); + + if (UNION_CHECK(AVX512F_LEN,) (res1, res_ref)) + abort (); + + MASK_MERGE () (res_ref2, mask, SIZE); + if (UNION_CHECK(AVX512F_LEN,) (res2, res_ref2)) + abort (); + + MASK_ZERO () (res_ref3, mask, SIZE); + if (UNION_CHECK(AVX512F_LEN,) (res3, res_ref3)) + abort (); +} + diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vmpsadbw-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vmpsadbw-2.c new file mode 100644 index 00000000000..3cedab490fa --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vmpsadbw-2.c @@ -0,0 +1,93 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" + +#define SIZE (AVX512F_LEN / 8) +#define SIZE_RES (AVX512F_LEN / 16) + + +static void +CALC (short* dst, char* src1, char* src2, int cont) +{ + int blk2_pos, blk1_pos, i, j, k, c; + char blk1[12], blk2[4], x; + short tmp[4], s; + + for (k = 0; k < AVX512F_LEN / 128; k++) + { + c = cont & 0xff; + if (k % 2 == 1) + c >>= 3; + blk2_pos = (c & 3) * 4; + blk1_pos = 
((c >> 2) & 1) * 4; + + for (i = 0; i < 11; i++) + blk1[i] = src1[16 * k + i + blk1_pos]; + + for (i = 0; i < 4; i++) + blk2[i] = src2[16 * k + i + blk2_pos]; + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 4; j++) + { + x = blk1[j + i] - blk2[j]; + tmp[j] = x > 0 ? x : -x; + } + + s = 0; + for (j = 0; j < 4; j++) + s += tmp[j]; + dst[8 * k + i] = s; + } + } +} + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, i_w) res1, res2, res3; + UNION_TYPE (AVX512F_LEN, i_b) src1; + UNION_TYPE (AVX512F_LEN, i_b) src2; + MASK_TYPE mask = MASK_VALUE; + short res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE; i++) + { + src1.a[i] = 10 + 2 * i; + src2.a[i] = 3 * i; + } + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0x7FFF; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + } + + CALC (res_ref, src1.a, src2.a, 0x21); + CALC (res_ref2, src1.a, src2.a, 0x21); + + res1.x = INTRINSIC (_mpsadbw_epu8) (src1.x, src2.x, 0x21); + res2.x = INTRINSIC (_mask_mpsadbw_epu8) (res2.x, mask, src1.x, src2.x, 0x21); + res3.x = INTRINSIC (_maskz_mpsadbw_epu8) (mask, src1.x, src2.x, 0x21); + + if (UNION_CHECK (AVX512F_LEN, i_w) (res1, res_ref)) + abort (); + + MASK_MERGE (i_w) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_w) (res2, res_ref2)) + abort (); + + MASK_ZERO (i_w) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_w) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwsud-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwsud-2.c new file mode 100644 index 00000000000..1643f6f0803 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwsud-2.c @@ -0,0 +1,71 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif + +#include "avx10-helper.h" + +#define SIZE (AVX512F_LEN / 16) +#define SIZE_RES (AVX512F_LEN / 32) + + +static void +CALC (int *r, int *dst, short *s1, unsigned short *s2) +{ + int tempres[SIZE]; + for (int i = 0; i < SIZE; i++) + tempres[i] = (int) s1[i] * (unsigned int) s2[i]; + for (int i = 0; i < SIZE_RES; i++) + { + long long test = (long long) dst[i] + tempres[i * 2] + tempres[i * 2 + 1]; + r[i] = test; + } +} + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, i_d) res1, res2, res3; + UNION_TYPE (AVX512F_LEN, i_w) src1; + UNION_TYPE (AVX512F_LEN, i_uw) src2; + MASK_TYPE mask = MASK_VALUE; + int res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE; i++) + { + int sign = i % 2 ? 
1 : -1; + src1.a[i] = sign * (10 + 3 * i * i); + src2.a[i] = sign * 10 * i * i; + } + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0x7FFFFFFF; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + } + + CALC (res_ref, res1.a, src1.a, src2.a); + CALC (res_ref2, res2.a, src1.a, src2.a); + + res1.x = INTRINSIC (_dpwsud_epi32) (res1.x, src1.x, src2.x); + res2.x = INTRINSIC (_mask_dpwsud_epi32) (res2.x, mask, src1.x, src2.x); + res3.x = INTRINSIC (_maskz_dpwsud_epi32) (mask, res3.x, src1.x, src2.x); + + if (UNION_CHECK (AVX512F_LEN, i_d) (res1, res_ref)) + abort (); + + MASK_MERGE (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res2, res_ref2)) + abort (); + + MASK_ZERO (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwsuds-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwsuds-2.c new file mode 100644 index 00000000000..7c959119a2a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwsuds-2.c @@ -0,0 +1,74 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif + +#include "avx10-helper.h" + +#define SIZE (AVX512F_LEN / 16) +#define SIZE_RES (AVX512F_LEN / 32) + + +static void +CALC (int *r, int *dst, short *s1, unsigned short *s2) +{ + int tempres[SIZE]; + for (int i = 0; i < SIZE; i++) + tempres[i] = (int) s1[i] * (unsigned int) s2[i]; + for (int i = 0; i < SIZE_RES; i++) + { + long long test = (long long) dst[i] + tempres[i * 2] + tempres[i * 2 + 1]; + long long max_int = 0x7FFFFFFF; + if (test > max_int) + test = max_int; + r[i] = test; + } +} + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, i_d) res1, res2, res3; + UNION_TYPE (AVX512F_LEN, i_w) src1; + UNION_TYPE (AVX512F_LEN, i_uw) src2; + MASK_TYPE mask = MASK_VALUE; + int res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE; i++) + { + int sign = i % 2 ? 
1 : -1; + src1.a[i] = sign * (10 + 3 * i * i); + src2.a[i] = sign * 10 * i * i; + } + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0x7FFFFFFF; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + } + + CALC (res_ref, res1.a, src1.a, src2.a); + CALC (res_ref2, res2.a, src1.a, src2.a); + + res1.x = INTRINSIC (_dpwsuds_epi32) (res1.x, src1.x, src2.x); + res2.x = INTRINSIC (_mask_dpwsuds_epi32) (res2.x, mask, src1.x, src2.x); + res3.x = INTRINSIC (_maskz_dpwsuds_epi32) (mask, res3.x, src1.x, src2.x); + + if (UNION_CHECK (AVX512F_LEN, i_d) (res1, res_ref)) + abort (); + + MASK_MERGE (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res2, res_ref2)) + abort (); + + MASK_ZERO (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwusd-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwusd-2.c new file mode 100644 index 00000000000..b780e41bfba --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwusd-2.c @@ -0,0 +1,71 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif + +#include "avx10-helper.h" + +#define SIZE (AVX512F_LEN / 16) +#define SIZE_RES (AVX512F_LEN / 32) + + +static void +CALC (int *r, int *dst, unsigned short *s1, short *s2) +{ + int tempres[SIZE]; + for (int i = 0; i < SIZE; i++) + tempres[i] = (unsigned int) s1[i] * (int) s2[i]; + for (int i = 0; i < SIZE_RES; i++) + { + long long test = (long long) dst[i] + tempres[i * 2] + tempres[i * 2 + 1]; + r[i] = test; + } +} + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, i_d) res1, res2, res3; + UNION_TYPE (AVX512F_LEN, i_uw) src1; + UNION_TYPE (AVX512F_LEN, i_w) src2; + MASK_TYPE mask = MASK_VALUE; + int res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE; i++) + { + int sign = i % 2 ? 
1 : -1; + src1.a[i] = sign * 10 * i * i; + src2.a[i] = 10 + 3 * i * i + sign; + } + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0x7FFFFFFF; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + } + + CALC (res_ref, res1.a, src1.a, src2.a); + CALC (res_ref2, res2.a, src1.a, src2.a); + + res1.x = INTRINSIC (_dpwusd_epi32) (res1.x, src1.x, src2.x); + res2.x = INTRINSIC (_mask_dpwusd_epi32) (res2.x, mask, src1.x, src2.x); + res3.x = INTRINSIC (_maskz_dpwusd_epi32) (mask, res3.x, src1.x, src2.x); + + if (UNION_CHECK (AVX512F_LEN, i_d) (res1, res_ref)) + abort (); + + MASK_MERGE (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res2, res_ref2)) + abort (); + + MASK_ZERO (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwusds-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwusds-2.c new file mode 100644 index 00000000000..922d4b37ab8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwusds-2.c @@ -0,0 +1,74 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif + +#include "avx10-helper.h" + +#define SIZE (AVX512F_LEN / 16) +#define SIZE_RES (AVX512F_LEN / 32) + + +static void +CALC (int *r, int *dst, unsigned short *s1, short *s2) +{ + int tempres[SIZE]; + for (int i = 0; i < SIZE; i++) + tempres[i] = (unsigned int) s1[i] * (int) s2[i]; + for (int i = 0; i < SIZE_RES; i++) + { + long long test = (long long) dst[i] + tempres[i * 2] + tempres[i * 2 + 1]; + long long max_int = 0x7FFFFFFF; + if (test > max_int) + test = max_int; + r[i] = test; + } +} + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, i_d) res1, res2, res3; + UNION_TYPE (AVX512F_LEN, i_uw) src1; + UNION_TYPE (AVX512F_LEN, i_w) src2; + MASK_TYPE mask = MASK_VALUE; + int res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE; i++) + { + int sign = i % 2 ? 
1 : -1; + src1.a[i] = sign * 10 * i * i; + src2.a[i] = 10 + 3 * i * i + sign; + } + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0x7FFFFFFF; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + } + + CALC (res_ref, res1.a, src1.a, src2.a); + CALC (res_ref2, res2.a, src1.a, src2.a); + + res1.x = INTRINSIC (_dpwusds_epi32) (res1.x, src1.x, src2.x); + res2.x = INTRINSIC (_mask_dpwusds_epi32) (res2.x, mask, src1.x, src2.x); + res3.x = INTRINSIC (_maskz_dpwusds_epi32) (mask, res3.x, src1.x, src2.x); + + if (UNION_CHECK (AVX512F_LEN, i_d) (res1, res_ref)) + abort (); + + MASK_MERGE (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res2, res_ref2)) + abort (); + + MASK_ZERO (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwuud-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwuud-2.c new file mode 100644 index 00000000000..d9f5dba8dff --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwuud-2.c @@ -0,0 +1,70 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif + +#include "avx10-helper.h" + +#define SIZE (AVX512F_LEN / 16) +#define SIZE_RES (AVX512F_LEN / 32) + + +static void +CALC (int *r, int *dst, unsigned short *s1, unsigned short *s2) +{ + unsigned int tempres[SIZE]; + for (int i = 0; i < SIZE; i++) + tempres[i] = (unsigned int) s1[i] * (unsigned int) s2[i]; + for (int i = 0; i < SIZE_RES; i++) + { + long long test = (long long) dst[i] + tempres[i * 2] + tempres[i * 2 + 1]; + r[i] = test; + } +} + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, i_d) res1, res2, res3; + UNION_TYPE (AVX512F_LEN, i_uw) src1; + UNION_TYPE (AVX512F_LEN, i_uw) src2; + MASK_TYPE mask = MASK_VALUE; + int res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE; i++) + { + src1.a[i] = 10 + 3 * i * i; + src2.a[i] = 10 * i * i; + } + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0x7FFFFFFF; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + } + + CALC (res_ref, res1.a, src1.a, src2.a); + CALC (res_ref2, res2.a, src1.a, src2.a); + + res1.x = INTRINSIC (_dpwuud_epi32) (res1.x, src1.x, src2.x); + res2.x = INTRINSIC (_mask_dpwuud_epi32) (res2.x, mask, src1.x, src2.x); + res3.x = INTRINSIC (_maskz_dpwuud_epi32) (mask, res3.x, src1.x, src2.x); + + if (UNION_CHECK (AVX512F_LEN, i_d) (res1, res_ref)) + abort (); + + MASK_MERGE (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res2, res_ref2)) + abort (); + + MASK_ZERO (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwuuds-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwuuds-2.c new file mode 100644 index 00000000000..da3c82bd4cc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vpdpwuuds-2.c @@ -0,0 +1,73 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif + +#include "avx10-helper.h" + +#define SIZE (AVX512F_LEN / 16) +#define SIZE_RES (AVX512F_LEN / 32) + + +static void +CALC (int *r, int *dst, unsigned short *s1, unsigned short *s2) +{ + unsigned int tempres[SIZE]; + for (int i = 0; i < SIZE; i++) + tempres[i] = (unsigned int) s1[i] 
* (unsigned int) s2[i]; + for (int i = 0; i < SIZE_RES; i++) + { + long long test = (long long) dst[i] + tempres[i * 2] + tempres[i * 2 + 1]; + long long max_uint = 0xFFFFFFFF; + if (test > max_uint) + test = max_uint; + r[i] = test; + } +} + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, i_d) res1, res2, res3; + UNION_TYPE (AVX512F_LEN, i_uw) src1; + UNION_TYPE (AVX512F_LEN, i_uw) src2; + MASK_TYPE mask = MASK_VALUE; + int res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE; i++) + { + src1.a[i] = 10 + 3 * i * i; + src2.a[i] = 10 * i * i; + } + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0x7FFFFFFF; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + } + + CALC (res_ref, res1.a, src1.a, src2.a); + CALC (res_ref2, res2.a, src1.a, src2.a); + + res1.x = INTRINSIC (_dpwuuds_epi32) (res1.x, src1.x, src2.x); + res2.x = INTRINSIC (_mask_dpwuuds_epi32) (res2.x, mask, src1.x, src2.x); + res3.x = INTRINSIC (_maskz_dpwuuds_epi32) (mask, res3.x, src1.x, src2.x); + + if (UNION_CHECK (AVX512F_LEN, i_d) (res1, res_ref)) + abort (); + + MASK_MERGE (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res2, res_ref2)) + abort (); + + MASK_ZERO (i_d) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, i_d) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-builtin-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-builtin-2.c new file mode 100644 index 00000000000..521768e92b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-builtin-2.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx10.2 -mno-avxvnniint16" } */ +typedef int v8si __attribute__ ((vector_size (32))); +v8si +foo (v8si a, v8si b, v8si c) +{ + return __builtin_ia32_vpdpwsud256 (a, b, c); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-media-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-media-1.c index c2b3e5527d9..1be3605b81c 100644 --- a/gcc/testsuite/gcc.target/i386/avx10_2-media-1.c +++ b/gcc/testsuite/gcc.target/i386/avx10_2-media-1.c @@ -36,11 +36,62 @@ /* { dg-final { scan-assembler-times "vpdpbuuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vpdpbuuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vpdpbuuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times 
"vpdpwsuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times 
"vpdpwuud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpphps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmpsadbw\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmpsadbw\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\\n\\r]*%ymm\[0-9\]+\[^\\n\\r\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmpsadbw\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmpsadbw\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\\n\\r]*%xmm\[0-9\]+\[^\\n\\r\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include <immintrin.h> +volatile __m256 a; +volatile __m256h b,c; volatile __m256i x,y,z; +volatile __m128 a_; +volatile __m128h b_,c_; volatile __m128i x_,y_,z_; +volatile __mmask16 m16; volatile __mmask8 m; void extern @@ -93,4 +144,65 @@ avx10_2_test (void) x_ = _mm_dpbuuds_epi32 (x_, y_, z_); x_ = _mm_mask_dpbuuds_epi32 (x_, m, y_, z_); x_ = _mm_maskz_dpbuuds_epi32 (m, x_, y_, z_); + + x = _mm256_dpwsud_epi32 (x, y, z); + x = _mm256_mask_dpwsud_epi32 (x, m, y, z); + x = _mm256_maskz_dpwsud_epi32 (m, x, y, z); + + x_ = _mm_dpwsud_epi32 (x_, y_, z_); + x_ = _mm_mask_dpwsud_epi32 (x_, m, y_, z_); + x_ = _mm_maskz_dpwsud_epi32 (m, 
x_, y_, z_); + + x = _mm256_dpwsuds_epi32 (x, y, z); + x = _mm256_mask_dpwsuds_epi32 (x, m, y, z); + x = _mm256_maskz_dpwsuds_epi32 (m, x, y, z); + + x_ = _mm_dpwsuds_epi32 (x_, y_, z_); + x_ = _mm_mask_dpwsuds_epi32 (x_, m, y_, z_); + x_ = _mm_maskz_dpwsuds_epi32 (m, x_, y_, z_); + + x = _mm256_dpwusd_epi32 (x, y, z); + x = _mm256_mask_dpwusd_epi32 (x, m, y, z); + x = _mm256_maskz_dpwusd_epi32 (m, x, y, z); + + x_ = _mm_dpwusd_epi32 (x_, y_, z_); + x_ = _mm_mask_dpwusd_epi32 (x_, m, y_, z_); + x_ = _mm_maskz_dpwusd_epi32 (m, x_, y_, z_); + + x = _mm256_dpwusds_epi32 (x, y, z); + x = _mm256_mask_dpwusds_epi32 (x, m, y, z); + x = _mm256_maskz_dpwusds_epi32 (m, x, y, z); + + x_ = _mm_dpwusds_epi32 (x_, y_, z_); + x_ = _mm_mask_dpwusds_epi32 (x_, m, y_, z_); + x_ = _mm_maskz_dpwusds_epi32 (m, x_, y_, z_); + + x = _mm256_dpwuud_epi32 (x, y, z); + x = _mm256_mask_dpwuud_epi32 (x, m, y, z); + x = _mm256_maskz_dpwuud_epi32 (m, x, y, z); + + x_ = _mm_dpwuud_epi32 (x_, y_, z_); + x_ = _mm_mask_dpwuud_epi32 (x_, m, y_, z_); + x_ = _mm_maskz_dpwuud_epi32 (m, x_, y_, z_); + + x = _mm256_dpwuuds_epi32 (x, y, z); + x = _mm256_mask_dpwuuds_epi32 (x, m, y, z); + x = _mm256_maskz_dpwuuds_epi32 (m, x, y, z); + + x_ = _mm_dpwuuds_epi32 (x_, y_, z_); + x_ = _mm_mask_dpwuuds_epi32 (x_, m, y_, z_); + x_ = _mm_maskz_dpwuuds_epi32 (m, x_, y_, z_); + + a = _mm256_dpph_ps (a, b, c); + a = _mm256_mask_dpph_ps (a, m, b, c); + a = _mm256_maskz_dpph_ps (m, a, b, c); + + a_ = _mm_dpph_ps (a_, b_, c_); + a_ = _mm_mask_dpph_ps (a_, m, b_, c_); + a_ = _mm_maskz_dpph_ps (m, a_, b_, c_); + + x = _mm256_mask_mpsadbw_epu8 (x, m16, y, z, 1); + x = _mm256_maskz_mpsadbw_epu8 (m16, x, y, 1); + x_ = _mm_mask_mpsadbw_epu8 (x_, m, y_, z_, 1); + x_ = _mm_maskz_mpsadbw_epu8 (m, x_, y_, 1); } diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vdpphps-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vdpphps-2.c new file mode 100644 index 00000000000..26d98b70590 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vdpphps-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vdpphps-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vdpphps-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmpsadbw-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vmpsadbw-2.c new file mode 100644 index 00000000000..746ea7baacb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmpsadbw-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vmpsadbw-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vmpsadbw-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsud-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsud-2.c new file mode 100644 index 00000000000..e1c7a81b54f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsud-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwsud-2.c" + +#undef AVX512F_LEN 
+#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwsud-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsuds-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsuds-2.c new file mode 100644 index 00000000000..d046fd8747a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsuds-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwsuds-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwsuds-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusd-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusd-2.c new file mode 100644 index 00000000000..5a8af9b8728 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusd-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwusd-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwusd-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusds-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusds-2.c new file mode 100644 index 00000000000..88d877f381a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusds-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwusds-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwusds-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuud-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuud-2.c new file mode 100644 index 00000000000..aaefe02d29d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuud-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwuud-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwuud-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuuds-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuuds-2.c new file mode 100644 index 00000000000..6a61112e161 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuuds-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwuuds-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vpdpwuuds-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avxvnniint16-1.c b/gcc/testsuite/gcc.target/i386/avxvnniint16-1.c index 6ae57b150fe..5a093c97351 100644 --- a/gcc/testsuite/gcc.target/i386/avxvnniint16-1.c +++ b/gcc/testsuite/gcc.target/i386/avxvnniint16-1.c @@ 
-1,17 +1,17 @@ /* { dg-do compile } */ /* { dg-options "-mavxvnniint16 -O2" } */ -/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwusd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwusds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwsud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwsuds\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwuud\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vpdpwuuds\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ #include <immintrin.h> @@ -40,4 +40,22 @@ avxvnniint16_test (void) x = _mm256_dpwuuds_avx_epi32 (x, y, z); x_ = _mm_dpwuuds_avx_epi32 (x_, y_, z_); + + x = _mm256_dpwusd_epi32 (x, y, z); + x_ = _mm_dpwusd_epi32 (x_, y_, z_); + + x = _mm256_dpwusds_epi32 (x, y, z); + x_ = _mm_dpwusds_epi32 (x_, y_, z_); + + x = _mm256_dpwsud_epi32 (x, y, z); + x_ = _mm_dpwsud_epi32 (x_, y_, z_); + + x = _mm256_dpwsuds_epi32 (x, y, z); + x_ = _mm_dpwsuds_epi32 (x_, y_, z_); + + x = _mm256_dpwuud_epi32 (x, y, z); + x_ = _mm_dpwuud_epi32 (x_, y_, z_); + + x = _mm256_dpwuuds_epi32 (x, y, z); + x_ = _mm_dpwuuds_epi32 (x_, y_, z_); } diff --git a/gcc/testsuite/gcc.target/i386/avxvnniint16-builtin.c b/gcc/testsuite/gcc.target/i386/avxvnniint16-builtin.c new file mode 100644 index 00000000000..10e9b643920 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avxvnniint16-builtin.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavxvnniint16 -mno-avx10.2" } */ +typedef int v8si __attribute__ ((vector_size (32))); +v8si +foo (v8si a, v8si b, v8si c) +{ + return __builtin_ia32_vpdpwsud256 (a, b, c); +} diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c index a5b1775ed2d..6b1c9e545f0 100644 --- a/gcc/testsuite/gcc.target/i386/sse-13.c +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -1010,4 +1010,12 @@ #define __builtin_ia32_subph256_mask_round(A, B, C, D, E) __builtin_ia32_subph256_mask_round(A, B, C, D, 8) #define __builtin_ia32_subps256_mask_round(A, B, C, D, E) __builtin_ia32_subps256_mask_round(A, B, C, D, 8) +/* avx10_2-512mediaintrin.h */ +#define __builtin_ia32_mpsadbw512(A, B, C) __builtin_ia32_mpsadbw512 (A, B, 1) +#define __builtin_ia32_mpsadbw512_mask(A, B, C, D, E) __builtin_ia32_mpsadbw512_mask (A, B, 1, D, E) + +/* avx10_2mediaintrin.h */ +#define __builtin_ia32_mpsadbw128_mask(A, B, C, D, E) __builtin_ia32_mpsadbw128_mask (A, B, 1, D, E) +#define __builtin_ia32_mpsadbw256_mask(A, B, C, D, E) __builtin_ia32_mpsadbw256_mask (A, B, 1, D, E) + #include <x86intrin.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c index 4736b2a5d52..6dfdaa96c76 100644 --- a/gcc/testsuite/gcc.target/i386/sse-14.c +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -1371,3 +1371,14 @@ test_4x (_mm256_mask_fixupimm_round_pd, __m256d, __m256d, __mmask8, __m256d, __m test_4x (_mm256_mask_fixupimm_round_ps, __m256, __m256, __mmask8, __m256, __m256i, 3, 8) test_4x (_mm256_mask_range_round_pd, __m256d, __m256d, __mmask8, __m256d, __m256d, 15, 8) test_4x (_mm256_mask_range_round_ps, __m256, __m256, __mmask8, __m256, __m256, 15, 8) + +/* avx10_2-512mediaintrin.h */ +test_2 (_mm512_mpsadbw_epu8, __m512i, __m512i, __m512i, 1) +test_3 (_mm512_maskz_mpsadbw_epu8, __m512i, __mmask32, __m512i, __m512i, 1) +test_4 (_mm512_mask_mpsadbw_epu8, __m512i, __m512i, __mmask32, __m512i, __m512i, 1) + +/* avx10_2mediaintrin.h */ +test_3 (_mm_maskz_mpsadbw_epu8, __m128i, __mmask8, __m128i, __m128i, 1) +test_3 (_mm256_maskz_mpsadbw_epu8, __m256i, __mmask16, __m256i, __m256i, 1) +test_4 (_mm_mask_mpsadbw_epu8, __m128i, __m128i, __mmask8, __m128i, __m128i, 1) +test_4 (_mm256_mask_mpsadbw_epu8, __m256i, __m256i, __mmask16, __m256i, __m256i, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c index 5bfccd52630..102b6b878c8 100644 --- a/gcc/testsuite/gcc.target/i386/sse-22.c +++ b/gcc/testsuite/gcc.target/i386/sse-22.c @@ 
-1410,3 +1410,14 @@ test_4x (_mm256_mask_fixupimm_round_pd, __m256d, __m256d, __mmask8, __m256d, __m test_4x (_mm256_mask_fixupimm_round_ps, __m256, __m256, __mmask8, __m256, __m256i, 3, 8) test_4x (_mm256_mask_range_round_pd, __m256d, __m256d, __mmask8, __m256d, __m256d, 15, 8) test_4x (_mm256_mask_range_round_ps, __m256, __m256, __mmask8, __m256, __m256, 15, 8) + +/* avx10_2-512mediaintrin.h */ +test_2 (_mm512_mpsadbw_epu8, __m512i, __m512i, __m512i, 1) +test_3 (_mm512_maskz_mpsadbw_epu8, __m512i, __mmask32, __m512i, __m512i, 1) +test_4 (_mm512_mask_mpsadbw_epu8, __m512i, __m512i, __mmask32, __m512i, __m512i, 1) + +/* avx10_2mediaintrin.h */ +test_3 (_mm_maskz_mpsadbw_epu8, __m128i, __mmask8, __m128i, __m128i, 1) +test_3 (_mm256_maskz_mpsadbw_epu8, __m256i, __mmask16, __m256i, __m256i, 1) +test_4 (_mm_mask_mpsadbw_epu8, __m128i, __m128i, __mmask8, __m128i, __m128i, 1) +test_4 (_mm256_mask_mpsadbw_epu8, __m256i, __m256i, __mmask16, __m256i, __m256i, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c index e63c100f452..962b9507283 100644 --- a/gcc/testsuite/gcc.target/i386/sse-23.c +++ b/gcc/testsuite/gcc.target/i386/sse-23.c @@ -984,6 +984,14 @@ #define __builtin_ia32_subph256_mask_round(A, B, C, D, E) __builtin_ia32_subph256_mask_round(A, B, C, D, 8) #define __builtin_ia32_subps256_mask_round(A, B, C, D, E) __builtin_ia32_subps256_mask_round(A, B, C, D, 8) +/* avx10_2-512mediaintrin.h */ +#define __builtin_ia32_mpsadbw512(A, B, C) __builtin_ia32_mpsadbw512 (A, B, 1) +#define __builtin_ia32_mpsadbw512_mask(A, B, C, D, E) __builtin_ia32_mpsadbw512_mask (A, B, 1, D, E) + +/* avx10_2-mediaintrin.h */ +#define __builtin_ia32_mpsadbw128_mask(A, B, C, D, E) __builtin_ia32_mpsadbw128_mask (A, B, 1, D, E) +#define __builtin_ia32_mpsadbw256_mask(A, B, C, D, E) __builtin_ia32_mpsadbw256_mask (A, B, 1, D, E) + #pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512") #include <x86intrin.h> -- 2.43.5
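For reference, below is a minimal standalone sketch of the per-lane arithmetic that the new vdpphps run tests (avx10_2-512-vdpphps-2.c and the 256/128-bit wrappers) verify: each 32-bit float accumulator adds the products of one pair of adjacent _Float16 elements, mirroring the CALC helper in the test.  This is not part of the patch; it assumes a GCC target with _Float16 support, and the 8-lane width is only illustrative.

/* Scalar reference for the vdpphps semantics checked by the new tests.
   Assumes GCC with _Float16 support; NELTS is an illustrative width.  */
#include <stdio.h>

#define NELTS 8

static void
dpph_ps_ref (float *dst, const _Float16 *a, const _Float16 *b)
{
  /* Each float lane accumulates the dot product of one adjacent
     _Float16 pair, in the same order as the test's CALC helper.  */
  for (int i = 0; i < NELTS; i++)
    {
      dst[i] += (float) a[2 * i + 1] * (float) b[2 * i + 1];
      dst[i] += (float) a[2 * i] * (float) b[2 * i];
    }
}

int
main (void)
{
  _Float16 a[2 * NELTS], b[2 * NELTS];
  float acc[NELTS];
  int i;

  for (i = 0; i < 2 * NELTS; i++)
    {
      a[i] = (_Float16) (i * 4) + 1.25f16;
      b[i] = (_Float16) (i * 2) + 2.5f16;
    }
  for (i = 0; i < NELTS; i++)
    acc[i] = 3.125f + 2 * i;

  dpph_ps_ref (acc, a, b);

  for (i = 0; i < NELTS; i++)
    printf ("%f\n", (double) acc[i]);
  return 0;
}

The masked and zero-masked intrinsic forms exercised in the tests apply MASK_MERGE/MASK_ZERO on top of the same per-lane computation.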