Do you have any advice? BRs, Lin
-----Original Message----- From: Hu, Lin1 <lin1...@intel.com> Sent: Wednesday, May 8, 2024 9:38 AM To: gcc-patches@gcc.gnu.org Cc: Liu, Hongtao <hongtao....@intel.com>; ubiz...@gmail.com Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. Hi, all This patch aims to optimize __builtin_convertvector. We want the function can generate more efficient insn for some situations. Like v2si -> v2di. The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk? BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. (expand_vector_conversion_no_vec_pack): Check if can convert int <-> int, float <-> float and int <-> float, directly. Support indirect convert, when direct optab is not supported. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. --- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++++++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++++++++++++++ gcc/tree-vect-generic.cc | 107 +++++++++- 8 files changed, 918 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2di)a, __v2si); } + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); } + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); } + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); } + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); } + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); } + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); } + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); } + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); } + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); } + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); } + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); } + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); } + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); } + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); } + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); } + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); } + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); } + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); } + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); } + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); } + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2du)a, __v2su); } + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); } + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); } + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); } + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4hu); } + +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); } + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); } + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); } + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); } + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); } + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); } + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); } + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); } + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); } + +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); } + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); } + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); } + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); } + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); } + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); } + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); } + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); } + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) { + return __builtin_convertvector(a, __v2di); } + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); } + +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); } + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) { + return (__m128i)__builtin_convertvector(a, __v4si); } + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); } + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); } + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) { + return (__m128i)__builtin_convertvector(a, __v4si); } + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); } + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); } + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) { + return (__m128i)__builtin_convertvector(a, __v8hi); } + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); } + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2sf); } + +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); } + +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); } + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2hf); } + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); } + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); } + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) { + return __builtin_convertvector(a, __v4hf); } + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); } + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) { + return __builtin_convertvector(a, __v2df); } + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); } + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); } + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) { + return __builtin_convertvector(a, __v2df); } + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); } + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); } + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) { + return __builtin_convertvector(a, __v4sf); } + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); } + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" +} */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef +_Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) { + return __builtin_convertvector(a, __v2si); } + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); } + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); } + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) { + return __builtin_convertvector(a, __v2di); } + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); } + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); } + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) { + return __builtin_convertvector(a, __v4si); } + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); } + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); } + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) { + return __builtin_convertvector(a, __v2di); } + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); } + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq +-fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 +} } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 +} } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); typedef char __v16qi +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf +__attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); } + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); } + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qi); } + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); } + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); } + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); } + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); } + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); } + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); } + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); } + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); } + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); } + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); } + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); } + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); } + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); } + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); } + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); } + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); } + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qu); } + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); } + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); } diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 00000000000..0ff5a97ed1a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,156 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq +-fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } +} } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } +} } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char +__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); typedef char __v16qi +__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu +__attribute__ ((vector_size (2))); typedef unsigned char __v4qu +__attribute__ ((vector_size (4))); typedef unsigned char __v8qu +__attribute__ ((vector_size (8))); typedef unsigned char __v16qu +__attribute__ ((vector_size (16))); typedef _Float16 __v2hf +__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf +__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf +__attribute__ ((__vector_size__ (16))); + +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2df); } + +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4df); } + +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8df); } + +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2df); } + +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4df); } + +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8df); } + +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2sf); } + +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4sf); } + +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); } + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); } + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2sf); } + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); } + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); } + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); } + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); } + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); } + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); } + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); } + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); } + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); } + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); } + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); } + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); } + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return __builtin_convertvector((__v32qu)a, __v32hf); } diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ab640096ca2..e14fac9f179 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see #include "gimple-match.h" #include "recog.h" /* FIXME: for insn_data */ #include "optabs-libfuncs.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" /* Build a ternary operation and gimplify it. Emit code before GSI. @@ -1834,6 +1836,102 @@ do_vec_narrow_conversion (gimple_stmt_iterator *gsi, tree inner_type, tree a, return gimplify_build2 (gsi, code, outer_type, b, c); } +/* A subroutine of expand_vector_conversion, support indirect conversion for + float <-> int, like char -> double. */ bool +expand_vector_conversion_no_vec_pack (gimple_stmt_iterator *gsi, + enum tree_code code, + tree lhs, + tree arg) +{ + gimple *g; + tree ret_type = TREE_TYPE (lhs); + tree arg_type = TREE_TYPE (arg); + tree new_rhs; + enum {NARROW, NONE, WIDEN} modifier = NONE; + enum tree_code code1 = ERROR_MARK; + enum tree_code codecvt1 = ERROR_MARK; + bool float_expr_p = code == FLOAT_EXPR; + + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) + { + g = gimple_build_assign (lhs, code1, arg); + gsi_replace (gsi, g, false); + return true; + } + + unsigned int ret_elt_bits = vector_element_bits (ret_type); unsigned + int arg_elt_bits = vector_element_bits (arg_type); if (ret_elt_bits < + arg_elt_bits) + modifier = NARROW; + else if (ret_elt_bits > arg_elt_bits) + modifier = WIDEN; + + if (((code == FIX_TRUNC_EXPR && !flag_trapping_math && modifier == NARROW) + || (code == FLOAT_EXPR && modifier == WIDEN))) + { + unsigned short target_size; + scalar_mode tmp_cvt_mode; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); + tree cvt_type = NULL_TREE; + if (modifier == NARROW) + { + tmp_cvt_mode = lhs_mode; + target_size = GET_MODE_SIZE (rhs_mode); + } + else + { + target_size = GET_MODE_SIZE (lhs_mode); + int rhs_size = GET_MODE_BITSIZE (rhs_mode); + if (!int_mode_for_size (rhs_size, 0).exists (&tmp_cvt_mode)) + return false; + } + + code1 = float_expr_p ? code : NOP_EXPR; + codecvt1 = float_expr_p ? NOP_EXPR : code; + opt_scalar_mode mode_iter; + enum tree_code tc1, tc2; + unsigned HOST_WIDE_INT nelts + = constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); + + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) + { + tmp_cvt_mode = mode_iter.require (); + + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) + break; + + scalar_mode cvt_mode; + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); + if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode)) + break; + + int cvt_size = GET_MODE_BITSIZE (cvt_mode); + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED (arg_type); + cvt_type = build_nonstandard_integer_type (cvt_size, isUnsigned); + + cvt_type = build_vector_type (cvt_type, nelts); + if (cvt_type == NULL_TREE + || !supportable_convert_operation ((tree_code) code1, + ret_type, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) codecvt1, + cvt_type, + arg_type, &tc2)) + continue; + + new_rhs = make_ssa_name (cvt_type); + g = vect_gimple_build (new_rhs, tc2, arg); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, tc1, new_rhs); + gsi_replace (gsi, g, false); + return true; + } + } + return false; +} + /* Expand VEC_CONVERT ifn call. */ static void @@ -1871,14 +1969,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) else if (ret_elt_bits > arg_elt_bits) modifier = WIDEN; + if (expand_vector_conversion_no_vec_pack(gsi, code, lhs, arg)) + return; + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) { - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) - { - g = gimple_build_assign (lhs, code1, arg); - gsi_replace (gsi, g, false); - return; - } /* Can't use get_compute_type here, as supportable_convert_operation doesn't necessarily use an optab and needs two arguments. */ tree vec_compute_type -- 2.31.1