Hi!

The following patch adds a couple of intrinsics that both ICC and clang
have, but GCC doesn't.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

They emit optimal code except for the last one, _mm256_storeu2_m128i,
where we emit
        vmovups %xmm0, (%rsi)
        vextractf128    $0x1, %ymm0, %xmm0
        vmovups %xmm0, (%rdi)
instead of
        vmovups %xmm0, (%rsi)
        vextractf128    $0x1, %ymm0, (%rdi)
That is because _mm256_extractf128_si256 is implemented as a V8SImode
pattern, but __m128i is V2DImode, and we don't have a pattern like:
        (set (match_operand:V2DI 0 ("nonimmediate_operand") ("=xm, vm"))
          (subreg:V2DI
            (vec_select:V4SI (match_operand:V8SI 1 ("register_operand") ("x, v"))
                (parallel [
                        (const_int 4 [0x4])
                        (const_int 5 [0x5])
                        (const_int 6 [0x6])
                        (const_int 7 [0x7])
                    ])) 0))
Shall we add that (and just for this mode combination, or using iterators
for others)?  Unfortunately the builtin that would use V2DI in the
vec_select instead of V4SI is AVX2 and so can't be used in this case.

2019-08-05  Jakub Jelinek  <ja...@redhat.com>

        PR target/91341
        * config/i386/avxintrin.h (_mm256_loadu2_m128, _mm256_storeu2_m128,
        _mm256_loadu2_m128d, _mm256_storeu2_m128d, _mm256_loadu2_m128i,
        _mm256_storeu2_m128i): New functions.

        * gcc.target/i386/avx-loadu2-m128-1.c: New test.
        * gcc.target/i386/avx-loadu2-m128-2.c: New test.
        * gcc.target/i386/avx-loadu2-m128d-1.c: New test.
        * gcc.target/i386/avx-loadu2-m128d-2.c: New test.
        * gcc.target/i386/avx-loadu2-m128i-1.c: New test.
        * gcc.target/i386/avx-loadu2-m128i-2.c: New test.
        * gcc.target/i386/avx-storeu2-m128-1.c: New test.
        * gcc.target/i386/avx-storeu2-m128-2.c: New test.
        * gcc.target/i386/avx-storeu2-m128d-1.c: New test.
        * gcc.target/i386/avx-storeu2-m128d-2.c: New test.
        * gcc.target/i386/avx-storeu2-m128i-1.c: New test.
        * gcc.target/i386/avx-storeu2-m128i-2.c: New test.

--- gcc/config/i386/avxintrin.h.jj      2019-01-01 12:37:32.417724576 +0100
+++ gcc/config/i386/avxintrin.h 2019-08-04 16:39:10.091659072 +0200
@@ -1520,6 +1520,60 @@ _mm256_setr_m128i (__m128i __L, __m128i
   return _mm256_set_m128i (__H, __L);
 }
 
+/* Load two unaligned 128-bit float vectors and combine them into one
+   256-bit vector: *__PL supplies the low half, *__PH the high half.  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128 (float const *__PH, float const *__PL)
+{
+  return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
+                              _mm_loadu_ps (__PH), 1);
+}
+
+/* Store the low 128 bits of __A to the unaligned address __PL and the
+   high 128 bits of __A to the unaligned address __PH.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
+{
+  _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
+  _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
+}
+
+/* Load two unaligned 128-bit double vectors and combine them into one
+   256-bit vector: *__PL supplies the low half, *__PH the high half.  */
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128d (double const *__PH, double const *__PL)
+{
+  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
+                              _mm_loadu_pd (__PH), 1);
+}
+
+/* Store the low 128 bits of __A to the unaligned address __PL and the
+   high 128 bits of __A to the unaligned address __PH.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
+{
+  _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
+  _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
+}
+
+/* Load two unaligned 128-bit integer vectors and combine them into one
+   256-bit vector: *__PL supplies the low half, *__PH the high half.  */
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
+{
+  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
+                                 _mm_loadu_si128 (__PH), 1);
+}
+
+/* Store the low 128 bits of __A to the unaligned address __PL and the
+   high 128 bits of __A to the unaligned address __PH.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
+{
+  _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
+  _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
+}
+
 #ifdef __DISABLE_AVX__
 #undef __DISABLE_AVX__
 #pragma GCC pop_options
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128-1.c.jj        2019-08-04 16:52:17.205753124 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128-1.c   2019-08-04 16:50:01.315810000 +0200
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovups\t" } } */
+/* { dg-final { scan-assembler "\tvinsertf128\t" } } */
+
+#include <immintrin.h>
+
+/* Compile-only wrapper whose assembly is scanned by the directives above.  */
+__m256
+foo (float const *hi, float const *lo)
+{
+  return _mm256_loadu2_m128 (hi, lo);
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128-2.c.jj        2019-08-04 16:52:20.358705400 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128-2.c   2019-08-04 16:59:50.002899417 +0200
@@ -0,0 +1,19 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+/* Load overlapping 128-bit halves of E (high half from e + 1, low half
+   from e + 3) and compare the combined vector with F.  */
+static void
+avx_test (void)
+{
+  union256 u;
+  float e[8] = { 1.5f, -9.5f, 13.25f, -24.75f, -18.75f, 12.0f, 0.0f, 9.0f };
+  float f[8] = { -24.75f, -18.75f, 12.0f, 0.0f, -9.5f, 13.25f, -24.75f, -18.75f };
+
+  u.x = _mm256_loadu2_m128 (e + 1, e + 3);
+  if (check_union256 (u, f))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-1.c.jj       2019-08-04 16:52:17.205753124 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-1.c  2019-08-04 17:03:13.548818465 +0200
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovupd\t" } } */
+/* { dg-final { scan-assembler "\tvinsertf128\t" } } */
+
+#include <immintrin.h>
+
+/* Compile-only wrapper whose assembly is scanned by the directives above.  */
+__m256d
+foo (double const *hi, double const *lo)
+{
+  return _mm256_loadu2_m128d (hi, lo);
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-2.c.jj       2019-08-04 16:52:20.358705400 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-2.c  2019-08-04 17:05:00.342201999 +0200
@@ -0,0 +1,19 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+/* Load the high half from e + 1 and the low half from e + 5, then
+   compare the combined vector with F.  */
+static void
+avx_test (void)
+{
+  union256d u;
+  double e[8] = { 1.5, -9.5, 13.25, -24.75, -18.75, 12.0, 0.0, 9.0 };
+  double f[4] = { 12.0, 0.0, -9.5, 13.25 };
+
+  u.x = _mm256_loadu2_m128d (e + 1, e + 5);
+  if (check_union256d (u, f))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-1.c.jj       2019-08-04 16:52:17.205753124 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-1.c  2019-08-04 17:06:44.386628690 +0200
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovdqu\t" } } */
+/* { dg-final { scan-assembler "\tvinsert\[fi]128\t" } } */
+
+#include <immintrin.h>
+
+/* Compile-only wrapper whose assembly is scanned by the directives above.  */
+__m256i
+foo (__m128i_u const *hi, __m128i_u const *lo)
+{
+  return _mm256_loadu2_m128i (hi, lo);
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-2.c.jj       2019-08-04 16:52:20.358705400 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-2.c  2019-08-04 17:11:04.864691481 +0200
@@ -0,0 +1,19 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+/* Load overlapping 128-bit halves of E (high half from e + 1, low half
+   from e + 3) and compare the combined vector with F.  */
+static void
+avx_test (void)
+{
+  union256i_d u;
+  int e[8] = { 1, -9, 13, -24, -18, 12, 0, 9 };
+  int f[8] = { -24, -18, 12, 0, -9, 13, -24, -18 };
+
+  u.x = _mm256_loadu2_m128i ((__m128i_u *) (e + 1), (__m128i_u *) (e + 3));
+  if (check_union256i_d (u, f))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128-1.c.jj       2019-08-04 17:13:27.124541181 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128-1.c  2019-08-04 17:15:14.546917455 +0200
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovups\t" } } */
+/* { dg-final { scan-assembler "\tvextractf128\t" } } */
+
+#include <immintrin.h>
+
+/* Compile-only wrapper whose assembly is scanned by the directives above.  */
+void
+foo (float *hi, float *lo, __m256 a)
+{
+  _mm256_storeu2_m128 (hi, lo, a);
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128-2.c.jj       2019-08-04 17:13:30.135495667 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128-2.c  2019-08-04 17:19:36.590956577 +0200
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+/* Store the high half of X at e + 1 and the low half at e + 6, then check
+   that E equals F.  X is built with _mm256_setr_ps (lowest element first)
+   so that its halves appear in F in the order they are written here.  */
+static void
+avx_test (void)
+{
+  float e[12] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f };
+  float f[12] = { -1.0f, -18.75f, 12.0f, 0.0f, 9.0f, -1.0f, 1.5f, -9.5f, 13.25f, -24.75f, -1.0f, -1.0f };
+  int i;
+  __m256 x = _mm256_setr_ps (1.5f, -9.5f, 13.25f, -24.75f, -18.75f, 12.0f, 0.0f, 9.0f);
+  _mm256_storeu2_m128 (e + 1, e + 6, x);
+  for (i = 0; i < 12; i++)
+    if (e[i] != f[i])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-1.c.jj      2019-08-04 17:13:27.124541181 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-1.c 2019-08-04 17:34:55.951056592 +0200
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovup\[sd]\t" } } */
+/* { dg-final { scan-assembler "\tvextractf128\t" } } */
+
+#include <immintrin.h>
+
+/* Compile-only wrapper whose assembly is scanned by the directives above.  */
+void
+foo (double *hi, double *lo, __m256d a)
+{
+  _mm256_storeu2_m128d (hi, lo, a);
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-2.c.jj      2019-08-04 17:13:30.135495667 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-2.c 2019-08-04 17:35:17.505730678 +0200
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+/* Store the high half of X at e + 1 and the low half at e + 4, then check
+   that E equals F.  X is built with _mm256_setr_pd (lowest element first)
+   so that its halves appear in F in the order they are written here.  */
+static void
+avx_test (void)
+{
+  double e[8] = { -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0 };
+  double f[8] = { -1.0, 13.25, -24.75, -1.0, 1.5, -9.5, -1.0, -1.0 };
+  int i;
+  __m256d x = _mm256_setr_pd (1.5, -9.5, 13.25, -24.75);
+  _mm256_storeu2_m128d (e + 1, e + 4, x);
+  for (i = 0; i < 8; i++)
+    if (e[i] != f[i])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-1.c.jj      2019-08-04 17:13:27.124541181 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-1.c 2019-08-04 17:42:55.207811439 +0200
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmov(dqu|ups)\t" } } */
+/* { dg-final { scan-assembler "\tvextract\[if]128\t" } } */
+
+#include <immintrin.h>
+
+/* Compile-only wrapper whose assembly is scanned by the directives above.  */
+void
+foo (__m128i_u *hi, __m128i_u *lo, __m256i a)
+{
+  _mm256_storeu2_m128i (hi, lo, a);
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-2.c.jj      2019-08-04 17:13:30.135495667 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-2.c 2019-08-04 17:43:30.488278278 +0200
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+/* Store the high half of X at e + 1 and the low half at e + 6, then check
+   that E equals F.  X is built with _mm256_setr_epi32 (lowest element
+   first) so that its halves appear in F in the order they are written.  */
+static void
+avx_test (void)
+{
+  int e[12] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  int f[12] = { -1, -18, 12, 0, 9, -1, 1, -9, 13, -24, -1, -1 };
+  int i;
+  __m256i x = _mm256_setr_epi32 (1, -9, 13, -24, -18, 12, 0, 9);
+  _mm256_storeu2_m128i ((__m128i_u *) (e + 1), (__m128i_u *) (e + 6), x);
+  for (i = 0; i < 12; i++)
+    if (e[i] != f[i])
+      abort ();
+}

        Jakub

Reply via email to