Hi! This patch adds the vec_pack_sfix_trunc_v4df and vec_pack_sfix_v4df expanders, which allow e.g. f1 in the testcases to be vectorized using 32-byte vectors, even with -mavx.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2011-10-27 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (avx_cvtpd2dq256_2, avx_cvttpd2dq256_2, vec_pack_sfix_trunc_v4df, vec_pack_sfix_v4df): New expanders. (*avx_cvtpd2dq256_2, *avx_cvttpd2dq256_2): New insns. * gcc.target/i386/sse2-cvt-1.c: New test. * gcc.target/i386/sse2-cvt-2.c: New test. * gcc.target/i386/avx-cvt-1.c: New test. * gcc.target/i386/avx-cvt-2.c: New test. * gcc.target/i386/avx2-cvt-1.c: New test. * gcc.target/i386/avx2-cvt-2.c: New test. --- gcc/config/i386/sse.md.jj 2011-10-27 08:42:58.000000000 +0200 +++ gcc/config/i386/sse.md 2011-10-27 14:45:16.000000000 +0200 @@ -2544,6 +2548,27 @@ (define_insn "avx_cvtpd2dq256" (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_expand "avx_cvtpd2dq256_2" + [(set (match_operand:V8SI 0 "register_operand" "") + (vec_concat:V8SI + (unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand" "")] + UNSPEC_FIX_NOTRUNC) + (match_dup 2)))] + "TARGET_AVX" + "operands[2] = CONST0_RTX (V4SImode);") + +(define_insn "*avx_cvtpd2dq256_2" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_concat:V8SI + (unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC) + (match_operand:V4SI 2 "const0_operand" "")))] + "TARGET_AVX" + "vcvtpd2dq{y}\t{%1, %x0|%x0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "sse2_cvtpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") (vec_concat:V4SI @@ -2584,6 +2609,25 @@ (define_insn "avx_cvttpd2dq256" (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_expand "avx_cvttpd2dq256_2" + [(set (match_operand:V8SI 0 "register_operand" "") + (vec_concat:V8SI + (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "")) + (match_dup 2)))] + "TARGET_AVX" + "operands[2] = CONST0_RTX (V4SImode);") + +(define_insn "*avx_cvttpd2dq256_2" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_concat:V8SI + 
(fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm")) + (match_operand:V4SI 2 "const0_operand" "")))] + "TARGET_AVX" + "vcvttpd2dq{y}\t{%1, %x0|%x0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "sse2_cvttpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") (vec_concat:V4SI @@ -3027,6 +3075,23 @@ (define_expand "vec_pack_trunc_v2df" DONE; }) +(define_expand "vec_pack_sfix_trunc_v4df" + [(match_operand:V8SI 0 "register_operand" "") + (match_operand:V4DF 1 "nonimmediate_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "")] + "TARGET_AVX" +{ + rtx r1, r2; + + r1 = gen_reg_rtx (V8SImode); + r2 = gen_reg_rtx (V8SImode); + + emit_insn (gen_avx_cvttpd2dq256_2 (r1, operands[1])); + emit_insn (gen_avx_cvttpd2dq256_2 (r2, operands[2])); + emit_insn (gen_avx_vperm2f128v8si3 (operands[0], r1, r2, GEN_INT (0x20))); + DONE; +}) + (define_expand "vec_pack_sfix_trunc_v2df" [(match_operand:V4SI 0 "register_operand" "") (match_operand:V2DF 1 "nonimmediate_operand" "") @@ -3046,6 +3111,23 @@ (define_expand "vec_pack_sfix_trunc_v2df DONE; }) +(define_expand "vec_pack_sfix_v4df" + [(match_operand:V8SI 0 "register_operand" "") + (match_operand:V4DF 1 "nonimmediate_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "")] + "TARGET_AVX" +{ + rtx r1, r2; + + r1 = gen_reg_rtx (V8SImode); + r2 = gen_reg_rtx (V8SImode); + + emit_insn (gen_avx_cvtpd2dq256_2 (r1, operands[1])); + emit_insn (gen_avx_cvtpd2dq256_2 (r2, operands[2])); + emit_insn (gen_avx_vperm2f128v8si3 (operands[0], r1, r2, GEN_INT (0x20))); + DONE; +}) + (define_expand "vec_pack_sfix_v2df" [(match_operand:V4SI 0 "register_operand" "") (match_operand:V2DF 1 "nonimmediate_operand" "") --- gcc/testsuite/gcc.target/i386/sse2-cvt-1.c.jj 2011-10-27 13:48:44.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/sse2-cvt-1.c 2011-10-27 14:10:42.000000000 +0200 @@ -0,0 +1,111 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -msse2 -mno-avx" } */ +/* { 
dg-require-effective-target sse2 } */ + +#ifndef CHECK_H +#define CHECK_H "sse2-check.h" +#endif + +#ifndef TEST +#define TEST sse2_test +#endif + +#include CHECK_H + +#define N 16 +float f[N]; +double d[N]; +int n[N]; + +__attribute__((noinline)) void +f1 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = d[i]; +} + +__attribute__((noinline)) void +f2 (void) +{ + int i; + for (i = 0; i < N; i++) + f[i] = n[i]; +} + +__attribute__((noinline)) void +f3 (void) +{ + int i; + for (i = 0; i < N; i++) + d[i] = f[i]; +} + +__attribute__((noinline)) void +f4 (void) +{ + int i; + for (i = 0; i < N; i++) + n[i] = f[i]; +} + +__attribute__((noinline)) void +f5 (void) +{ + int i; + for (i = 0; i < N; i++) + d[i] = n[i]; +} + +__attribute__((noinline)) void +f6 (void) +{ + int i; + for (i = 0; i < N; i++) + f[i] = d[i]; +} + +static void +TEST () +{ + int i; + for (i = 0; i < N; i++) + { + asm (""); + d[i] = i + 2.5; + } + f1 (); + for (i = 0; i < N; i++) + if (n[i] != i + 2) + abort (); + else + n[i] = i + 7; + f2 (); + for (i = 0; i < N; i++) + if (f[i] != i + 7) + abort (); + else + f[i] = i - 2.25f; + f3 (); + for (i = 0; i < N; i++) + if (d[i] != i - 2.25) + abort (); + else + f[i] = i + 3.5; + f4 (); + for (i = 0; i < N; i++) + if (n[i] != i + 3) + abort (); + else + n[i] = i + 9; + f5 (); + for (i = 0; i < N; i++) + if (d[i] != i + 9) + abort (); + else + d[i] = i - 7.25; + f6 (); + for (i = 0; i < N; i++) + if (f[i] != i - 7.25) + abort (); +} --- gcc/testsuite/gcc.target/i386/sse2-cvt-2.c.jj 2011-10-27 14:12:38.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/sse2-cvt-2.c 2011-10-27 14:14:05.000000000 +0200 @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -msse2 -mno-sse3 -fdump-tree-vect-details" } */ + +#include "sse2-cvt-1.c" + +/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 6 "vect" } } */ +/* { dg-final { scan-assembler "cvttpd2dq" } } */ +/* { dg-final { scan-assembler "cvtdq2ps" } } */ +/* { dg-final { 
scan-assembler "cvtps2pd" } } */ +/* { dg-final { scan-assembler "cvttps2dq" } } */ +/* { dg-final { scan-assembler "cvtdq2pd" } } */ +/* { dg-final { scan-assembler "cvtpd2ps" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.target/i386/avx-cvt-1.c.jj 2011-10-27 13:49:36.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx-cvt-1.c 2011-10-27 14:08:49.000000000 +0200 @@ -0,0 +1,13 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mavx -mno-avx2" } */ +/* { dg-require-effective-target avx_runtime } */ + +#ifndef CHECK_H +#define CHECK_H "avx-check.h" +#endif + +#ifndef TEST +#define TEST avx_test +#endif + +#include "sse2-cvt-1.c" --- gcc/testsuite/gcc.target/i386/avx-cvt-2.c.jj 2011-10-27 13:54:52.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx-cvt-2.c 2011-10-27 14:11:08.000000000 +0200 @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx -mno-avx2 -fdump-tree-vect-details" } */ + +#include "avx-cvt-1.c" + +/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 6 "vect" } } */ +/* { dg-final { scan-assembler "vcvttpd2dq(y\[^\n\r\]*%xmm|\[^\n\r\]*xmm\[^\n\r\]*YMMWORD PTR)" } } */ +/* { dg-final { scan-assembler "vcvtdq2ps\[^\n\r\]*xmm" } } */ +/* { dg-final { scan-assembler "vcvtps2pd\[^\n\r\]*(%xmm\[^\n\r\]*%ymm|ymm\[^\n\r\]*xmm)" } } */ +/* { dg-final { scan-assembler "vcvttps2dq\[^\n\r\]*ymm" } } */ +/* { dg-final { scan-assembler "vcvtdq2pd\[^\n\r\]*xmm\[^\n\r\]*xmm" } } */ +/* { dg-final { scan-assembler "vcvtpd2ps(y\[^\n\r\]*%xmm|\[^\n\r\]*xmm\[^\n\r\]*YMMWORD PTR)" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.target/i386/avx2-cvt-1.c.jj 2011-10-27 13:50:58.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-cvt-1.c 2011-10-27 14:08:44.000000000 +0200 @@ -0,0 +1,13 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mavx2" } */ +/* { dg-require-effective-target avx2 } */ + +#ifndef CHECK_H +#define CHECK_H "avx2-check.h" +#endif + +#ifndef TEST +#define TEST 
avx2_test +#endif + +#include "sse2-cvt-1.c" --- gcc/testsuite/gcc.target/i386/avx2-cvt-2.c.jj 2011-10-27 14:11:33.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-cvt-2.c 2011-10-27 14:15:10.000000000 +0200 @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */ + +#include "avx2-cvt-1.c" + +/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 6 "vect" } } */ +/* { dg-final { scan-assembler "vcvttpd2dq(y\[^\n\r\]*%xmm|\[^\n\r\]*xmm\[^\n\r\]*YMMWORD PTR)" } } */ +/* { dg-final { scan-assembler "vcvtdq2ps\[^\n\r\]*ymm" } } */ +/* { dg-final { scan-assembler "vcvtps2pd\[^\n\r\]*(%xmm\[^\n\r\]*%ymm|ymm\[^\n\r\]*xmm)" } } */ +/* { dg-final { scan-assembler "vcvttps2dq\[^\n\r\]*ymm" } } */ +/* { dg-final { scan-assembler "vcvtdq2pd\[^\n\r\]*(%xmm\[^\n\r\]*%ymm|ymm\[^\n\r\]*xmm)" } } */ +/* { dg-final { scan-assembler "vcvtpd2ps(y\[^\n\r\]*%xmm|\[^\n\r\]*xmm\[^\n\r\]*YMMWORD PTR)" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ Jakub