On Wed, Jan 13, 2021 at 8:13 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> The following patch implements what I've talked about, i.e. no longer
> forcing the operands of vec_perm_const into registers in the generic code,
> but letting each of the (currently 8) targets force them into registers
> individually, giving the targets better control over whether and when to
> do that and allowing them to do something special with particular operands.
> It then adds define_insn_and_split patterns for the 256-bit and 512-bit
> permutations into vpmovzx* (only the bw, wd and dq cases; in theory we
> could add define_insn_and_split patterns for the bd, bq and wq cases as
> well).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2021-01-13  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/95905
>         * optabs.c (expand_vec_perm_const): Don't force v0 and v1 into
>         registers before calling targetm.vectorize.vec_perm_const, only
>         after that.
>         * config/i386/i386-expand.c (ix86_vectorize_vec_perm_const): Handle
>         two-argument permutation when one operand is a zero vector, and
>         only after that force operands into registers.
>         * config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_1,
>         *avx512bw_zero_extendv32qiv32hi2_1, *avx512f_zero_extendv16hiv16si2_1,
>         *avx2_zero_extendv8hiv8si2_1, *avx512f_zero_extendv8siv8di2_1,
>         *avx2_zero_extendv4siv4di2_1): New define_insn_and_split patterns.
>         * config/mips/mips.c (mips_vectorize_vec_perm_const): Force operands
>         into registers.
>         * config/arm/arm.c (arm_vectorize_vec_perm_const): Likewise.
>         * config/sparc/sparc.c (sparc_vectorize_vec_perm_const): Likewise.
>         * config/ia64/ia64.c (ia64_vectorize_vec_perm_const): Likewise.
>         * config/aarch64/aarch64.c (aarch64_vectorize_vec_perm_const):
>         Likewise.
>         * config/rs6000/rs6000.c (rs6000_vectorize_vec_perm_const): Likewise.
>         * config/gcn/gcn.c (gcn_vectorize_vec_perm_const): Likewise.  Use
>         std::swap.
>
>         * gcc.target/i386/pr95905-2.c: Use scan-assembler-times instead of
>         scan-assembler.  Add tests with zero vector as first
>         __builtin_shuffle operand.
>         * gcc.target/i386/pr95905-3.c: New test.
>         * gcc.target/i386/pr95905-4.c: New test.
LGTM for x86 part.

Thanks,
Uros.

> --- gcc/optabs.c.jj	2021-01-04 10:25:38.632236100 +0100
> +++ gcc/optabs.c	2021-01-12 14:46:44.719557815 +0100
> @@ -6070,11 +6070,8 @@ expand_vec_perm_const (machine_mode mode
>
>    if (targetm.vectorize.vec_perm_const != NULL)
>      {
> -      v0 = force_reg (mode, v0);
>        if (single_arg_p)
>  	v1 = v0;
> -      else
> -	v1 = force_reg (mode, v1);
>
>        if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, indices))
>  	return target;
> @@ -6095,6 +6092,11 @@ expand_vec_perm_const (machine_mode mode
>        return gen_lowpart (mode, target_qi);
>      }
>
> +  v0 = force_reg (mode, v0);
> +  if (single_arg_p)
> +    v1 = v0;
> +  v1 = force_reg (mode, v1);
> +
>    /* Otherwise expand as a fully variable permuation.  */
>
>    /* The optabs are only defined for selectors with the same width
> --- gcc/config/i386/i386-expand.c.jj	2021-01-12 11:01:51.189386077 +0100
> +++ gcc/config/i386/i386-expand.c	2021-01-12 15:43:55.673095807 +0100
> @@ -19929,6 +19929,33 @@ ix86_vectorize_vec_perm_const (machine_m
>
>    two_args = canonicalize_perm (&d);
>
> +  /* If one of the operands is a zero vector, try to match pmovzx.  */
> +  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
> +    {
> +      struct expand_vec_perm_d dzero = d;
> +      if (d.op0 == CONST0_RTX (vmode))
> +	{
> +	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
> +	  std::swap (dzero.op0, dzero.op1);
> +	  for (i = 0; i < nelt; ++i)
> +	    dzero.perm[i] ^= nelt;
> +	}
> +      else
> +	d.op0 = dzero.op0 = force_reg (vmode, d.op0);
> +
> +      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
> +				  dzero.perm, nelt, dzero.testing_p))
> +	return true;
> +    }
> +
> +  /* Force operands into registers.  */
> +  rtx nop0 = force_reg (vmode, d.op0);
> +  if (d.op0 == d.op1)
> +    d.op1 = nop0;
> +  d.op0 = nop0;
> +  if (d.op0 != d.op1)
> +    d.op1 = force_reg (vmode, d.op1);
> +
>    if (ix86_expand_vec_perm_const_1 (&d))
>      return true;
>
> --- gcc/config/i386/sse.md.jj	2021-01-12 14:30:32.688546846 +0100
> +++ gcc/config/i386/sse.md	2021-01-12 15:40:29.018402527 +0100
> @@ -17611,6 +17611,23 @@ (define_insn "avx2_<code>v16qiv16hi2<mas
>     (set_attr "prefix" "maybe_evex")
>     (set_attr "mode" "OI")])
>
> +(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
> +  [(set (match_operand:V32QI 0 "register_operand" "=v")
> +	(vec_select:V32QI
> +	  (vec_concat:V64QI
> +	    (match_operand:V32QI 1 "nonimmediate_operand" "vm")
> +	    (match_operand:V32QI 2 "const0_operand" "C"))
> +	  (match_parallel 3 "pmovzx_parallel"
> +	    [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX2"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
> +  operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
> +})
> +
>  (define_expand "<insn>v16qiv16hi2"
>    [(set (match_operand:V16HI 0 "register_operand")
>  	(any_extend:V16HI
> @@ -17628,6 +17645,23 @@ (define_insn "avx512bw_<code>v32qiv32hi2
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "XI")])
>
> +(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1"
> +  [(set (match_operand:V64QI 0 "register_operand" "=v")
> +	(vec_select:V64QI
> +	  (vec_concat:V128QI
> +	    (match_operand:V64QI 1 "nonimmediate_operand" "vm")
> +	    (match_operand:V64QI 2 "const0_operand" "C"))
> +	  (match_parallel 3 "pmovzx_parallel"
> +	    [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX512BW"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
> +  operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
> +})
> +
>  (define_expand "<insn>v32qiv32hi2"
>    [(set (match_operand:V32HI 0 "register_operand")
>  	(any_extend:V32HI
> @@ -17883,6 +17917,23 @@ (define_expand "<insn>v16hiv16si2"
>  	    (match_operand:V16HI 1 "nonimmediate_operand")))]
>    "TARGET_AVX512F")
>
> +(define_insn_and_split "avx512f_zero_extendv16hiv16si2_1"
> +  [(set (match_operand:V32HI 0 "register_operand" "=v")
> +	(vec_select:V32HI
> +	  (vec_concat:V64HI
> +	    (match_operand:V32HI 1 "nonimmediate_operand" "vm")
> +	    (match_operand:V32HI 2 "const0_operand" "C"))
> +	  (match_parallel 3 "pmovzx_parallel"
> +	    [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX512F"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
> +  operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
> +})
> +
>  (define_insn "avx2_<code>v8hiv8si2<mask_name>"
>    [(set (match_operand:V8SI 0 "register_operand" "=v")
>  	(any_extend:V8SI
> @@ -17900,6 +17951,23 @@ (define_expand "<insn>v8hiv8si2"
>  	    (match_operand:V8HI 1 "nonimmediate_operand")))]
>    "TARGET_AVX2")
>
> +(define_insn_and_split "avx2_zero_extendv8hiv8si2_1"
> +  [(set (match_operand:V16HI 0 "register_operand" "=v")
> +	(vec_select:V16HI
> +	  (vec_concat:V32HI
> +	    (match_operand:V16HI 1 "nonimmediate_operand" "vm")
> +	    (match_operand:V16HI 2 "const0_operand" "C"))
> +	  (match_parallel 3 "pmovzx_parallel"
> +	    [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX2"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
> +  operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
> +})
> +
>  (define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
>    [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
>  	(any_extend:V4SI
> @@ -18275,6 +18343,23 @@ (define_insn "avx512f_<code>v8siv8di2<ma
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "XI")])
>
> +(define_insn_and_split "*avx512f_zero_extendv8siv8di2_1"
> +  [(set (match_operand:V16SI 0 "register_operand" "=v")
> +	(vec_select:V16SI
> +	  (vec_concat:V32SI
> +	    (match_operand:V16SI 1 "nonimmediate_operand" "vm")
> +	    (match_operand:V16SI 2 "const0_operand" "C"))
> +	  (match_parallel 3 "pmovzx_parallel"
> +	    [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX512F"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
> +  operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
> +})
> +
>  (define_expand "<insn>v8siv8di2"
>    [(set (match_operand:V8DI 0 "register_operand" "=v")
>  	(any_extend:V8DI
> @@ -18292,6 +18377,23 @@ (define_insn "avx2_<code>v4siv4di2<mask_
>     (set_attr "prefix_extra" "1")
>     (set_attr "mode" "OI")])
>
> +(define_insn_and_split "*avx2_zero_extendv4siv4di2_1"
> +  [(set (match_operand:V8SI 0 "register_operand" "=v")
> +	(vec_select:V8SI
> +	  (vec_concat:V16SI
> +	    (match_operand:V8SI 1 "nonimmediate_operand" "vm")
> +	    (match_operand:V8SI 2 "const0_operand" "C"))
> +	  (match_parallel 3 "pmovzx_parallel"
> +	    [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX2"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
> +  operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
> +})
> +
>  (define_expand "<insn>v4siv4di2"
>    [(set (match_operand:V4DI 0 "register_operand" "=v")
>  	(any_extend:V4DI
> --- gcc/config/mips/mips.c.jj	2021-01-04 10:25:41.592202583 +0100
> +++ gcc/config/mips/mips.c	2021-01-12 15:06:07.608535692 +0100
> @@ -21624,6 +21624,15 @@ mips_vectorize_vec_perm_const (machine_m
>    bool ok;
>
>    d.target = target;
> +  if (op0)
> +    {
> +      rtx nop0 = force_reg (vmode, op0);
> +      if (op0 == op1)
> +	op1 = nop0;
> +      op0 = nop0;
> +    }
> +  if (op1 && op0 != op1)
> +    op1 = force_reg (vmode, op1);
>    d.op0 = op0;
>    d.op1 = op1;
>
> --- gcc/config/arm/arm.c.jj	2021-01-04 10:25:44.469170006 +0100
> +++ gcc/config/arm/arm.c	2021-01-12 15:02:24.333038536 +0100
> @@ -31482,6 +31482,15 @@ arm_vectorize_vec_perm_const (machine_mo
>      return false;
>
>    d.target = target;
> +  if (op0)
> +    {
> +      rtx nop0 = force_reg (vmode, op0);
> +      if (op0 == op1)
> +	op1 = nop0;
> +      op0 = nop0;
> +    }
> +  if (op1 && op0 != op1)
> +    op1 = force_reg (vmode, op1);
>    d.op0 = op0;
>    d.op1 = op1;
>
> --- gcc/config/sparc/sparc.c.jj	2021-01-04 10:25:45.662156497 +0100
> +++ gcc/config/sparc/sparc.c	2021-01-12 15:10:43.491443165 +0100
> @@ -12942,6 +12942,13 @@ sparc_vectorize_vec_perm_const (machine_
>    if (vmode != V8QImode)
>      return false;
>
> +  rtx nop0 = force_reg (vmode, op0);
> +  if (op0 == op1)
> +    op1 = nop0;
> +  op0 = nop0;
> +  if (op0 != op1)
> +    op1 = force_reg (vmode, op1);
> +
>    unsigned int i, mask;
>    for (i = mask = 0; i < 8; ++i)
>      mask |= (sel[i] & 0xf) << (28 - i*4);
> --- gcc/config/ia64/ia64.c.jj	2021-01-04 10:25:45.808154844 +0100
> +++ gcc/config/ia64/ia64.c	2021-01-12 15:03:26.704339360 +0100
> @@ -11759,6 +11759,15 @@ ia64_vectorize_vec_perm_const (machine_m
>    unsigned int i, nelt, which;
>
>    d.target = target;
> +  if (op0)
> +    {
> +      rtx nop0 = force_reg (vmode, op0);
> +      if (op0 == op1)
> +	op1 = nop0;
> +      op0 = nop0;
> +    }
> +  if (op1 && op0 != op1)
> +    op1 = force_reg (vmode, op1);
>    d.op0 = op0;
>    d.op1 = op1;
>
> --- gcc/config/aarch64/aarch64.c.jj	2021-01-05 13:53:53.291683826 +0100
> +++ gcc/config/aarch64/aarch64.c	2021-01-12 14:51:26.645401653 +0100
> @@ -21020,8 +21020,11 @@ aarch64_vectorize_vec_perm_const (machin
>    d.vmode = vmode;
>    d.vec_flags = aarch64_classify_vector_mode (d.vmode);
>    d.target = target;
> -  d.op0 = op0;
> -  d.op1 = op1;
> +  d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
> +  if (op0 == op1)
> +    d.op1 = d.op0;
> +  else
> +    d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
>    d.testing_p = !target;
>
>    if (!d.testing_p)
> --- gcc/config/rs6000/rs6000.c.jj	2021-01-04 10:25:47.037140928 +0100
> +++ gcc/config/rs6000/rs6000.c	2021-01-12 15:09:32.866234841 +0100
> @@ -22946,6 +22946,16 @@ rs6000_vectorize_vec_perm_const (machine
>    if (TARGET_ALTIVEC && testing_p)
>      return true;
>
> +  if (op0)
> +    {
> +      rtx nop0 = force_reg (vmode, op0);
> +      if (op0 == op1)
> +	op1 = nop0;
> +      op0 = nop0;
> +    }
> +  if (op1 && op0 != op1)
> +    op1 = force_reg (vmode, op1);
> +
>    /* Check for ps_merge* or xxpermdi insns.  */
>    if ((vmode == V2DFmode || vmode == V2DImode) && VECTOR_MEM_VSX_P (vmode))
>      {
> --- gcc/config/gcn/gcn.c.jj	2021-01-04 10:25:45.939153361 +0100
> +++ gcc/config/gcn/gcn.c	2021-01-12 14:56:17.394146737 +0100
> @@ -3986,13 +3986,14 @@ gcn_vectorize_vec_perm_const (machine_mo
>    for (unsigned int i = 0; i < nelt; ++i)
>      perm[i] = sel[i] & (2 * nelt - 1);
>
> +  src0 = force_reg (vmode, src0);
> +  src1 = force_reg (vmode, src1);
> +
>    /* Make life a bit easier by swapping operands if necessary so that
>       the first element always comes from src0.  */
>    if (perm[0] >= nelt)
>      {
> -      rtx temp = src0;
> -      src0 = src1;
> -      src1 = temp;
> +      std::swap (src0, src1);
>
>        for (unsigned int i = 0; i < nelt; ++i)
>  	if (perm[i] < nelt)
> --- gcc/testsuite/gcc.target/i386/pr95905-2.c.jj	2021-01-12 13:58:39.820222075 +0100
> +++ gcc/testsuite/gcc.target/i386/pr95905-2.c	2021-01-12 15:50:05.796964412 +0100
> @@ -1,9 +1,9 @@
>  /* PR target/95905 */
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -msse4.1" } */
> -/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
> -/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
> -/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
> +/* { dg-final { scan-assembler-times "\tv?pmovzxbw\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tv?pmovzxwd\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tv?pmovzxdq\t" 4 } } */
>
>  typedef unsigned char V1 __attribute__((vector_size (16)));
>  typedef unsigned short V2 __attribute__((vector_size (16)));
> @@ -44,3 +44,39 @@ f6 (V3 *x)
>  {
>    return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
>  }
> +
> +V1
> +f7 (V1 x)
> +{
> +  return __builtin_shuffle ((V1) {}, x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V2
> +f8 (V2 x)
> +{
> +  return __builtin_shuffle ((V2) {}, x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
> +}
> +
> +V3
> +f9 (V3 x)
> +{
> +  return __builtin_shuffle ((V3) {}, x, (V3) { 4, 0, 5, 1 });
> +}
> +
> +V1
> +f10 (V1 *x)
> +{
> +  return __builtin_shuffle ((V1) {}, *x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V2
> +f11 (V2 *x)
> +{
> +  return __builtin_shuffle ((V2) {}, *x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
> +}
> +
> +V3
> +f12 (V3 *x)
> +{
> +  return __builtin_shuffle ((V3) {}, *x, (V3) { 4, 0, 5, 1 });
> +}
> --- gcc/testsuite/gcc.target/i386/pr95905-3.c.jj	2021-01-12 15:53:05.627957108 +0100
> +++ gcc/testsuite/gcc.target/i386/pr95905-3.c	2021-01-12 15:52:32.393328070 +0100
> @@ -0,0 +1,82 @@
> +/* PR target/95905 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2" } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
> +
> +typedef unsigned char V1 __attribute__((vector_size (32)));
> +typedef unsigned short V2 __attribute__((vector_size (32)));
> +typedef unsigned int V3 __attribute__((vector_size (32)));
> +
> +V1
> +f1 (V1 x)
> +{
> +  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
> +}
> +
> +V2
> +f2 (V2 x)
> +{
> +  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V3
> +f3 (V3 x)
> +{
> +  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
> +}
> +
> +V1
> +f4 (V1 *x)
> +{
> +  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
> +}
> +
> +V2
> +f5 (V2 *x)
> +{
> +  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V3
> +f6 (V3 *x)
> +{
> +  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
> +}
> +
> +V1
> +f7 (V1 x)
> +{
> +  return __builtin_shuffle ((V1) {}, x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
> +}
> +
> +V2
> +f8 (V2 x)
> +{
> +  return __builtin_shuffle ((V2) {}, x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V3
> +f9 (V3 x)
> +{
> +  return __builtin_shuffle ((V3) {}, x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
> +}
> +
> +V1
> +f10 (V1 *x)
> +{
> +  return __builtin_shuffle ((V1) {}, *x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
> +}
> +
> +V2
> +f11 (V2 *x)
> +{
> +  return __builtin_shuffle ((V2) {}, *x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V3
> +f12 (V3 *x)
> +{
> +  return __builtin_shuffle ((V3) {}, *x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
> +}
> --- gcc/testsuite/gcc.target/i386/pr95905-4.c.jj	2021-01-12 15:55:30.065343628 +0100
> +++ gcc/testsuite/gcc.target/i386/pr95905-4.c	2021-01-12 15:55:01.957657667 +0100
> @@ -0,0 +1,82 @@
> +/* PR target/95905 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512bw" } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
> +
> +typedef unsigned char V1 __attribute__((vector_size (64)));
> +typedef unsigned short V2 __attribute__((vector_size (64)));
> +typedef unsigned int V3 __attribute__((vector_size (64)));
> +
> +V1
> +f1 (V1 x)
> +{
> +  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
> +}
> +
> +V2
> +f2 (V2 x)
> +{
> +  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
> +}
> +
> +V3
> +f3 (V3 x)
> +{
> +  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V1
> +f4 (V1 *x)
> +{
> +  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
> +}
> +
> +V2
> +f5 (V2 *x)
> +{
> +  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
> +}
> +
> +V3
> +f6 (V3 *x)
> +{
> +  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V1
> +f7 (V1 x)
> +{
> +  return __builtin_shuffle ((V1) {}, x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
> +}
> +
> +V2
> +f8 (V2 x)
> +{
> +  return __builtin_shuffle ((V2) {}, x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
> +}
> +
> +V3
> +f9 (V3 x)
> +{
> +  return __builtin_shuffle ((V3) {}, x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V1
> +f10 (V1 *x)
> +{
> +  return __builtin_shuffle ((V1) {}, *x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
> +}
> +
> +V2
> +f11 (V2 *x)
> +{
> +  return __builtin_shuffle ((V2) {}, *x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
> +}
> +
> +V3
> +f12 (V3 *x)
> +{
> +  return __builtin_shuffle ((V3) {}, *x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
>
> 	Jakub
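
As a side note for anyone studying the ix86_vectorize_vec_perm_const hunk
above: the perm[i] ^= nelt canonicalization is easy to verify by hand.  The
following is a minimal standalone sketch (illustrative only, not GCC code;
all names are made up) showing that swapping the two operands and XOR-ing
every selector index with nelt expresses the same permutation with the zero
vector moved to the second operand, which is the form the vpmovzx patterns
match:

#include <stdio.h>

int
main (void)
{
  enum { nelt = 4 };
  /* With op0 = zero vector and op1 = x, indices < nelt select from op0 and
     indices >= nelt select from op1, so { 4, 0, 5, 1 } yields
     x[0], 0, x[1], 0 -- like f9 in pr95905-2.c above.  */
  unsigned int perm[nelt] = { 4, 0, 5, 1 };

  /* Conceptually swap op0 and op1, then flip every index across the
     operand boundary, as the dzero handling in the patch does.  */
  for (unsigned int i = 0; i < nelt; ++i)
    perm[i] ^= nelt;

  /* Prints 0 4 1 5: the same x[0], 0, x[1], 0 selection, now written with
     the zero vector as the second operand.  */
  for (unsigned int i = 0; i < nelt; ++i)
    printf ("%u ", perm[i]);
  printf ("\n");
  return 0;
}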