On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote: > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, > rtx mask, rtx op1) > enum machine_mode mode = GET_MODE (op0); > switch (mode) > { > + /* There is no byte version of vpermi2. So we use vpermi2w. */ > + case V64QImode: > + if (!TARGET_AVX512BW) > + return false; > + rtx mask_lowpart, op0_lowpart, op1_lowpart; > + rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi; > + > + mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask)); > + op0_lowpart = gen_lowpart (V32HImode, op0); > + op1_lowpart = gen_lowpart (V32HImode, op1); > + tmp = gen_reg_rtx (V32HImode); > + tmp2 = gen_reg_rtx (V32HImode); > + perm_lo = gen_reg_rtx (V32HImode); > + perm_hi = gen_reg_rtx (V32HImode); > + res_lo = gen_reg_rtx (V32HImode); > + res_hi = gen_reg_rtx (V32HImode); > + > + emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8))); > + emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9))); > + emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9))); > + emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart, > + perm_lo, op1_lowpart)); > + emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart, > + perm_hi, op1_lowpart)); > + emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8))); > + emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, > res_lo), > + gen_lowpart (V64QImode, res_hi), > + force_reg (DImode, GEN_INT > (0xAAAAAAAAAAAAAAAALL)))); > + return true;
I believe this case doesn't belong to this function, other than this case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and so it should always do that, and there should be a separate function that expands the worst case of V64QImode full 2 operand permutation. See my previous mail, IMHO it is doable with 5 instructions rather than 7. And IMHO we should have a separate function which emits that, supposedly one for the constant permutations, one for the variable case (perhaps then your 7 insn sequence is best?). Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers, supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes right now plus D, either D would be NULL (then it would behave as now), or SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed. I.e. the function would start with a switch that would just contain the if (...) return false; hunks plus break; for the success case, then code to generate CONST_VECTOR if sel is NULL_RTX from d, and finally another switch with just the emit cases. Or, the first switch could just set a function pointer before break, and just use one common emit_insn (gen (target, op0, force_reg (vmode, mask), op1)); > + case V8HImode: > + if (!TARGET_AVX512VL) > + return false; > + emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0, > + force_reg (V8HImode, mask), > op1)); > + return true; > + case V16HImode: > + if (!TARGET_AVX512VL) > + return false; > + emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0, > + force_reg (V16HImode, mask), op1)); > + return true; Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW? I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I think neither of these depends on the other one in GCC. That's unlike insns where CPUID AVX512VL and AVX512F are mentioned together, because in GCC AVX512VL depends on AVX512F. > @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) > > if (d->one_operand_p) > return false; > - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) > + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 && > + GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4) Formatting, && belongs on the second line. > + ; > + else if (TARGET_AVX512VL) I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here. AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones, and the == 32 and == 16 cases are handled because AVX512VL implies TARGET_AVX2 and TARGET_SSE4_1, doesn't it? > @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) > return false; > } > } > + else if (GET_MODE_SIZE (d->vmode) == 64) > + { > + if (!TARGET_AVX512BW) > + return false; > + if (vmode == V64QImode) > + { > + for (i = 0; i < nelt; ++i) > + if ((d->perm[i] ^ i) & (nelt / 4)) > + return false; Missing comment, I'd duplicate the /* vpshufb only works intra lanes, it is not possible to shuffle bytes in between the lanes. */ comment there. > @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > rtx (*gen) (rtx, rtx) = NULL; > switch (d->vmode) > { > + case V64QImode: > + if (TARGET_AVX512VL) VL? Isn't that BW? > + gen = gen_avx512bw_vec_dupv64qi; > + break; > case V32QImode: > gen = gen_avx2_pbroadcastv32qi_1; > break; > + case V32HImode: > + if (TARGET_AVX512VL) Ditto. > @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > mode = V8DImode; > else if (mode == V16SFmode) > mode = V16SImode; > + else if (mode == V4DFmode) > + mode = V4DImode; > + else if (mode == V2DFmode) > + mode = V2DImode; > + else if (mode == V8SFmode) > + mode = V8SImode; > + else if (mode == V4SFmode) > + mode = V4SImode; > for (i = 0; i < nelt; ++i) > vec[i] = GEN_INT (d->perm[i]); > rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec)); See above comment about CONST_VECTOR. > @@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct > expand_vec_perm_d *d) > return true; > > /* Try sequences of two instructions. */ > + /* ix86_expand_vec_perm_vpermi2 is also called from > + * ix86_expand_vec_perm. So it doesn't take d as parameter. > + * Construct needed params. */ > + rtx vec[64]; > + int i; > + for (i = 0; i < d->nelt; ++i) > + vec[i] = GEN_INT (d->perm[i]); > + rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec)); > + if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1)) > + return true; > > if (expand_vec_perm_pshuflw_pshufhw (d)) > return true; I don't understand this. Doesn't ix86_expand_vec_perm_vpermi2 generate (except for the V64QI case discussed above) a single insn? Then expand_vec_perm_1 should have handled that already, so this is just a waste of resources here. > @@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode > vmode, > /* Given sufficient ISA support we can just return true here > for selected vector modes. */ > if (d.vmode == V16SImode || d.vmode == V16SFmode > - || d.vmode == V8DFmode || d.vmode == V8DImode) > + || d.vmode == V8DFmode || d.vmode == V8DImode > + || d.vmode == V32HImode || d.vmode == V64QImode) > /* All implementable with a single vpermi2 insn. */ > return true; 1) Shouldn't this be guarded with TARGET_AVX512F && and in the V32HImode/V64QImode also with TARGET_AVX512BW? The comment is not correct for V64QImode. 2) For TARGET_AVX512VL, vpermi2 can handle also smaller mode sizes. Perhaps it would be best to turn this into switch (d.vmode) { case V16SImode: case V16SFmode: case V8DFmode: case V8DImode: if (TARGET_AVX512F) /* All implementable with a single vpermi2 insn. */ return true; break; case V32HImode: if (TARGET_AVX512BW) /* Implementable with a single vpermi2 insn. */ return true; break; case V64HImode: if (TARGET_AVX512BW) /* Implementable with 2 vpermi2w, 2 vpshufb and one vpor insns. */ return true; break; case V8SImode: case V8SFmode: case V4DFmode: case V4DImode: if (TARGET_AVX512VL) /* Implementable with a single vpermi2 insn. */ return true; break; case V16HImode: if (TARGET_AVX512VL && TARGET_AVX512BW) /* Implementable with a single vpermi2 insn. */ return true; if (TARGET_AVX2) /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ return true; break; case V32QImode: if (TARGET_AVX2) /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ return true; break; case V4SImode: case V4SFmode: case V8HImode: case V16QImode: /* All implementable with a single vpperm insn. */ if (TARGET_XOP) return true; /* All implementable with 2 pshufb + 1 ior. */ if (TARGET_SSSE3) return true; break; case V2DImode: case V2DFmode: /* All implementable with shufpd or unpck[lh]pd. */ return true; } Now, for V8SI/V8SF/V4DI/V4DF, I wonder if we have (for either AVX or AVX2) any expanders that guarantee we generate some sequence for all possible 2 operand constant permutations. I think ix86_expand_vec_perm is able to emit the non-constant permutations for all of these, so in theory we should have an upper bound for all these. Jakub