https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101846

            Bug ID: 101846
           Summary: Improve __builtin_shufflevector emitted code
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jakub at gcc dot gnu.org
  Target Milestone: ---

/* GNU vector types with short elements: v16hi is 32 bytes wide and v32hi is
   64 bytes wide (16 and 32 elements respectively when short is 2 bytes).  */
typedef short v16hi __attribute__((vector_size (32)));
typedef short v32hi __attribute__((vector_size (64)));

/* Interleave the elements of X with elements of an empty-initialized (all
   zero) vector: indices 0..15 pick from X, indices 16..31 pick from the
   second operand, so result element 2*i is X[i] and element 2*i+1 is zero.
   This is the pattern the report expects to become a single vpmovzxwd.  */
v32hi
foo (v16hi x)
{
  return __builtin_shufflevector (x, (v16hi) {}, 0, 16, 1, 17, 2, 18, 3, 19, 4,
20, 5, 21, 6, 22, 7, 23,
                                                 8, 24, 9, 25, 10, 26, 11, 27,
12, 28, 13, 29, 14, 30, 15, 31);
}

/* Keep only the even-numbered elements (0, 2, ..., 30) of X.  Both shuffle
   operands are X and every index is below 32, so only the first operand is
   ever selected from.  This is the pattern the report expects to become a
   single vpmovdw.  */
v16hi
bar (v32hi x)
{
  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
22, 24, 26, 28, 30);
}

shows two cases, where we should be emitting just
vpmovzxwd       %ymm0, %zmm0
and
vpmovdw %zmm0, %ymm0
but we actually emit
        vmovdqa %ymm0, %ymm0
        vpmovzxwd       %ymm0, %zmm0
where the vmovdqa is unnecessary - the permutation doesn't care about the
elements at or above 32 bytes - and, for the second function,
        vmovdqa64       %zmm0, %zmm1
        vmovdqa64       .LC0(%rip), %zmm0
        vpermi2w        %zmm1, %zmm1, %zmm0
Similarly for permutations matching other vpmovzx* or vpmov* instructions.

Reply via email to