--- Comment #4 from John Platts <john_platts at hotmail dot com> ---
Here is another test program that exposes the optimization bug with applying
the vec_sld operation to a constant vector (which generates incorrect results on
both big-endian and little-endian POWER10 when compiled with the -O2
-mcpu=power10 options with GCC 12.1.0):
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#include <altivec.h>

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include <stdint.h>
#include <stddef.h>
#include <limits>
#include <type_traits>

// Primary template of the lane-size -> AltiVec type map. LaneSize is the lane
// width in bytes. The primary template declares no members; the type aliases
// are supplied by the explicit specializations for 1, 2, 4 and 8 bytes.
template<size_t LaneSize>
struct AltivecTypes {
};

struct AltivecTypes<1> {
  using UnsignedLaneT = unsigned char;
  using SignedLaneT = signed char;
  using UnsignedVectT = __vector unsigned char;
  using SignedVectT = __vector signed char;
  using BoolVectT = __vector __bool char;

struct AltivecTypes<2> {
  using UnsignedLaneT = unsigned short;
  using SignedLaneT = signed short;
  using UnsignedVectT = __vector unsigned short;
  using SignedVectT = __vector signed short;
  using BoolVectT = __vector __bool short;

struct AltivecTypes<4> {
  using UnsignedLaneT = unsigned int;
  using SignedLaneT = signed int;
  using FloatLaneT = float;
  using UnsignedVectT = __vector unsigned int;
  using SignedVectT = __vector signed int;
  using BoolVectT = __vector __bool int;
  using FloatVectT = __vector float;

struct AltivecTypes<8> {
  using UnsignedLaneT = unsigned long long;
  using SignedLaneT = signed long long;
  using FloatLaneT = double;
  using UnsignedVectT = __vector unsigned long long;
  using SignedVectT = __vector signed long long;
  using BoolVectT = __vector __bool long long;
  using FloatVectT = __vector double;

// Primary template of the lane-type -> vector-type selector.
//
// The three defaulted bool parameters classify T (signed, integral, floating
// point) and the trailing class parameter is the std::void_t detection slot
// matched by the partial specializations. The primary template declares no
// `type` member, so a T that matches no specialization has no vector type.
template<class T, bool = std::is_signed_v<T>, bool = std::is_integral_v<T>,
                  bool = std::is_floating_point_v<T>, class = void>
struct MakeAltivecVectorType {
};

// Signed integer lanes: selected when T is signed and integral and
// AltivecTypes<sizeof(T)> declares SignedVectT (checked through std::void_t).
template<class T>
struct MakeAltivecVectorType<T, true, true, false,
  std::void_t<typename AltivecTypes<sizeof(T)>::SignedVectT>> {
  using type = typename AltivecTypes<sizeof(T)>::SignedVectT;
};

// Unsigned integer lanes: selected when T is integral but not signed and
// AltivecTypes<sizeof(T)> declares UnsignedVectT (checked through std::void_t).
template<class T>
struct MakeAltivecVectorType<T, false, true, false,
  std::void_t<typename AltivecTypes<sizeof(T)>::UnsignedVectT>> {
  using type = typename AltivecTypes<sizeof(T)>::UnsignedVectT;
};

// Floating-point lanes: selected when T is a (signed) floating-point type and
// AltivecTypes<sizeof(T)> declares FloatVectT (checked through std::void_t).
template<class T>
struct MakeAltivecVectorType<T, true, false, true,
  std::void_t<typename AltivecTypes<sizeof(T)>::FloatVectT>> {
  using type = typename AltivecTypes<sizeof(T)>::FloatVectT;
};

// Shorthand for the vector type that MakeAltivecVectorType selects for a
// given lane type.
template<typename LaneT>
using AltivecVectorType = typename MakeAltivecVectorType<LaneT>::type;

// Partial-vector load: enabled when the N lanes of T occupy at most 8 bytes.
//
// Copies sizeof(T) * N bytes from src into an unsigned integer taken from
// AltivecTypes<sizeof(T) * N>, splats that integer with vec_splats, and
// reinterprets the result as the vector type for T.
// AltivecTypes only has members for sizes 1, 2, 4 and 8.
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N <= 8)>* = nullptr>
AltivecVectorType<T> LoadVector(const T* __restrict__ src) {
  using Bits = typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT;
  Bits bits;
  // Byte-wise copy rather than a pointer cast, so src needs no particular
  // alignment for the wider integer type.
  __builtin_memcpy(&bits, src, sizeof(T) * N);
  return reinterpret_cast<AltivecVectorType<T>>(vec_splats(bits));
}

// Full-vector load: enabled when the N lanes of T occupy exactly 16 bytes.
//
// Reads 16 bytes from src through a 16-byte-aligned, may_alias vector type and
// reinterprets them as the vector type for T. The __aligned__(16) attribute
// means src is treated as 16-byte aligned.
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N == 16)>* = nullptr>
AltivecVectorType<T> LoadVector(const T* __restrict__ src) {
  // NOTE(review): the right-hand side of this alias was missing from this copy
  // of the report. It is reconstructed here as the element type of the result
  // vector (same idiom as DoVectorShiftToU8) - confirm against the original.
  using LaneT =
    std::decay_t<decltype(std::declval<AltivecVectorType<T>>()[0])>;
  typedef LaneT LoadRawT
    __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
  const LoadRawT* __restrict__ p = reinterpret_cast<const LoadRawT*>(src);
  return reinterpret_cast<AltivecVectorType<T>>(*p);
}

// Builds a vector from N lanes of T where lane i holds first + i, computed in
// the unsigned counterpart of T, and loads it with LoadVector<N>.
template<class T, size_t N, class T2>
AltivecVectorType<T> Iota(const T2 first) {
    using TU = std::make_unsigned_t<T>;

    alignas(16) T lanes[N];
    for(size_t i = 0; i < N; i++) {
        // NOTE(review): the mask operand after '&' was missing from this copy
        // of the report. It is reconstructed as the all-ones value of TU
        // (<limits> is otherwise unused in the program) - confirm against the
        // original.
        lanes[i] = static_cast<T>(
            (static_cast<TU>(i) + static_cast<TU>(first)) &
            std::numeric_limits<TU>::max());
    }

    return LoadVector<N>(lanes);
}

// Produces the constant test vector for lane type T: a one-lane Iota starting
// at 0xFA578D00, converted to T's unsigned counterpart inside Iota.
template<class T>
AltivecVectorType<T> LoadTestVectToShift() {
    return Iota<T, 1>(uint32_t{0xFA578D00u});
}

// Applies vec_sld(vect, vect, k) with k = sizeof(lane) - 1 bytes and returns
// the result reinterpreted as a vector of unsigned char.
//
// FromV must be a vector type whose lanes are at least 2 bytes wide; this is
// enforced by the static_assert.
template<class FromV>
AltivecVectorType<uint8_t> DoVectorShiftToU8(FromV vect) {
  // Lane type is recovered from the vector's subscript expression.
  using FromLaneT = std::decay_t<decltype(std::declval<FromV>()[0])>;
  constexpr size_t sizeOfFromLane = sizeof(FromLaneT);
  static_assert(sizeOfFromLane >= 2, "sizeOfFromLane >= 2 must be true");

  return reinterpret_cast<__vector unsigned char>(
    vec_sld(vect, vect, sizeof(FromLaneT) - sizeof(unsigned char)));
}

// Shifts the 64-bit-lane test vector with the constant fully visible to the
// optimizer (no asm barrier, unlike U64ShiftedVect_2).
__vector unsigned char U64ShiftedVect() {
    // NOTE(review): the initializer was missing from this copy of the report.
    // It is reconstructed from the parallel U64ShiftedVect_2 - confirm against
    // the original.
    __vector unsigned char v =
        DoVectorShiftToU8(LoadTestVectToShift<uint64_t>());
    return v;
}

// Shifts the 32-bit-lane test vector with the constant fully visible to the
// optimizer (no asm barrier, unlike U32ShiftedVect_2).
__vector unsigned char U32ShiftedVect() {
    // NOTE(review): the initializer was missing from this copy of the report.
    // It is reconstructed from the parallel U32ShiftedVect_2 - confirm against
    // the original.
    __vector unsigned char v =
        DoVectorShiftToU8(LoadTestVectToShift<uint32_t>());
    return v;
}

// Shifts the 16-bit-lane test vector with the constant fully visible to the
// optimizer (no asm barrier, unlike U16ShiftedVect_2).
__vector unsigned char U16ShiftedVect() {
    // NOTE(review): the initializer was missing from this copy of the report.
    // It is reconstructed from the parallel U16ShiftedVect_2 - confirm against
    // the original.
    __vector unsigned char v =
        DoVectorShiftToU8(LoadTestVectToShift<uint16_t>());
    return v;
}

// Same computation as U64ShiftedVect, but the test vector is passed through
// an inline-asm statement as a read-write VSX register operand before the
// shift is applied.
__vector unsigned char U64ShiftedVect_2() {
    __vector unsigned long long u64_v = LoadTestVectToShift<uint64_t>();
    // NOTE(review): the opening line of this asm statement was missing from
    // this copy of the report. It is reconstructed as an empty template,
    // presumably meant to hide the constant from the optimizer - confirm
    // against the original.
    __asm__(""
            : "+wa" (u64_v));

    return DoVectorShiftToU8(u64_v);
}

// Same computation as U32ShiftedVect, but the test vector is passed through
// an inline-asm statement as a read-write VSX register operand before the
// shift is applied.
__vector unsigned char U32ShiftedVect_2() {
    __vector unsigned int u32_v = LoadTestVectToShift<uint32_t>();
    // NOTE(review): the opening line of this asm statement was missing from
    // this copy of the report. It is reconstructed as an empty template,
    // presumably meant to hide the constant from the optimizer - confirm
    // against the original.
    __asm__(""
            : "+wa" (u32_v));

    return DoVectorShiftToU8(u32_v);
}

// Same computation as U16ShiftedVect, but the test vector is passed through
// an inline-asm statement as a read-write VSX register operand before the
// shift is applied.
__vector unsigned char U16ShiftedVect_2() {
    __vector unsigned short u16_v = LoadTestVectToShift<uint16_t>();
    // NOTE(review): the opening line of this asm statement was missing from
    // this copy of the report. It is reconstructed as an empty template,
    // presumably meant to hide the constant from the optimizer - confirm
    // against the original.
    __asm__(""
            : "+wa" (u16_v));

    return DoVectorShiftToU8(u16_v);
}

Here is the assembly code that is generated when the above code is compiled
with the -O2 -mcpu=power10 options on the powerpc64-linux-gnu target:
        .file   "vsx_power10_shift_test_snippet_030923.cpp"
        .machine power10
        .section        ".text"
        .align 2
        .p2align 4,,15
        .globl _Z14U64ShiftedVectv
        .section        ".opd","aw"
        .align 3
        .quad   .L._Z14U64ShiftedVectv,.TOC.@tocbase,0
        .type   _Z14U64ShiftedVectv, @function
        plxv 34,.LC0@pcrel
        vsldoi 2,2,2,7
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z14U64ShiftedVectv,.-.L._Z14U64ShiftedVectv
        .align 2
        .p2align 4,,15
        .globl _Z14U32ShiftedVectv
        .section        ".opd","aw"
        .align 3
        .quad   .L._Z14U32ShiftedVectv,.TOC.@tocbase,0
        .type   _Z14U32ShiftedVectv, @function
        xxspltiw 34,4200041728
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z14U32ShiftedVectv,.-.L._Z14U32ShiftedVectv
        .align 2
        .p2align 4,,15
        .globl _Z14U16ShiftedVectv
        .section        ".opd","aw"
        .align 3
        .quad   .L._Z14U16ShiftedVectv,.TOC.@tocbase,0
        .type   _Z14U16ShiftedVectv, @function
        xxspltiw 34,2365623552
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z14U16ShiftedVectv,.-.L._Z14U16ShiftedVectv
        .align 2
        .p2align 4,,15
        .globl _Z16U64ShiftedVect_2v
        .section        ".opd","aw"
        .align 3
        .quad   .L._Z16U64ShiftedVect_2v,.TOC.@tocbase,0
        .type   _Z16U64ShiftedVect_2v, @function
        plxv 34,.LC0@pcrel
        vsldoi 2,2,2,7
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z16U64ShiftedVect_2v,.-.L._Z16U64ShiftedVect_2v
        .align 2
        .p2align 4,,15
        .globl _Z16U32ShiftedVect_2v
        .section        ".opd","aw"
        .align 3
        .quad   .L._Z16U32ShiftedVect_2v,.TOC.@tocbase,0
        .type   _Z16U32ShiftedVect_2v, @function
        xxspltiw 34,4200041728
        vsldoi 2,2,2,3
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z16U32ShiftedVect_2v,.-.L._Z16U32ShiftedVect_2v
        .align 2
        .p2align 4,,15
        .globl _Z16U16ShiftedVect_2v
        .section        ".opd","aw"
        .align 3
        .quad   .L._Z16U16ShiftedVect_2v,.TOC.@tocbase,0
        .type   _Z16U16ShiftedVect_2v, @function
        xxspltiw 34,2365623552
        vsldoi 2,2,2,1
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z16U16ShiftedVect_2v,.-.L._Z16U16ShiftedVect_2v
        .section        .rodata.cst16,"aM",@progbits,16
        .align 4
        .quad   4200041728
        .quad   4200041728
        .ident  "GCC: (Ubuntu 12.1.0-2ubuntu1~22.04) 12.1.0"

Here is the assembly code that is generated when the above code is compiled
with the -O2 -mcpu=power10 options on the powerpc64le-linux-gnu target:
        .file   "vsx_power10_shift_test_snippet_030923.cpp"
        .machine power10
        .abiversion 2
        .section        ".text"
        .align 2
        .p2align 4,,15
        .globl _Z14U64ShiftedVectv
        .type   _Z14U64ShiftedVectv, @function
        .localentry     _Z14U64ShiftedVectv,1
        plxv 34,.LC0@pcrel
        vsldoi 2,2,2,7
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z14U64ShiftedVectv,.-_Z14U64ShiftedVectv
        .align 2
        .p2align 4,,15
        .globl _Z14U32ShiftedVectv
        .type   _Z14U32ShiftedVectv, @function
        .localentry     _Z14U32ShiftedVectv,1
        xxspltiw 34,4200041728
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z14U32ShiftedVectv,.-_Z14U32ShiftedVectv
        .align 2
        .p2align 4,,15
        .globl _Z14U16ShiftedVectv
        .type   _Z14U16ShiftedVectv, @function
        .localentry     _Z14U16ShiftedVectv,1
        xxspltiw 34,2365623552
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z14U16ShiftedVectv,.-_Z14U16ShiftedVectv
        .align 2
        .p2align 4,,15
        .globl _Z16U64ShiftedVect_2v
        .type   _Z16U64ShiftedVect_2v, @function
        .localentry     _Z16U64ShiftedVect_2v,1
        plxv 34,.LC0@pcrel
        vsldoi 2,2,2,7
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z16U64ShiftedVect_2v,.-_Z16U64ShiftedVect_2v
        .align 2
        .p2align 4,,15
        .globl _Z16U32ShiftedVect_2v
        .type   _Z16U32ShiftedVect_2v, @function
        .localentry     _Z16U32ShiftedVect_2v,1
        xxspltiw 34,4200041728
        vsldoi 2,2,2,3
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z16U32ShiftedVect_2v,.-_Z16U32ShiftedVect_2v
        .align 2
        .p2align 4,,15
        .globl _Z16U16ShiftedVect_2v
        .type   _Z16U16ShiftedVect_2v, @function
        .localentry     _Z16U16ShiftedVect_2v,1
        xxspltiw 34,2365623552
        vsldoi 2,2,2,1
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .size   _Z16U16ShiftedVect_2v,.-_Z16U16ShiftedVect_2v
        .section        .rodata.cst16,"aM",@progbits,16
        .align 4
        .quad   4200041728
        .quad   4200041728
        .ident  "GCC: (Ubuntu 12.1.0-2ubuntu1~22.04) 12.1.0"
        .section        .note.GNU-stack,"",@progbits

Reply via email to