https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069
--- Comment #4 from John Platts <john_platts at hotmail dot com> --- Here is another test program that exposes the optimization bug with applying the vec_sld operation to a constant vector (which generates incorrect results on both big-endian and little-endian POWER10 when compiled with the -O2 -mcpu=power10 options with GCC 12.1.0): #pragma push_macro("vector") #pragma push_macro("pixel") #pragma push_macro("bool") #undef vector #undef pixel #undef bool #include <altivec.h> #pragma pop_macro("vector") #pragma pop_macro("pixel") #pragma pop_macro("bool") #include <stdint.h> #include <stddef.h> #include <limits> #include <type_traits> template<size_t LaneSize> struct AltivecTypes { }; template<> struct AltivecTypes<1> { using UnsignedLaneT = unsigned char; using SignedLaneT = signed char; using UnsignedVectT = __vector unsigned char; using SignedVectT = __vector signed char; using BoolVectT = __vector __bool char; }; template<> struct AltivecTypes<2> { using UnsignedLaneT = unsigned short; using SignedLaneT = signed short; using UnsignedVectT = __vector unsigned short; using SignedVectT = __vector signed short; using BoolVectT = __vector __bool short; }; template<> struct AltivecTypes<4> { using UnsignedLaneT = unsigned int; using SignedLaneT = signed int; using FloatLaneT = float; using UnsignedVectT = __vector unsigned int; using SignedVectT = __vector signed int; using BoolVectT = __vector __bool int; using FloatVectT = __vector float; }; template<> struct AltivecTypes<8> { using UnsignedLaneT = unsigned long long; using SignedLaneT = signed long long; using FloatLaneT = double; using UnsignedVectT = __vector unsigned long long; using SignedVectT = __vector signed long long; using BoolVectT = __vector __bool long long; using FloatVectT = __vector double; }; template<class T, bool = std::is_signed_v<T>, bool = std::is_integral_v<T>, bool = std::is_floating_point_v<T>, class = void> struct MakeAltivecVectorType { }; template<class T> struct MakeAltivecVectorType<T, 
true, true, false, std::void_t<typename AltivecTypes<sizeof(T)>::SignedVectT>> { using type = typename AltivecTypes<sizeof(T)>::SignedVectT; }; template<class T> struct MakeAltivecVectorType<T, false, true, false, std::void_t<typename AltivecTypes<sizeof(T)>::UnsignedVectT>> { using type = typename AltivecTypes<sizeof(T)>::UnsignedVectT; }; template<class T> struct MakeAltivecVectorType<T, true, false, true, std::void_t<typename AltivecTypes<sizeof(T)>::FloatVectT>> { using type = typename AltivecTypes<sizeof(T)>::FloatVectT; }; template<class T> using AltivecVectorType = typename MakeAltivecVectorType<T>::type; template<size_t N, class T, std::enable_if_t<(sizeof(T) * N <= 8)>* = nullptr> AltivecVectorType<T> LoadVector(const T* __restrict__ src) { using Bits = typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT; Bits bits; __builtin_memcpy(&bits, src, sizeof(T) * N); return reinterpret_cast<AltivecVectorType<T>>(vec_splats(bits)); } template<size_t N, class T, std::enable_if_t<(sizeof(T) * N == 16)>* = nullptr> AltivecVectorType<T> LoadVector(const T* __restrict__ src) { using LaneT = std::decay_t<decltype(std::declval<AltivecVectorType<T>>()[0])>; typedef LaneT LoadRawT __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); const LoadRawT* __restrict__ p = reinterpret_cast<const LoadRawT*>(src); return reinterpret_cast<AltivecVectorType<T>>(*p); } template<class T, size_t N, class T2> AltivecVectorType<T> Iota(const T2 first) { using TU = std::make_unsigned_t<T>; alignas(16) T lanes[N]; for(size_t i = 0; i < N; i++) { lanes[i] = static_cast<T>( (static_cast<TU>(i) + static_cast<TU>(first)) & std::numeric_limits<TU>::max()); } return LoadVector<N>(lanes); } template<class T> AltivecVectorType<T> LoadTestVectToShift() { return Iota<T, 1>(uint32_t{0xFA578D00u}); } template<class FromV> AltivecVectorType<uint8_t> DoVectorShiftToU8(FromV vect) { using FromLaneT = std::decay_t<decltype(std::declval<FromV>()[0])>; constexpr size_t sizeOfFromLane = 
sizeof(FromLaneT); static_assert(sizeOfFromLane >= 2, "sizeOfFromLane >= 2 must be true"); return reinterpret_cast<__vector unsigned char>( vec_sld(vect, vect, sizeof(FromLaneT) - sizeof(unsigned char))); } __vector unsigned char U64ShiftedVect() { __vector unsigned char v = DoVectorShiftToU8(LoadTestVectToShift<uint64_t>()); return v; } __vector unsigned char U32ShiftedVect() { __vector unsigned char v = DoVectorShiftToU8(LoadTestVectToShift<uint32_t>()); return v; } __vector unsigned char U16ShiftedVect() { __vector unsigned char v = DoVectorShiftToU8(LoadTestVectToShift<uint16_t>()); return v; } __vector unsigned char U64ShiftedVect_2() { __vector unsigned long long u64_v = LoadTestVectToShift<uint64_t>(); __asm__("" : "+wa" (u64_v)); return DoVectorShiftToU8(u64_v); } __vector unsigned char U32ShiftedVect_2() { __vector unsigned int u32_v = LoadTestVectToShift<uint32_t>(); __asm__("" : "+wa" (u32_v)); return DoVectorShiftToU8(u32_v); } __vector unsigned char U16ShiftedVect_2() { __vector unsigned short u16_v = LoadTestVectToShift<uint16_t>(); __asm__("" : "+wa" (u16_v)); return DoVectorShiftToU8(u16_v); } Here is the assembly code that is generated when the above code is compiled with the -O2 -mcpu=power10 options on the powerpc64-linux-gnu target: .file "vsx_power10_shift_test_snippet_030923.cpp" .machine power10 .section ".text" .align 2 .p2align 4,,15 .globl _Z14U64ShiftedVectv .section ".opd","aw" .align 3 _Z14U64ShiftedVectv: .quad .L._Z14U64ShiftedVectv,.TOC.@tocbase,0 .previous .type _Z14U64ShiftedVectv, @function .L._Z14U64ShiftedVectv: .LFB206: .cfi_startproc plxv 34,.LC0@pcrel vsldoi 2,2,2,7 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE206: .size _Z14U64ShiftedVectv,.-.L._Z14U64ShiftedVectv .align 2 .p2align 4,,15 .globl _Z14U32ShiftedVectv .section ".opd","aw" .align 3 _Z14U32ShiftedVectv: .quad .L._Z14U32ShiftedVectv,.TOC.@tocbase,0 .previous .type _Z14U32ShiftedVectv, @function .L._Z14U32ShiftedVectv: .LFB207: .cfi_startproc xxspltiw 
34,4200041728 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE207: .size _Z14U32ShiftedVectv,.-.L._Z14U32ShiftedVectv .align 2 .p2align 4,,15 .globl _Z14U16ShiftedVectv .section ".opd","aw" .align 3 _Z14U16ShiftedVectv: .quad .L._Z14U16ShiftedVectv,.TOC.@tocbase,0 .previous .type _Z14U16ShiftedVectv, @function .L._Z14U16ShiftedVectv: .LFB208: .cfi_startproc xxspltiw 34,2365623552 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE208: .size _Z14U16ShiftedVectv,.-.L._Z14U16ShiftedVectv .align 2 .p2align 4,,15 .globl _Z16U64ShiftedVect_2v .section ".opd","aw" .align 3 _Z16U64ShiftedVect_2v: .quad .L._Z16U64ShiftedVect_2v,.TOC.@tocbase,0 .previous .type _Z16U64ShiftedVect_2v, @function .L._Z16U64ShiftedVect_2v: .LFB209: .cfi_startproc plxv 34,.LC0@pcrel vsldoi 2,2,2,7 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE209: .size _Z16U64ShiftedVect_2v,.-.L._Z16U64ShiftedVect_2v .align 2 .p2align 4,,15 .globl _Z16U32ShiftedVect_2v .section ".opd","aw" .align 3 _Z16U32ShiftedVect_2v: .quad .L._Z16U32ShiftedVect_2v,.TOC.@tocbase,0 .previous .type _Z16U32ShiftedVect_2v, @function .L._Z16U32ShiftedVect_2v: .LFB210: .cfi_startproc xxspltiw 34,4200041728 vsldoi 2,2,2,3 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE210: .size _Z16U32ShiftedVect_2v,.-.L._Z16U32ShiftedVect_2v .align 2 .p2align 4,,15 .globl _Z16U16ShiftedVect_2v .section ".opd","aw" .align 3 _Z16U16ShiftedVect_2v: .quad .L._Z16U16ShiftedVect_2v,.TOC.@tocbase,0 .previous .type _Z16U16ShiftedVect_2v, @function .L._Z16U16ShiftedVect_2v: .LFB211: .cfi_startproc xxspltiw 34,2365623552 vsldoi 2,2,2,1 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE211: .size _Z16U16ShiftedVect_2v,.-.L._Z16U16ShiftedVect_2v .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC0: .quad 4200041728 .quad 4200041728 .ident "GCC: (Ubuntu 12.1.0-2ubuntu1~22.04) 12.1.0" Here is the assembly code that is generated when the above code is compiled with the -O2 -mcpu=power10 options on the powerpc64le-linux-gnu target: .file 
"vsx_power10_shift_test_snippet_030923.cpp" .machine power10 .abiversion 2 .section ".text" .align 2 .p2align 4,,15 .globl _Z14U64ShiftedVectv .type _Z14U64ShiftedVectv, @function _Z14U64ShiftedVectv: .LFB206: .cfi_startproc .localentry _Z14U64ShiftedVectv,1 plxv 34,.LC0@pcrel vsldoi 2,2,2,7 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE206: .size _Z14U64ShiftedVectv,.-_Z14U64ShiftedVectv .align 2 .p2align 4,,15 .globl _Z14U32ShiftedVectv .type _Z14U32ShiftedVectv, @function _Z14U32ShiftedVectv: .LFB207: .cfi_startproc .localentry _Z14U32ShiftedVectv,1 xxspltiw 34,4200041728 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE207: .size _Z14U32ShiftedVectv,.-_Z14U32ShiftedVectv .align 2 .p2align 4,,15 .globl _Z14U16ShiftedVectv .type _Z14U16ShiftedVectv, @function _Z14U16ShiftedVectv: .LFB208: .cfi_startproc .localentry _Z14U16ShiftedVectv,1 xxspltiw 34,2365623552 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE208: .size _Z14U16ShiftedVectv,.-_Z14U16ShiftedVectv .align 2 .p2align 4,,15 .globl _Z16U64ShiftedVect_2v .type _Z16U64ShiftedVect_2v, @function _Z16U64ShiftedVect_2v: .LFB209: .cfi_startproc .localentry _Z16U64ShiftedVect_2v,1 plxv 34,.LC0@pcrel vsldoi 2,2,2,7 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE209: .size _Z16U64ShiftedVect_2v,.-_Z16U64ShiftedVect_2v .align 2 .p2align 4,,15 .globl _Z16U32ShiftedVect_2v .type _Z16U32ShiftedVect_2v, @function _Z16U32ShiftedVect_2v: .LFB210: .cfi_startproc .localentry _Z16U32ShiftedVect_2v,1 xxspltiw 34,4200041728 vsldoi 2,2,2,3 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE210: .size _Z16U32ShiftedVect_2v,.-_Z16U32ShiftedVect_2v .align 2 .p2align 4,,15 .globl _Z16U16ShiftedVect_2v .type _Z16U16ShiftedVect_2v, @function _Z16U16ShiftedVect_2v: .LFB211: .cfi_startproc .localentry _Z16U16ShiftedVect_2v,1 xxspltiw 34,2365623552 vsldoi 2,2,2,1 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE211: .size _Z16U16ShiftedVect_2v,.-_Z16U16ShiftedVect_2v .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC0: 
.quad 4200041728 .quad 4200041728 .ident "GCC: (Ubuntu 12.1.0-2ubuntu1~22.04) 12.1.0" .section .note.GNU-stack,"",@progbits