https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069
Bug ID: 109069 Summary: Vector truncation test program produces incorrect result on big-endian powerpc64-linux-gnu with -mcpu=power10 -O2 Product: gcc Version: 12.1.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: john_platts at hotmail dot com Target Milestone: --- The following C++ test program generates a test failure when compiled for big-endian powerpc64-linux-gnu with GCC 12.2.1 with the -mcpu=power10 -O2 options:

// Save any existing definitions of the `vector`, `pixel` and `bool` macros,
// undefine them, include <altivec.h>, and then restore the saved definitions.
// The rest of the program uses the `__vector` / `__bool` spellings instead.
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool
#include <altivec.h>
#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <iostream>
#include <string_view>
#include <type_traits>

// Maps a lane size in bytes to the matching scalar lane types and 16-byte
// AltiVec vector types. The primary template is intentionally empty, so a
// lane size other than 1, 2, 4 or 8 has none of the member aliases.
template<size_t LaneSize> struct AltivecTypes { };

// 1-byte lanes: integer and bool vectors only.
template<> struct AltivecTypes<1> {
  using UnsignedLaneT = unsigned char;
  using SignedLaneT = signed char;
  using UnsignedVectT = __vector unsigned char;
  using SignedVectT = __vector signed char;
  using BoolVectT = __vector __bool char;
};

// 2-byte lanes: integer and bool vectors only.
template<> struct AltivecTypes<2> {
  using UnsignedLaneT = unsigned short;
  using SignedLaneT = signed short;
  using UnsignedVectT = __vector unsigned short;
  using SignedVectT = __vector signed short;
  using BoolVectT = __vector __bool short;
};

// 4-byte lanes: additionally provides the float lane/vector aliases.
template<> struct AltivecTypes<4> {
  using UnsignedLaneT = unsigned int;
  using SignedLaneT = signed int;
  using FloatLaneT = float;
  using UnsignedVectT = __vector unsigned int;
  using SignedVectT = __vector signed int;
  using BoolVectT = __vector __bool int;
  using FloatVectT = __vector float;
};

// 8-byte lanes: additionally provides the double lane/vector aliases.
template<> struct AltivecTypes<8> {
  using UnsignedLaneT = unsigned long long;
  using SignedLaneT = signed long long;
  using FloatLaneT = double;
  using UnsignedVectT = __vector unsigned long long;
  using SignedVectT = __vector signed long long;
  using BoolVectT = __vector __bool long long;
  using FloatVectT = __vector double;
};
// Selects the AltiVec vector type for scalar type T. The three bool parameters
// default to is_signed / is_integral / is_floating_point of T; the trailing
// std::void_t parameter makes a specialization match only when the needed
// alias exists in AltivecTypes<sizeof(T)>. The primary template has no `type`.
template<class T, bool = std::is_signed_v<T>, bool = std::is_integral_v<T>,
         bool = std::is_floating_point_v<T>, class = void>
struct MakeAltivecVectorType { };

// Signed integral T -> SignedVectT.
template<class T>
struct MakeAltivecVectorType<T, true, true, false,
    std::void_t<typename AltivecTypes<sizeof(T)>::SignedVectT>> {
  using type = typename AltivecTypes<sizeof(T)>::SignedVectT;
};

// Unsigned integral T -> UnsignedVectT.
template<class T>
struct MakeAltivecVectorType<T, false, true, false,
    std::void_t<typename AltivecTypes<sizeof(T)>::UnsignedVectT>> {
  using type = typename AltivecTypes<sizeof(T)>::UnsignedVectT;
};

// Floating-point T (is_signed true, is_integral false) -> FloatVectT.
template<class T>
struct MakeAltivecVectorType<T, true, false, true,
    std::void_t<typename AltivecTypes<sizeof(T)>::FloatVectT>> {
  using type = typename AltivecTypes<sizeof(T)>::FloatVectT;
};

// Convenience alias for the vector type selected for T.
template<class T>
using AltivecVectorType = typename MakeAltivecVectorType<T>::type;

// Partial-vector load: enabled when N lanes of T occupy at most 8 bytes.
// Copies sizeof(T) * N bytes from src into an unsigned integer of exactly that
// size, passes it to vec_splats, and reinterprets the result as the vector
// type for T.
// Note: AltivecTypes<sizeof(T) * N>::UnsignedLaneT is named inside the body, so
// sizeof(T) * N must be 1, 2, 4 or 8; other sizes fail to compile here.
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N <= 8)>* = nullptr>
AltivecVectorType<T> LoadVector(const T* __restrict__ src) {
  using Bits = typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT;
  Bits bits;
  // memcpy avoids dereferencing src through a differently-typed pointer.
  __builtin_memcpy(&bits, src, sizeof(T) * N);
  return reinterpret_cast<AltivecVectorType<T>>(vec_splats(bits));
}

// Full-vector load: enabled when N lanes of T occupy exactly 16 bytes.
// Reads *src through a 16-byte vector type declared with __may_alias__.
// The typedef also declares 16-byte alignment, so src is presumably expected
// to be 16-byte aligned (the test below declares its arrays alignas(16)).
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N == 16)>* = nullptr>
AltivecVectorType<T> LoadVector(const T* __restrict__ src) {
  // Element type of the vector, obtained from its subscript expression.
  using LaneT = std::decay_t<decltype(std::declval<AltivecVectorType<T>>()[0])>;
  typedef LaneT LoadRawT __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
  const LoadRawT* __restrict__ p = reinterpret_cast<const LoadRawT*>(src);
  return reinterpret_cast<AltivecVectorType<T>>(*p);
}

// Partial-vector store: enabled when N lanes of T occupy at most 8 bytes.
// Views vect as a vector whose lanes are sizeof(T) * N bytes wide, takes
// element 0 of that view, and copies its bytes to dest.
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N <= 8)>* = nullptr>
void StoreVector(T* __restrict__ dest, AltivecVectorType<T> vect) {
  using Bits = typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT;
  typedef Bits BitsVectT __attribute__((__vector_size__(16)));
  const Bits bits = reinterpret_cast<BitsVectT>(vect)[0];
  __builtin_memcpy(dest, &bits, sizeof(T) * N);
}

// Full-vector store: enabled when N lanes of T occupy exactly 16 bytes.
// Writes vect to *dest through a 16-byte, 16-byte-aligned, __may_alias__
// vector type (mirror image of the 16-byte LoadVector overload).
template<size_t N,
         class T, std::enable_if_t<(sizeof(T) * N == 16)>* = nullptr>
void StoreVector(T* __restrict__ dest, AltivecVectorType<T> vect) {
  using LaneT = std::decay_t<decltype(std::declval<AltivecVectorType<T>>()[0])>;
  typedef LaneT StoreRawT __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
  StoreRawT* __restrict__ p = reinterpret_cast<StoreRawT*>(dest);
  *p = reinterpret_cast<StoreRawT>(vect);
}

// Narrows the first N lanes of `vect` to unsigned 8-bit lanes.
//   N     - number of meaningful lanes in `vect` (must be >= 1).
//   FromV - source vector type; its lanes must be at least 2 bytes wide.
// Returns a vector of unsigned char.
template<size_t N, class FromV>
AltivecVectorType<uint8_t> AltivecTruncateToU8(FromV vect) {
  static_assert(N >= 1, "N >= 1 must be true");
  using FromLaneT = std::decay_t<decltype(std::declval<FromV>()[0])>;
  constexpr size_t sizeOfFromLane = sizeof(FromLaneT);
  static_assert(sizeOfFromLane >= 2, "sizeOfFromLane >= 2 must be true");
  if constexpr(N == 1) {
    // Single-lane case. The report's failing configurations ("... with 1
    // lanes") go through this branch.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    // Little-endian: the vector is reinterpreted as bytes without any shuffle.
    return reinterpret_cast<__vector unsigned char>(vect);
#else
    // Otherwise: vec_sld by (lane size - 1) bytes before reinterpreting --
    // presumably to bring the least significant byte of lane 0 to byte 0.
    return reinterpret_cast<__vector unsigned char>(
        vec_sld(vect, vect, sizeof(FromLaneT) - sizeof(unsigned char)));
#endif
  } else {
    if constexpr(sizeOfFromLane >= 4) {
      // Lanes of 4+ bytes: vec_pack the vector with itself, then recurse on
      // the result with the same N.
      return AltivecTruncateToU8<N>(vec_pack(vect, vect));
    } else {
      // 2-byte lanes: view as unsigned short and pack once to get bytes.
      const __vector unsigned short u16Vect = reinterpret_cast<__vector unsigned short>(vect);
      return vec_pack(u16Vect, u16Vect);
    }
  }
}

// Number of failed test configurations; incremented by DoTruncateToU8Test and
// reported by main.
static int TestsFailedCount = 0;

// Converts character-typed values to an integer type so that streaming them
// prints a number rather than a character; any other type is forwarded as-is.
//   char / signed char -> short (unsigned short when plain char is unsigned)
//   unsigned char      -> unsigned short
//   char16_t, char8_t  -> uint_least16_t (char8_t only when the feature-test
//                         macro __cpp_char8_t is available)
//   char32_t           -> uint_least32_t
//   wchar_t            -> make_signed_t / make_unsigned_t of wchar_t
template<class T>
static constexpr decltype(auto) CharToNumber(T&& val) {
  using DecayT = std::decay_t<T>;
  if constexpr(std::is_same_v<DecayT, char>) {
    if constexpr(std::is_signed_v<char>)
      return static_cast<short>(val);
    else
      return static_cast<unsigned short>(val);
  } else if constexpr(std::is_same_v<DecayT, signed char>) {
    return static_cast<short>(val);
  } else if constexpr(std::is_same_v<DecayT, unsigned char>) {
    return static_cast<unsigned short>(val);
  } else if constexpr(std::is_same_v<DecayT, char16_t>
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
      || std::is_same_v<DecayT, char8_t>
#endif
      ) {
    return static_cast<uint_least16_t>(val);
  } else if constexpr(std::is_same_v<DecayT, char32_t>) {
    return
        static_cast<uint_least32_t>(val);
  } else if constexpr(std::is_same_v<DecayT, wchar_t>) {
    if constexpr(std::is_signed_v<wchar_t>)
      return static_cast<std::make_signed_t<wchar_t>>(val);
    else
      return static_cast<std::make_unsigned_t<wchar_t>>(val);
  } else {
    // Not a character type: forward the argument unchanged.
    return static_cast<T&&>(val);
  }
}

// Writes the elements of `vals` to std::cout, separated by ", ", with each
// element passed through CharToNumber first.
template<class T, size_t N>
inline void PrintValuesToCout(T (&vals)[N]) {
  using namespace std::string_view_literals;
  for(size_t i = 0; i < N; i++) {
    if(i != 0) std::cout << ", "sv;
    std::cout << CharToNumber(vals[i]);
  }
}

// Runs the truncation test for N lanes of FromLaneT, then repeats it for
// N / 2 lanes, and so on down to 1 lane.
// On the first mismatching lane it prints the expected and actual arrays,
// increments TestsFailedCount once, and stops comparing for that N.
template<class FromLaneT, size_t N>
void DoTruncateToU8Test() {
  using namespace std::string_view_literals;
  {
    // The low byte of `base` is 0x00, so the expected byte for lane i is i.
    constexpr uint32_t base = 0xFA578D00;
    alignas(16) FromLaneT srcValues[N];
    alignas(16) uint8_t expectedValues[N];
    alignas(16) uint8_t actualValues[N];
    for(size_t i = 0; i < N; i++) {
      srcValues[i] = static_cast<FromLaneT>(base + static_cast<FromLaneT>(i));
      expectedValues[i] = static_cast<uint8_t>(base + static_cast<uint8_t>(i));
    }
    // Load -> truncate -> store round trip under test.
    auto srcVect = LoadVector<N>(srcValues);
    auto resultVect = AltivecTruncateToU8<N>(srcVect);
    StoreVector<N>(actualValues, resultVect);
    for(size_t i = 0; i < N; i++) {
      if(expectedValues[i] != actualValues[i]) {
        std::cout << "Test failed for uint"sv << (sizeof(FromLaneT) << 3) << "_t lane type with "sv << N << " lanes\n"sv;
        std::cout << "Expected values:\n "sv;
        PrintValuesToCout(expectedValues);
        std::cout << "\nActual values:\n "sv;
        PrintValuesToCout(actualValues);
        std::cout << '\n';
        ++TestsFailedCount;
        // Count each configuration at most once.
        break;
      }
    }
  }
  // Recurse with half the lanes; the N == 1 instantiation ends the recursion.
  if constexpr(N >= 2) {
    DoTruncateToU8Test<FromLaneT, N / 2>();
  }
}

// Runs the test for 64-, 32- and 16-bit source lanes (each starting from a
// full 16-byte vector), prints the number of failed configurations, and
// returns 1 if any failed, 0 otherwise.
int main(int argc, char** argv) {
  using namespace std::string_view_literals;
  DoTruncateToU8Test<uint64_t, 2>();
  DoTruncateToU8Test<uint32_t, 4>();
  DoTruncateToU8Test<uint16_t, 8>();
  const auto failCnt = TestsFailedCount;
  std::cout << failCnt << " tests failed\n"sv;
  return static_cast<int>(failCnt != 0);
}

Here is the expected output of the above program: 0 tests failed Here is the output that is generated when the above program is compiled with gcc 12.2.1 with
the -mcpu=power10 -O2 options:

Test failed for uint32_t lane type with 1 lanes
Expected values:
 0
Actual values:
 250
Test failed for uint16_t lane type with 1 lanes
Expected values:
 0
Actual values:
 141
2 tests failed

The program above does generate the correct result when compiled with the -mcpu=power9 -O2 options on the powerpc64-linux-gnu target.

The C++ test program above also generates the correct result when compiled with clang 15 using the -mcpu=power10 -std=c++17 -O2 options.