https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115916
Bug ID: 115916
Summary: [15 Regression] wrong code on highway-1.2.0
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: slyfox at gcc dot gnu.org
Target Milestone: ---

Seemingly a recent regression. On gcc-master r15-2026-g44c9403ed1833a highway-1.2.0 testsuite fails as:

    The following tests FAILED:
    1190 - HwyShuffle4TestGroup/HwyShuffle4Test.TestAllPer4LaneBlockShuffle/EMU128 # GetParam() = 2305843009213693952 (Subprocess aborted)

Here is the minimized version extracted out of original example:

// $ cat shuffle4_test.cc
#include <stddef.h>
#include <stdint.h>

struct ve {
    ve() = default;
    ve(const ve&) = default;
    ve& operator=(const ve&) = default;

    // note that the code usually uses the first half of this array
    uint8_t raw[16] = {};
};

static ve First8_(void) {
    ve m;
    __builtin_memset(m.raw, 0xff, 8);
    return m;
}

static ve And_(ve a, ve b) {
    ve au;
    __builtin_memcpy(au.raw, a.raw, 16);
    for (size_t i = 0; i < 8; ++i) {
        au.raw[i] &= b.raw[i];
    }
    return au;
}

__attribute__((noipa, optimize(0)))
static void vec_assert(ve a) {
    if (a.raw[6] != 0x06 && a.raw[6] != 0x07)
        __builtin_trap();
}

static ve Reverse4_(ve v) {
    ve ret;
    for (size_t i = 0; i < 8; i += 4) {
        ret.raw[i + 0] = v.raw[i + 3];
        ret.raw[i + 1] = v.raw[i + 2];
        ret.raw[i + 2] = v.raw[i + 1];
        ret.raw[i + 3] = v.raw[i + 0];
    }
    return ret;
}

static ve DupEven_(ve v) {
    for (size_t i = 0; i < 8; i += 2) {
        v.raw[i + 1] = v.raw[i];
    }
    return v;
}

template <bool b>
ve Per4LaneBlockShuffle_(ve v) {
    if (b) {
        return Reverse4_(v);
    } else {
        return DupEven_(v);
    }
}

template <bool b>
static inline __attribute__((always_inline)) void DoTestPer4LaneBlkShuffle(const ve v) {
    ve actual = Per4LaneBlockShuffle_<b>(v);
    const auto valid_lanes_mask = First8_();
    ve actual_masked = And_(valid_lanes_mask, actual);
    vec_assert(actual_masked);
}

static void DoTestPer4LaneBlkShuffles(const ve v) {
    alignas(128) uint8_t src_lanes[8];
    __builtin_memcpy(src_lanes, v.raw, 8);
    // need both, hm
    DoTestPer4LaneBlkShuffle<true >(v);
    DoTestPer4LaneBlkShuffle<false>(v);
}

__attribute__((noipa, optimize(0)))
static void bug(void) {
    uint8_t iv[8] = {1,2,3,4,5,6,7,8};
    ve v;
    __builtin_memcpy(v.raw, iv, 8);
    DoTestPer4LaneBlkShuffles(v);
}

int main(void) {
    bug();
}

Bad:

    $ gcc/xg++ -Bgcc shuffle4_test.cc -O3 -o bug && ./bug
    Illegal instruction (core dumped)

Ok:

    $ gcc/xg++ -Bgcc shuffle4_test.cc -O2 -o bug && ./bug

    $ gcc/xg++ -Bgcc -v
    Reading specs from gcc/specs
    COLLECT_GCC=gcc/xg++
    COLLECT_LTO_WRAPPER=gcc/lto-wrapper
    Target: x86_64-pc-linux-gnu
    Configured with: /home/slyfox/dev/git/gcc/configure --disable-multilib --disable-bootstrap --disable-lto --disable-libsanitizer --disable-libstdcxx-pch --enable-languages=c,c++ --disable-libgomp --disable-libquadmath --disable-libvtv CFLAGS='-O1 -g0' CXXFLAGS='-O1 -g0' LDFLAGS='-O1 -g0'
    Thread model: posix
    Supported LTO compression algorithms: zlib
    gcc version 15.0.0 20240714 (experimental) (GCC)