https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98855
Martin Liška <marxin at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Ever confirmed|0 |1 Status|UNCONFIRMED |NEW Last reconfirmed| |2021-01-29 --- Comment #3 from Martin Liška <marxin at gcc dot gnu.org> --- So it's still there after the fix for PR98845. I briefly looked at src/lib/block/xtea/xtea.cpp: one can isolate the one problematic SLP: -fdbg-cnt=vect_slp:4-4 build/include/botan/loadstor.h:470:15: note: Basic block will be vectorized using SLP build/include/botan/loadstor.h:470:15: note: Vectorizing SLP tree: build/include/botan/loadstor.h:470:15: note: node 0x2cabac0 (max_nunits=4, refcnt=1) build/include/botan/loadstor.h:470:15: note: op template: MEM <unsigned int> [(char * {ref-all})_66] = _133; build/include/botan/loadstor.h:470:15: note: stmt 0 MEM <unsigned int> [(char * {ref-all})_66] = _133; build/include/botan/loadstor.h:470:15: note: stmt 1 MEM <unsigned int> [(char * {ref-all})_66 + 4B] = _134; build/include/botan/loadstor.h:470:15: note: stmt 2 MEM <unsigned int> [(char * {ref-all})_66 + 8B] = _135; build/include/botan/loadstor.h:470:15: note: stmt 3 MEM <unsigned int> [(char * {ref-all})_66 + 12B] = _136; build/include/botan/loadstor.h:470:15: note: stmt 4 MEM <unsigned int> [(char * {ref-all})_66 + 16B] = _137; build/include/botan/loadstor.h:470:15: note: stmt 5 MEM <unsigned int> [(char * {ref-all})_66 + 20B] = _138; build/include/botan/loadstor.h:470:15: note: stmt 6 MEM <unsigned int> [(char * {ref-all})_66 + 24B] = _139; build/include/botan/loadstor.h:470:15: note: stmt 7 MEM <unsigned int> [(char * {ref-all})_66 + 28B] = _140; build/include/botan/loadstor.h:470:15: note: children 0x2cabb40 build/include/botan/loadstor.h:470:15: note: node 0x2cabb40 (max_nunits=4, refcnt=1) build/include/botan/loadstor.h:470:15: note: op template: _133 = __builtin_bswap32 (_92); build/include/botan/loadstor.h:470:15: note: stmt 0 _133 = __builtin_bswap32 (_92); 
build/include/botan/loadstor.h:470:15: note: stmt 1 _134 = __builtin_bswap32 (_592); build/include/botan/loadstor.h:470:15: note: stmt 2 _135 = __builtin_bswap32 (_90); build/include/botan/loadstor.h:470:15: note: stmt 3 _136 = __builtin_bswap32 (_591); build/include/botan/loadstor.h:470:15: note: stmt 4 _137 = __builtin_bswap32 (_594); build/include/botan/loadstor.h:470:15: note: stmt 5 _138 = __builtin_bswap32 (_590); build/include/botan/loadstor.h:470:15: note: stmt 6 _139 = __builtin_bswap32 (_593); build/include/botan/loadstor.h:470:15: note: stmt 7 _140 = __builtin_bswap32 (_589); build/include/botan/loadstor.h:470:15: note: children 0x2cabbc0 build/include/botan/loadstor.h:470:15: note: node 0x2cabbc0 (max_nunits=4, refcnt=1) build/include/botan/loadstor.h:470:15: note: op template: _92 = PHI <_14(17)> build/include/botan/loadstor.h:470:15: note: stmt 0 _92 = PHI <_14(17)> build/include/botan/loadstor.h:470:15: note: stmt 1 _592 = PHI <_46(17)> build/include/botan/loadstor.h:470:15: note: stmt 2 _90 = PHI <_23(17)> build/include/botan/loadstor.h:470:15: note: stmt 3 _591 = PHI <_52(17)> build/include/botan/loadstor.h:470:15: note: stmt 4 _594 = PHI <_31(17)> build/include/botan/loadstor.h:470:15: note: stmt 5 _590 = PHI <_58(17)> build/include/botan/loadstor.h:470:15: note: stmt 6 _593 = PHI <_37(17)> build/include/botan/loadstor.h:470:15: note: stmt 7 _589 = PHI <_64(17)> build/include/botan/loadstor.h:470:15: note: children 0x2cabc40 build/include/botan/loadstor.h:470:15: note: node 0x2cabc40 (max_nunits=4, refcnt=2) build/include/botan/loadstor.h:470:15: note: op template: _14 = _13 + L0_224; build/include/botan/loadstor.h:470:15: note: stmt 0 _14 = _13 + L0_224; build/include/botan/loadstor.h:470:15: note: stmt 1 _46 = _45 + R0_225; build/include/botan/loadstor.h:470:15: note: stmt 2 _23 = _22 + L1_226; build/include/botan/loadstor.h:470:15: note: stmt 3 _52 = _51 + R1_227; build/include/botan/loadstor.h:470:15: note: stmt 4 _31 = _30 + L2_228; 
build/include/botan/loadstor.h:470:15: note: stmt 5 _58 = _57 + R2_229; build/include/botan/loadstor.h:470:15: note: stmt 6 _37 = _36 + L3_230; build/include/botan/loadstor.h:470:15: note: stmt 7 _64 = _63 + R3_231; build/include/botan/loadstor.h:470:15: note: children 0x2cabcc0 0x2cabdc0 build/include/botan/loadstor.h:470:15: note: node 0x2cabcc0 (max_nunits=4, refcnt=1) build/include/botan/loadstor.h:470:15: note: op template: _13 = _9 ^ _12; build/include/botan/loadstor.h:470:15: note: stmt 0 _13 = _9 ^ _12; build/include/botan/loadstor.h:470:15: note: stmt 1 _45 = _41 ^ _44; build/include/botan/loadstor.h:470:15: note: stmt 2 _22 = _12 ^ _18; build/include/botan/loadstor.h:470:15: note: stmt 3 _51 = _44 ^ _50; build/include/botan/loadstor.h:470:15: note: stmt 4 _30 = _12 ^ _27; build/include/botan/loadstor.h:470:15: note: stmt 5 _57 = _44 ^ _56; build/include/botan/loadstor.h:470:15: note: stmt 6 _36 = _12 ^ _35; build/include/botan/loadstor.h:470:15: note: stmt 7 _63 = _44 ^ _62; build/include/botan/loadstor.h:470:15: note: children 0x2cabd40 0x2cabfc0 build/include/botan/loadstor.h:470:15: note: node (external) 0x2cabd40 (max_nunits=1, refcnt=1) build/include/botan/loadstor.h:470:15: note: { _9, _41, _18, _50, _27, _56, _35, _62 } build/include/botan/loadstor.h:470:15: note: node (external) 0x2cabfc0 (max_nunits=4, refcnt=1) build/include/botan/loadstor.h:470:15: note: stmt 0 _12 = *_11; build/include/botan/loadstor.h:470:15: note: stmt 1 _44 = *_43; build/include/botan/loadstor.h:470:15: note: stmt 2 _12 = *_11; build/include/botan/loadstor.h:470:15: note: stmt 3 _44 = *_43; build/include/botan/loadstor.h:470:15: note: stmt 4 _12 = *_11; build/include/botan/loadstor.h:470:15: note: stmt 5 _44 = *_43; build/include/botan/loadstor.h:470:15: note: stmt 6 _12 = *_11; build/include/botan/loadstor.h:470:15: note: stmt 7 _44 = *_43; build/include/botan/loadstor.h:470:15: note: node 0x2cabdc0 (max_nunits=4, refcnt=1) build/include/botan/loadstor.h:470:15: note: op 
template: L0_224 = PHI <_14(28), _118(16)> build/include/botan/loadstor.h:470:15: note: stmt 0 L0_224 = PHI <_14(28), _118(16)> build/include/botan/loadstor.h:470:15: note: stmt 1 R0_225 = PHI <_46(28), _120(16)> build/include/botan/loadstor.h:470:15: note: stmt 2 L1_226 = PHI <_23(28), _122(16)> build/include/botan/loadstor.h:470:15: note: stmt 3 R1_227 = PHI <_52(28), _124(16)> build/include/botan/loadstor.h:470:15: note: stmt 4 L2_228 = PHI <_31(28), _126(16)> build/include/botan/loadstor.h:470:15: note: stmt 5 R2_229 = PHI <_58(28), _128(16)> build/include/botan/loadstor.h:470:15: note: stmt 6 L3_230 = PHI <_37(28), _130(16)> build/include/botan/loadstor.h:470:15: note: stmt 7 R3_231 = PHI <_64(28), _132(16)> build/include/botan/loadstor.h:470:15: note: children 0x2cabc40 0x2cac040 build/include/botan/loadstor.h:470:15: note: node 0x2cac040 (max_nunits=4, refcnt=1) build/include/botan/loadstor.h:470:15: note: op template: _118 = __builtin_bswap32 (_117); build/include/botan/loadstor.h:470:15: note: stmt 0 _118 = __builtin_bswap32 (_117); build/include/botan/loadstor.h:470:15: note: stmt 1 _120 = __builtin_bswap32 (_119); build/include/botan/loadstor.h:470:15: note: stmt 2 _122 = __builtin_bswap32 (_121); build/include/botan/loadstor.h:470:15: note: stmt 3 _124 = __builtin_bswap32 (_123); build/include/botan/loadstor.h:470:15: note: stmt 4 _126 = __builtin_bswap32 (_125); build/include/botan/loadstor.h:470:15: note: stmt 5 _128 = __builtin_bswap32 (_127); build/include/botan/loadstor.h:470:15: note: stmt 6 _130 = __builtin_bswap32 (_129); build/include/botan/loadstor.h:470:15: note: stmt 7 _132 = __builtin_bswap32 (_131); build/include/botan/loadstor.h:470:15: note: children 0x2cac0c0 build/include/botan/loadstor.h:470:15: note: node 0x2cac0c0 (max_nunits=4, refcnt=1) build/include/botan/loadstor.h:470:15: note: op template: _117 = MEM <unsigned int> [(char * {ref-all})_5]; build/include/botan/loadstor.h:470:15: note: stmt 0 _117 = MEM <unsigned int> [(char * 
{ref-all})_5]; build/include/botan/loadstor.h:470:15: note: stmt 1 _119 = MEM <unsigned int> [(char * {ref-all})_5 + 4B]; build/include/botan/loadstor.h:470:15: note: stmt 2 _121 = MEM <unsigned int> [(char * {ref-all})_5 + 8B]; build/include/botan/loadstor.h:470:15: note: stmt 3 _123 = MEM <unsigned int> [(char * {ref-all})_5 + 12B]; build/include/botan/loadstor.h:470:15: note: stmt 4 _125 = MEM <unsigned int> [(char * {ref-all})_5 + 16B]; build/include/botan/loadstor.h:470:15: note: stmt 5 _127 = MEM <unsigned int> [(char * {ref-all})_5 + 20B]; build/include/botan/loadstor.h:470:15: note: stmt 6 _129 = MEM <unsigned int> [(char * {ref-all})_5 + 24B]; build/include/botan/loadstor.h:470:15: note: stmt 7 _131 = MEM <unsigned int> [(char * {ref-all})_5 + 28B]; build/include/botan/loadstor.h:470:15: note: ------>vectorizing SLP node starting from: _13 = _9 ^ _12; build/include/botan/loadstor.h:470:15: note: transform binary/unary operation. build/include/botan/loadstor.h:470:15: note: add new stmt: vect__13.606_578 = _580 ^ _582; build/include/botan/loadstor.h:470:15: note: add new stmt: vect__13.606_577 = _579 ^ _581; build/include/botan/loadstor.h:470:15: note: ------>vectorizing SLP node starting from: _117 = MEM <unsigned int> [(char * {ref-all})_5]; build/include/botan/loadstor.h:470:15: note: transform load. 
ncopies = 1 build/include/botan/loadstor.h:470:15: note: create vector_type-pointer variable to type: vector(4) unsigned int vectorizing a pointer ref: MEM <unsigned int> [(char * {ref-all})_5] build/include/botan/loadstor.h:470:15: note: created vectp.608_575 build/include/botan/loadstor.h:470:15: note: add new stmt: vect__117.609_574 = MEM <vector(4) unsigned int> [(char * {ref-all})vectp.608_575]; build/include/botan/loadstor.h:470:15: note: add new stmt: vectp.608_573 = vectp.608_575 + 16; build/include/botan/loadstor.h:470:15: note: add new stmt: vect__117.610_572 = MEM <vector(4) unsigned int> [(char * {ref-all})vectp.608_573]; build/include/botan/loadstor.h:470:15: note: ------>vectorizing SLP node starting from: _118 = __builtin_bswap32 (_117); build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand MEM <unsigned int> [(char * {ref-all})_5], type of def: internal build/include/botan/loadstor.h:470:15: note: add new stmt: _571 = VIEW_CONVERT_EXPR<vector(16) char>(vect__117.609_574); build/include/botan/loadstor.h:470:15: note: add new stmt: _570 = VEC_PERM_EXPR <_571, _571, { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }>; build/include/botan/loadstor.h:470:15: note: add new stmt: _569 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(_570); build/include/botan/loadstor.h:470:15: note: add new stmt: _568 = VIEW_CONVERT_EXPR<vector(16) char>(vect__117.610_572); build/include/botan/loadstor.h:470:15: note: add new stmt: _567 = VEC_PERM_EXPR <_568, _568, { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }>; build/include/botan/loadstor.h:470:15: note: add new stmt: _566 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(_567); build/include/botan/loadstor.h:470:15: note: ------>vectorizing SLP node starting from: L0_224 = PHI <_14(28), _118(16)> build/include/botan/loadstor.h:470:15: note: extracting lane for live stmt R0_225 = PHI <_46(28), _120(16)> build/include/botan/loadstor.h:470:15: note: extracting lane for live stmt R1_227 = PHI 
<_52(28), _124(16)> build/include/botan/loadstor.h:470:15: note: extracting lane for live stmt R2_229 = PHI <_58(28), _128(16)> build/include/botan/loadstor.h:470:15: note: extracting lane for live stmt R3_231 = PHI <_64(28), _132(16)> build/include/botan/loadstor.h:470:15: note: ------>vectorizing SLP node starting from: _14 = _13 + L0_224; build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand _9 ^ _12, type of def: internal build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand L0_224 = PHI <_14(28), _118(16)>, type of def: internal build/include/botan/loadstor.h:470:15: note: transform binary/unary operation. build/include/botan/loadstor.h:470:15: note: add new stmt: vect__14.612_559 = vect__13.606_578 + vect_L0_224.611_565; build/include/botan/loadstor.h:470:15: note: add new stmt: vect__14.612_558 = vect__13.606_577 + vect_L0_224.611_564; build/include/botan/loadstor.h:470:15: note: ------>vectorizing SLP node starting from: _92 = PHI <_14(17)> build/include/botan/loadstor.h:470:15: note: ------>vectorizing SLP node starting from: _133 = __builtin_bswap32 (_92); build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand _92 = PHI <_14(17)>, type of def: internal build/include/botan/loadstor.h:470:15: note: add new stmt: _555 = VIEW_CONVERT_EXPR<vector(16) char>(vect__92.613_557); build/include/botan/loadstor.h:470:15: note: add new stmt: _554 = VEC_PERM_EXPR <_555, _555, { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }>; build/include/botan/loadstor.h:470:15: note: add new stmt: _553 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(_554); build/include/botan/loadstor.h:470:15: note: add new stmt: _552 = VIEW_CONVERT_EXPR<vector(16) char>(vect__92.613_556); build/include/botan/loadstor.h:470:15: note: add new stmt: _551 = VEC_PERM_EXPR <_552, _552, { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }>; build/include/botan/loadstor.h:470:15: note: add new stmt: _550 = VIEW_CONVERT_EXPR<vector(4) unsigned 
int>(_551); build/include/botan/loadstor.h:470:15: note: ------>vectorizing SLP node starting from: MEM <unsigned int> [(char * {ref-all})_66] = _133; build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand __builtin_bswap32 (_92), type of def: internal build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand __builtin_bswap32 (_592), type of def: internal build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand __builtin_bswap32 (_90), type of def: internal build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand __builtin_bswap32 (_591), type of def: internal build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand __builtin_bswap32 (_594), type of def: internal build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand __builtin_bswap32 (_590), type of def: internal build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand __builtin_bswap32 (_593), type of def: internal build/include/botan/loadstor.h:470:15: note: vect_is_simple_use: operand __builtin_bswap32 (_589), type of def: internal build/include/botan/loadstor.h:470:15: note: transform store. ncopies = 1 build/include/botan/loadstor.h:470:15: note: create vector_type-pointer variable to type: vector(4) unsigned int vectorizing a pointer ref: MEM <unsigned int> [(char * {ref-all})_66] build/include/botan/loadstor.h:470:15: note: created vectp.615_549 build/include/botan/loadstor.h:470:15: note: add new stmt: MEM <vector(4) unsigned int> [(char * {ref-all})vectp.615_549] = _553; build/include/botan/loadstor.h:470:15: note: add new stmt: vectp.615_547 = vectp.615_549 + 16; build/include/botan/loadstor.h:470:15: note: add new stmt: MEM <vector(4) unsigned int> [(char * {ref-all})vectp.615_547] = _550; build/include/botan/loadstor.h:470:15: note: vectorizing stmts using SLP. 
build/include/botan/loadstor.h:470:15: optimized: basic block part vectorized using 16 byte vectors

I tried to isolate a self-contained test case, but did not succeed.

The original loop:

void XTEA::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { verify_key_set(m_EK.empty() == false); const uint32_t* EK = &m_EK[0]; const size_t blocks4 = blocks / 4; const size_t blocks_left = blocks % 4; BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks4; i++) { uint32_t L0, R0, L1, R1, L2, R2, L3, R3; load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); for(size_t r = 0; r != 32; ++r) { L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r]; L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r]; L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r]; L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r]; R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1]; R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1]; R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1]; R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1]; } store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); } BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks_left; ++i) { uint32_t L, R; load_be(in + BLOCK_SIZE*(4*blocks4+i), L, R); for(size_t r = 0; r != 32; ++r) { L += (((R << 4) ^ (R >> 5)) + R) ^ EK[2*r]; R += (((L << 4) ^ (L >> 5)) + L) ^ EK[2*r+1]; } store_be(out + BOTAN_SIZE_PLACEHOLDER*(4*blocks4+i), L, R); } }

====

Note: the second loop, BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks_left; ++i), should not be executed in the benchmark, because the benchmark calls the function with blocks == 128, so blocks_left == 128 % 4 == 0.