https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115843
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---

The loops are:

  for (i = 0; i < 64; i++) {
    KnightMoves[i] = 0;
    if (Rank(i) > 0) {
      if (Rank(i) > 1) {
        if (File(i) > 0)
          KnightMoves[i] |= Mask[i-17];
        if (File(i) < 7)
          KnightMoves[i] |= Mask[i-15];
      }
      if (File(i) > 1)
        KnightMoves[i] |= Mask[i-10];
      if (File(i) < 6)
        KnightMoves[i] |= Mask[i-6];
    }
    if (Rank(i) < 7) {
      if (Rank(i) < 6) {
        if (File(i) > 0)
          KnightMoves[i] |= Mask[i+15];
        if (File(i) < 7)
          KnightMoves[i] |= Mask[i+17];
      }
      if (File(i) > 1)
        KnightMoves[i] |= Mask[i+6];
      if (File(i) < 6)
        KnightMoves[i] |= Mask[i+10];
    }
  }

  for (i = 0; i < 64; i++) {
    if (File(i) == FileA) {
      KingPressureMask[i] = KingSafetyMask[i + 1];
    } else if (File(i) == FileH) {
      KingPressureMask[i] = KingSafetyMask[i - 1];
    } else {
      KingPressureMask[i] = KingSafetyMask[i];
    }
  }

  for (i = 0; i < 64; i++) {
    if (File(i) == FileA) {
      KingPressureMask1[i] = KingSafetyMask1[i + 1];
    } else if (File(i) == FileH) {
      KingPressureMask1[i] = KingSafetyMask1[i - 1];
    } else {
      KingPressureMask1[i] = KingSafetyMask1[i];
    }
  }

The last loop is:

  <bb 302> [local count: 145013]:

  <bb 183> [local count: 9271420]:
  # i_38 = PHI <_1526(215), 0(302)>
  # ivtmp_1427 = PHI <ivtmp_1430(215), 64(302)>
  _296 = i_38 & 7;
  _1526 = i_38 + 1;
  _380 = _296 == 0;
  _1371 = &KingSafetyMask1[_1526];
  _298 = .MASK_LOAD (_1371, 64B, _380);
  _804 = _296 == 7;
  _1370 = (unsigned int) i_38;
  _1369 = _1370 + 4294967295;
  _299 = (int) _1369;
  _1368 = &KingSafetyMask1[_299];
  _300 = .MASK_LOAD (_1368, 64B, _804);
  _301 = KingSafetyMask1[i_38];
  _ifc__1431 = _804 ? _300 : _301;
  _336 = _380 ?
_298 : _ifc__1431; KingPressureMask1[i_38] = _336; ivtmp_1430 = ivtmp_1427 - 1; if (ivtmp_1430 != 0) goto <bb 215>; [98.44%] else goto <bb 189>; [1.56%] <bb 215> [local count: 9126407]: goto <bb 183>; [100.00%] vectorized as <bb 183> [local count: 579464]: # vect_vec_iv_.194_1737 = PHI <_1915(215), { -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0 }(198)> # vectp_KingSafetyMask1.198_1768 = PHI <vectp_KingSafetyMask1.198_1859(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -112B](198)> # vectp_KingSafetyMask1.204_1878 = PHI <vectp_KingSafetyMask1.204_1879(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -128B](198)> # vectp_KingSafetyMask1.208_2015 = PHI <vectp_KingSafetyMask1.208_2017(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -120B](198)> # vectp_KingPressureMask1.216_2023 = PHI <vectp_KingPressureMask1.216_2025(215), &MEM <BITBOARD[64]> [(void *)&KingPressureMask1 + -120B](198)> # ivtmp_2028 = PHI <ivtmp_2030(215), 79(198)> # loop_mask_1995 = PHI <_1989(215), { 0, 0, 0, 0, 0, 0, 0, 0 }(198)> # loop_mask_1860 = PHI <_1990(215), { 0, 0, 0, 0, 0, 0, 0, 0 }(198)> _1915 = vect_vec_iv_.194_1737 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; vect__296.195_1901 = vect_vec_iv_.194_1737 & { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 }; mask__380.196_1920 = vect__296.195_1901 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; mask_patt_1854.197_1855 = [vec_unpack_lo_expr] mask__380.196_1920; mask_patt_1854.197_1733 = [vec_unpack_hi_expr] mask__380.196_1920; vec_mask_and_1997 = mask_patt_1854.197_1855 & loop_mask_1860; vect_patt_1732.200_1998 = .MASK_LOAD (vectp_KingSafetyMask1.198_1768, 128B, vec_mask_and_1997); vectp_KingSafetyMask1.198_1865 = vectp_KingSafetyMask1.198_1768 + 64; vec_mask_and_2002 = mask_patt_1854.197_1733 & loop_mask_1995; vect_patt_1732.201_2003 = .MASK_LOAD (vectp_KingSafetyMask1.198_1865, 128B, vec_mask_and_2002); mask__804.202_1876 = vect__296.195_1901 == { 7, 7, 7, 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7 }; mask_patt_1734.203_2005 = [vec_unpack_lo_expr] mask__804.202_1876; mask_patt_1734.203_2007 = [vec_unpack_hi_expr] mask__804.202_1876; vec_mask_and_2010 = mask_patt_1734.203_2005 & loop_mask_1860; vect_patt_1772.206_2012 = .MASK_LOAD (vectp_KingSafetyMask1.204_1878, 512B, vec_mask_and_2010); vectp_KingSafetyMask1.204_2013 = vectp_KingSafetyMask1.204_1878 + 64; vec_mask_and_1980 = mask_patt_1734.203_2007 & loop_mask_1995; vect_patt_1772.207_1981 = .MASK_LOAD (vectp_KingSafetyMask1.204_2013, 512B, vec_mask_and_1980); vect__301.210_1882 = .MASK_LOAD (vectp_KingSafetyMask1.208_2015, 64B, loop_mask_1860); vectp_KingSafetyMask1.208_2018 = vectp_KingSafetyMask1.208_2015 + 64; vect__301.211_2019 = .MASK_LOAD (vectp_KingSafetyMask1.208_2018, 64B, loop_mask_1995); vect_patt_1775.213_2021 = VEC_COND_EXPR <mask_patt_1734.203_2005, vect_patt_1772.206_2012, vect__301.210_1882>; vect_patt_1775.213_2022 = VEC_COND_EXPR <mask_patt_1734.203_2007, vect_patt_1772.207_1981, vect__301.211_2019>; vect_patt_1897.215_1984 = VEC_COND_EXPR <mask_patt_1854.197_1855, vect_patt_1732.200_1998, vect_patt_1775.213_2021>; vect_patt_1897.215_1985 = VEC_COND_EXPR <mask_patt_1854.197_1733, vect_patt_1732.201_2003, vect_patt_1775.213_2022>; .MASK_STORE (vectp_KingPressureMask1.216_2023, 64B, loop_mask_1860, vect_patt_1897.215_1984); vectp_KingPressureMask1.216_2026 = vectp_KingPressureMask1.216_2023 + 64; .MASK_STORE (vectp_KingPressureMask1.216_2026, 64B, loop_mask_1995, vect_patt_1897.215_1985); vectp_KingSafetyMask1.198_1859 = vectp_KingSafetyMask1.198_1865 + 64; vectp_KingSafetyMask1.204_1879 = vectp_KingSafetyMask1.204_2013 + 64; vectp_KingSafetyMask1.208_2017 = vectp_KingSafetyMask1.208_2018 + 64; vectp_KingPressureMask1.216_2025 = vectp_KingPressureMask1.216_2026 + 64; ivtmp_2030 = ivtmp_2028 - 16; _2031 = (unsigned short) ivtmp_2030; _1988 = {_2031, _2031, _2031, _2031, _2031, _2031, _2031, _2031}; _1989 = { 8, 9, 10, 11, 12, 13, 14, 15 } < _1988; _1990 = { 0, 1, 2, 
3, 4, 5, 6, 7 } < _1988;
  if (ivtmp_2028 > 16)
    goto <bb 215>; [74.97%]
  else
    goto <bb 529>; [25.03%]

  <bb 215> [local count: 434451]:
  goto <bb 183>; [100.00%]

And with -mtune=cascadelake -mprefer-vector-width=512 we avoid the failure, generating:

  <bb 183> [local count: 435039]:
  # i_38 = PHI <_1526(215), 0(198)>
  # ivtmp_1427 = PHI <ivtmp_1430(215), 64(198)>
  # vect_vec_iv_.194_1737 = PHI <_1915(215), { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }(198)>
  # vectp_KingSafetyMask1.198_1768 = PHI <vectp_KingSafetyMask1.198_1859(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + 8B](198)>
  # vectp_KingSafetyMask1.204_1876 = PHI <vectp_KingSafetyMask1.204_2005(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -8B](198)>
  # vectp_KingSafetyMask1.208_1879 = PHI <vectp_KingSafetyMask1.208_2010(215), &KingSafetyMask1(198)>
  # vectp_KingPressureMask1.216_2020 = PHI <vectp_KingPressureMask1.216_2021(215), &KingPressureMask1(198)>
  # ivtmp_1984 = PHI <ivtmp_1985(215), 0(198)>
  _1915 = vect_vec_iv_.194_1737 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
  vect__296.195_1901 = vect_vec_iv_.194_1737 & { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 };
  mask__380.196_1920 = vect__296.195_1901 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  ...

The difference is peeling for alignment (which is an odd thing to do here, but ...).