https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77287
--- Comment #13 from Hongtao.liu <crazylht at gmail dot com> --- ;; Function fn (fn, funcdef_no=5484, decl_uid=32317, cgraph_uid=5485, symbol_order=5484) int fn (const int * px, const int * py, const int * pz, const int * pw, const int * pa, const int * pb, const int * pc, const int * pd) { vector(16) short unsigned int _3; vector(16) short unsigned int _5; vector(16) short int _7; vector(16) short int _9; vector(32) char _12; vector(32) unsigned char _14; vector(16) short unsigned int _16; vector(16) short unsigned int _17; vector(16) short int _18; vector(16) short int _19; vector(32) char _20; vector(32) unsigned char _21; vector(16) short unsigned int _22; vector(16) short unsigned int _23; vector(16) short int _24; vector(16) short int _25; vector(32) char _26; vector(32) unsigned char _27; vector(16) short unsigned int _28; vector(16) short unsigned int _29; vector(16) short int _30; vector(16) short int _31; int _32; vector(4) int _33; vector(8) int _34; vector(32) unsigned char _35; vector(32) char _36; vector(16) short unsigned int _37; vector(16) short unsigned int _38; vector(16) short unsigned int _39; vector(16) short unsigned int _40; vector(16) short unsigned int _41; vector(16) short unsigned int _42; vector(16) short unsigned int _43; vector(16) short unsigned int _44; vector(16) short unsigned int _45; vector(16) short unsigned int _46; vector(16) short unsigned int _47; vector(16) short unsigned int _48; vector(16) short unsigned int _50; vector(16) short unsigned int _51; vector(16) short unsigned int _53; vector(16) short unsigned int _54; vector(16) short unsigned int _56; vector(16) short unsigned int _57; vector(16) short unsigned int _59; vector(16) short unsigned int _60; vector(16) short int _62; vector(16) short int _63; vector(16) short unsigned int _64; vector(16) short unsigned int _65; vector(32) unsigned char _66; vector(32) char _67; vector(16) short int _68; vector(16) short int _69; vector(16) short unsigned int _70; vector(16) 
short unsigned int _71; vector(32) unsigned char _72; vector(32) char _73; vector(16) short int _74; vector(16) short int _75; vector(16) short unsigned int _76; vector(16) short unsigned int _77; vector(32) unsigned char _78; vector(32) char _79; vector(16) short int _80; vector(16) short int _81; vector(16) short unsigned int _82; vector(16) short unsigned int _83; vector(32) unsigned char _84; vector(32) char _85; vector(16) short int _86; vector(16) short int _87; vector(16) short unsigned int _88; vector(16) short unsigned int _89; vector(32) unsigned char _90; vector(32) char _91; vector(4) long long int _92; vector(4) long long int _93; vector(4) long long int _94; vector(4) long long int _95; vector(4) long long int _96; vector(4) long long int _97; vector(4) long long int _98; vector(4) long long int _99; vector(4) long long int _100; vector(4) long long int _101; vector(16) short unsigned int _107; vector(16) short unsigned int _108; vector(16) short unsigned int _109; vector(16) short unsigned int _110; vector(16) short unsigned int _111; <bb 2> [local count: 1073741824]: _101 = MEM[(const __m256i_u * {ref-all})px_2(D)]; _100 = MEM[(const __m256i_u * {ref-all})py_4(D)]; _99 = MEM[(const __m256i_u * {ref-all})pz_6(D)]; _98 = MEM[(const __m256i_u * {ref-all})pw_8(D)]; _97 = MEM[(const __m256i_u * {ref-all})pa_10(D)]; _96 = MEM[(const __m256i_u * {ref-all})pb_11(D)]; _95 = MEM[(const __m256i_u * {ref-all})pc_13(D)]; _94 = MEM[(const __m256i_u * {ref-all})pd_15(D)]; _93 = MEM[(const __m256i_u * {ref-all})pc_13(D) + 32B]; _92 = MEM[(const __m256i_u * {ref-all})pd_15(D) + 32B]; _86 = VIEW_CONVERT_EXPR<vector(16) short int>(_96); _87 = VIEW_CONVERT_EXPR<vector(16) short int>(_101); _88 = (vector(16) short unsigned int) _87; _89 = (vector(16) short unsigned int) _86; _90 = VEC_PACK_SAT_EXPR <_88, _89>; _91 = (vector(32) char) _90; _80 = VIEW_CONVERT_EXPR<vector(16) short int>(_95); _81 = VIEW_CONVERT_EXPR<vector(16) short int>(_100); _82 = (vector(16) short 
unsigned int) _81; _83 = (vector(16) short unsigned int) _80; _84 = VEC_PACK_SAT_EXPR <_82, _83>; _85 = (vector(32) char) _84; _74 = VIEW_CONVERT_EXPR<vector(16) short int>(_94); _75 = VIEW_CONVERT_EXPR<vector(16) short int>(_99); _76 = (vector(16) short unsigned int) _75; _77 = (vector(16) short unsigned int) _74; _78 = VEC_PACK_SAT_EXPR <_76, _77>; _79 = (vector(32) char) _78; _68 = VIEW_CONVERT_EXPR<vector(16) short int>(_93); _69 = VIEW_CONVERT_EXPR<vector(16) short int>(_98); _70 = (vector(16) short unsigned int) _69; _71 = (vector(16) short unsigned int) _68; _72 = VEC_PACK_SAT_EXPR <_70, _71>; _73 = (vector(32) char) _72; _62 = VIEW_CONVERT_EXPR<vector(16) short int>(_92); _63 = VIEW_CONVERT_EXPR<vector(16) short int>(_97); _64 = (vector(16) short unsigned int) _63; _65 = (vector(16) short unsigned int) _62; _66 = VEC_PACK_SAT_EXPR <_64, _65>; _67 = (vector(32) char) _66; _59 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_91); _60 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_101); _56 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_85); _57 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_100); _53 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_79); _54 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_99); _50 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_73); _51 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_98); _47 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_67); _48 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_97); _45 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_96); _111 = _60 - _45; _46 = _59 + _111; _43 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_95); _110 = _57 - _43; _44 = _56 + _110; _41 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_94); _109 = _54 - _41; _42 = _53 + _109; _39 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_93); _108 = _51 - _39; _40 = _50 + _108; _37 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_92); _107 = _48 - _37; _38 = _47 + _107; _9 = 
VIEW_CONVERT_EXPR<vector(16) short int>(_44); _7 = VIEW_CONVERT_EXPR<vector(16) short int>(_46); _5 = (vector(16) short unsigned int) _7; _3 = (vector(16) short unsigned int) _9; _35 = VEC_PACK_SAT_EXPR <_5, _3>; _36 = (vector(32) char) _35; _19 = VIEW_CONVERT_EXPR<vector(16) short int>(_42); _18 = VIEW_CONVERT_EXPR<vector(16) short int>(_36); _17 = (vector(16) short unsigned int) _18; _16 = (vector(16) short unsigned int) _19; _14 = VEC_PACK_SAT_EXPR <_17, _16>; _12 = (vector(32) char) _14; _25 = VIEW_CONVERT_EXPR<vector(16) short int>(_40); _24 = VIEW_CONVERT_EXPR<vector(16) short int>(_12); _23 = (vector(16) short unsigned int) _24; _22 = (vector(16) short unsigned int) _25; _21 = VEC_PACK_SAT_EXPR <_23, _22>; _20 = (vector(32) char) _21; _31 = VIEW_CONVERT_EXPR<vector(16) short int>(_38); _30 = VIEW_CONVERT_EXPR<vector(16) short int>(_20); _29 = (vector(16) short unsigned int) _30; _28 = (vector(16) short unsigned int) _31; _27 = VEC_PACK_SAT_EXPR <_29, _28>; _26 = (vector(32) char) _27; _34 = VIEW_CONVERT_EXPR<vector(8) int>(_26); _33 = __builtin_ia32_vextractf128_si256 (_34, 0); _32 = __builtin_ia32_vec_ext_v4si (_33, 1); [tail call] return _32; } Even after folding _mm256_packus_epi16, GIMPLE still doesn't simplify it. I guess GCC only supports VEC_PACK_SAT_EXPR functionally, but does not optimize it.