The following fixes the reduced testcase in PR92645 (but not the original C++ one because of abstraction - digging into that).
It teaches simplify_vector_constructor to consider all kinds of conversions, even those changing the element size. Since we now have truncate and extend optabs for vector types the existing code should already deal with those if the target supports it. Until x86 does so I've taught simplify_vector_constructor to handle the simple case of a non-permuted conversion via VEC_UNPACK_* and VEC_PACK_TRUNC_EXPR. Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk. Richard. 2019-11-28 Richard Biener <rguent...@suse.de> PR tree-optimization/92645 * tree-ssa-forwprop.c (get_bit_field_ref_def): Also handle conversions inside a mode class. Remove restriction on preserving the element size. (simplify_vector_constructor): Deal with the above and for identity permutes also try using VEC_UNPACK_[FLOAT_]LO_EXPR and VEC_PACK_TRUNC_EXPR. * gcc.target/i386/pr92645-4.c: New testcase. Index: gcc/tree-ssa-forwprop.c =================================================================== --- gcc/tree-ssa-forwprop.c (revision 278765) +++ gcc/tree-ssa-forwprop.c (working copy) @@ -2004,16 +2004,12 @@ get_bit_field_ref_def (tree val, enum tr return NULL_TREE; enum tree_code code = gimple_assign_rhs_code (def_stmt); if (code == FLOAT_EXPR - || code == FIX_TRUNC_EXPR) + || code == FIX_TRUNC_EXPR + || CONVERT_EXPR_CODE_P (code)) { tree op1 = gimple_assign_rhs1 (def_stmt); if (conv_code == ERROR_MARK) - { - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))), - GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) - return NULL_TREE; - conv_code = code; - } + conv_code = code; else if (conv_code != code) return NULL_TREE; if (TREE_CODE (op1) != SSA_NAME) @@ -2078,9 +2074,8 @@ simplify_vector_constructor (gimple_stmt && VECTOR_TYPE_P (TREE_TYPE (ref)) && useless_type_conversion_p (TREE_TYPE (op1), TREE_TYPE (TREE_TYPE (ref))) - && known_eq (bit_field_size (op1), elem_size) && constant_multiple_p (bit_field_offset (op1), - elem_size, &elem) + bit_field_size (op1), &elem) && 
TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts)) { unsigned int j; @@ -2153,7 +2148,83 @@ simplify_vector_constructor (gimple_stmt if (conv_code != ERROR_MARK && !supportable_convert_operation (conv_code, type, conv_src_type, &conv_code)) - return false; + { + /* Only few targets implement direct conversion patterns so try + some simple special cases via VEC_[UN]PACK[_FLOAT]_LO_EXPR. */ + optab optab; + tree halfvectype, dblvectype; + if (CONVERT_EXPR_CODE_P (conv_code) + && (2 * TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0]))) + == TYPE_PRECISION (TREE_TYPE (type))) + && mode_for_vector (as_a <scalar_mode> + (TYPE_MODE (TREE_TYPE (TREE_TYPE (orig[0])))), + nelts * 2).exists () + && (dblvectype + = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), + nelts * 2)) + && (optab = optab_for_tree_code (FLOAT_TYPE_P (TREE_TYPE (type)) + ? VEC_UNPACK_FLOAT_LO_EXPR + : VEC_UNPACK_LO_EXPR, + dblvectype, + optab_default)) + && (optab_handler (optab, TYPE_MODE (dblvectype)) + != CODE_FOR_nothing)) + { + gimple_seq stmts = NULL; + tree dbl; + if (refnelts == nelts) + { + /* ??? Paradoxical subregs don't exist, so insert into + the lower half of a wider zero vector. */ + dbl = gimple_build (&stmts, BIT_INSERT_EXPR, dblvectype, + build_zero_cst (dblvectype), orig[0], + bitsize_zero_node); + } + else if (refnelts == 2 * nelts) + dbl = orig[0]; + else + dbl = gimple_build (&stmts, BIT_FIELD_REF, dblvectype, + orig[0], TYPE_SIZE (dblvectype), + bitsize_zero_node); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops (gsi, + FLOAT_TYPE_P (TREE_TYPE (type)) + ? 
VEC_UNPACK_FLOAT_LO_EXPR + : VEC_UNPACK_LO_EXPR, + dbl); + } + else if (CONVERT_EXPR_CODE_P (conv_code) + && (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0]))) + == 2 * TYPE_PRECISION (TREE_TYPE (type))) + && mode_for_vector (as_a <scalar_mode> + (TYPE_MODE + (TREE_TYPE (TREE_TYPE (orig[0])))), + nelts / 2).exists () + && (halfvectype + = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), + nelts / 2)) + && (optab = optab_for_tree_code (VEC_PACK_TRUNC_EXPR, + halfvectype, + optab_default)) + && (optab_handler (optab, TYPE_MODE (halfvectype)) + != CODE_FOR_nothing)) + { + gimple_seq stmts = NULL; + tree low = gimple_build (&stmts, BIT_FIELD_REF, halfvectype, + orig[0], TYPE_SIZE (halfvectype), + bitsize_zero_node); + tree hig = gimple_build (&stmts, BIT_FIELD_REF, halfvectype, + orig[0], TYPE_SIZE (halfvectype), + TYPE_SIZE (halfvectype)); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops (gsi, VEC_PACK_TRUNC_EXPR, + low, hig); + } + else + return false; + update_stmt (gsi_stmt (*gsi)); + return true; + } if (nelts != refnelts) { gassign *lowpart @@ -2178,9 +2249,8 @@ simplify_vector_constructor (gimple_stmt ? 
perm_type : build_vector_type (TREE_TYPE (perm_type), nelts)); if (conv_code != ERROR_MARK - && (!supportable_convert_operation (conv_code, type, conv_src_type, - &conv_code) - || conv_code == CALL_EXPR)) + && !supportable_convert_operation (conv_code, type, conv_src_type, + &conv_code)) return false; /* Now that we know the number of elements of the source build the Index: gcc/testsuite/gcc.target/i386/pr92645-4.c =================================================================== --- gcc/testsuite/gcc.target/i386/pr92645-4.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/pr92645-4.c (working copy) @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi" } */ + +typedef unsigned int u32v4 __attribute__((vector_size(16))); +typedef unsigned short u16v16 __attribute__((vector_size(32))); +typedef unsigned char u8v16 __attribute__((vector_size(16))); + +union vec128 { + u8v16 u8; + u32v4 u32; +}; + +#define memcpy __builtin_memcpy + +static u16v16 zxt(u8v16 x) +{ + return (u16v16) { + x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + }; +} + +static u8v16 narrow(u16v16 x) +{ + return (u8v16) { + x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + }; +} + +void f(char *dst, char *src, unsigned long n, unsigned c) +{ + unsigned ia = 255 - (c >> 24); + ia += ia >> 7; + + union vec128 c4 = {0}, ia16 = {0}; + c4.u32 += c; + ia16.u8 += (unsigned char)ia; + + u16v16 c16 = (zxt(c4.u8) << 8) + 128; + + for (; n; src += 16, dst += 16, n -= 4) { + union vec128 s; + memcpy(&s, src, sizeof s); + s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8); + memcpy(dst, &s, sizeof s); + } +} + +/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 3 "optimized" } } */ +/* We're missing an opportunity to, after later optimizations, combine + a uniform CTOR with a vec_unpack_lo_expr to a CTOR on a converted + element. 
*/ +/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 2 "optimized" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */