This recovers some of the nearly dead code in gimple_fold_builtin_memory_op by allowing a rewrite of memcpy with a properly aligned source or destination decl. In particular this allows register-typed vars to be transformed (and later rewritten into SSA form).
Together with 1/2 the testcase then optimizes from skvx::bit_pun<__vector(4) long long int, skvx::Vec<8, unsigned int> > (const struct Vec & s) { vector(4) long long int D.151565; vector(4) long long int d; try { memcpy (&d, s, 32); D.151565 = d; return D.151565; } finally { d = {CLOBBER}; } } to skvx::bit_pun<__vector(4) long long int, skvx::Vec<8, unsigned int> > (const struct Vec & s) { vector(4) long long int d; vector(4) long long int _3; <bb 2> : _3 = MEM <vector(4) long long int> [(char * {ref-all})s_2(D)]; return _3; } instead of a weird memcpy + bit-insert combo. Bootstrap and regtest running on x86_64-unknown-linux-gnu. Richard. 2019-11-02 Richard Biener <rguent...@suse.de> PR tree-optimization/92645 * gimple-fold.c (gimple_fold_builtin_memory_op): Fold memcpy from or to a properly aligned register variable. * gcc.target/i386/pr92645-5.c: New testcase. Index: gcc/gimple-fold.c =================================================================== --- gcc/gimple-fold.c (revision 278893) +++ gcc/gimple-fold.c (working copy) @@ -987,32 +987,21 @@ gimple_fold_builtin_memory_op (gimple_st src_align = get_pointer_alignment (src); dest_align = get_pointer_alignment (dest); if (dest_align < TYPE_ALIGN (desttype) - || src_align < TYPE_ALIGN (srctype)) + && src_align < TYPE_ALIGN (srctype)) return false; destvar = NULL_TREE; + srcvar = NULL_TREE; if (TREE_CODE (dest) == ADDR_EXPR && var_decl_component_p (TREE_OPERAND (dest, 0)) - && tree_int_cst_equal (TYPE_SIZE_UNIT (desttype), len)) + && tree_int_cst_equal (TYPE_SIZE_UNIT (desttype), len) + && dest_align >= TYPE_ALIGN (desttype)) destvar = fold_build2 (MEM_REF, desttype, dest, off0); - - srcvar = NULL_TREE; - if (TREE_CODE (src) == ADDR_EXPR - && var_decl_component_p (TREE_OPERAND (src, 0)) - && tree_int_cst_equal (TYPE_SIZE_UNIT (srctype), len)) - { - if (!destvar - || src_align >= TYPE_ALIGN (desttype)) - srcvar = fold_build2 (MEM_REF, destvar ? 
desttype : srctype, - src, off0); - else if (!STRICT_ALIGNMENT) - { - srctype = build_aligned_type (TYPE_MAIN_VARIANT (desttype), - src_align); - srcvar = fold_build2 (MEM_REF, srctype, src, off0); - } - } - + else if (TREE_CODE (src) == ADDR_EXPR + && var_decl_component_p (TREE_OPERAND (src, 0)) + && tree_int_cst_equal (TYPE_SIZE_UNIT (srctype), len) + && src_align >= TYPE_ALIGN (srctype)) + srcvar = fold_build2 (MEM_REF, srctype, src, off0); if (srcvar == NULL_TREE && destvar == NULL_TREE) return false; Index: gcc/testsuite/gcc.target/i386/pr92645-5.c =================================================================== --- gcc/testsuite/gcc.target/i386/pr92645-5.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/pr92645-5.c (working copy) @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-cddce1 -mavx2 -Wno-psabi" } */ +typedef long long v4di __attribute__((vector_size(32))); +struct Vec +{ + unsigned int v[8]; +}; + +v4di pun (struct Vec *s) +{ + v4di tem; + __builtin_memcpy (&tem, s, 32); + return tem; +} + +/* We're expecting exactly two stmts, in particular no BIT_INSERT_EXPR + and no memcpy call. + _3 = MEM <vector(4) long long int> [(char * {ref-all})s_2(D)]; + return _3; */ +/* { dg-final { scan-tree-dump-times " = MEM" 1 "cddce1" } } */ +/* { dg-final { scan-tree-dump-not "memcpy" "cddce1" } } */