On Sep 13, 2017, at 7:23 AM, Richard Biener <richard.guent...@gmail.com> wrote:
>
> On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
> <will_schm...@vnet.ibm.com> wrote:
>> Hi,
>>
>> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
>>
>> Add code to handle gimple folding for the vec_ld builtins.
>> Remove the now-obsolete folding code for vec_ld from rs6000-c.c.
>> Surrounding comments have been adjusted slightly so they continue to
>> read OK for the existing vec_st code.
>>
>> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
>> tests, which have been posted separately.
>>
>> For v2 of this patch, I've removed the chunk of code that prohibited the
>> gimple fold from occurring in BE environments.  This had fixed an issue
>> for me earlier during development of the code, but it turned out not to
>> be necessary.  I've sniff-tested after removing that check and it looks
>> OK.
>>
>>> +      /* Limit folding of loads to LE targets.  */
>>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>>> +        return false;
>>
>> I've restarted a regression test on this updated version.
>>
>> OK for trunk (assuming successful regression test completion)?
>>
>> Thanks,
>> -Will
>>
>> [gcc]
>>
>> 2017-09-12  Will Schmidt  <will_schm...@vnet.ibm.com>
>>
>>     * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>>     for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>>     * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>>     Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
>>
>> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
>> index fbab0a2..bb8a77d 100644
>> --- a/gcc/config/rs6000/rs6000-c.c
>> +++ b/gcc/config/rs6000/rs6000-c.c
>> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
>>                            convert (TREE_TYPE (stmt), arg0));
>>        stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>>        return stmt;
>>      }
>>
>> -  /* Expand vec_ld into an expression that masks the address and
>> -     performs the load.  We need to expand this early to allow
>> +  /* Expand vec_st into an expression that masks the address and
>> +     performs the store.  We need to expand this early to allow
>>       the best aliasing, as by the time we get into RTL we no longer
>>       are able to honor __restrict__, for example.  We may want to
>>       consider this for all memory access built-ins.
>>
>>       When -maltivec=be is specified, or the wrong number of arguments
>>       is provided, simply punt to existing built-in processing.  */
>> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
>> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>> -      && nargs == 2)
>> -    {
>> -      tree arg0 = (*arglist)[0];
>> -      tree arg1 = (*arglist)[1];
>> -
>> -      /* Strip qualifiers like "const" from the pointer arg.  */
>> -      tree arg1_type = TREE_TYPE (arg1);
>> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
>> -        goto bad;
>> -
>> -      tree inner_type = TREE_TYPE (arg1_type);
>> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
>> -        {
>> -          arg1_type = build_pointer_type (build_qualified_type (inner_type,
>> -                                                                0));
>> -          arg1 = fold_convert (arg1_type, arg1);
>> -        }
>> -
>> -      /* Construct the masked address.  Let existing error handling take
>> -         over if we don't have a constant offset.  */
>> -      arg0 = fold (arg0);
>> -
>> -      if (TREE_CODE (arg0) == INTEGER_CST)
>> -        {
>> -          if (!ptrofftype_p (TREE_TYPE (arg0)))
>> -            arg0 = build1 (NOP_EXPR, sizetype, arg0);
>> -
>> -          tree arg1_type = TREE_TYPE (arg1);
>> -          if (TREE_CODE (arg1_type) == ARRAY_TYPE)
>> -            {
>> -              arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
>> -              tree const0 = build_int_cstu (sizetype, 0);
>> -              tree arg1_elt0 = build_array_ref (loc, arg1, const0);
>> -              arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
>> -            }
>> -
>> -          tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
>> -                                       arg1, arg0);
>> -          tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
>> -                                          build_int_cst (arg1_type, -16));
>> -
>> -          /* Find the built-in to get the return type so we can convert
>> -             the result properly (or fall back to default handling if the
>> -             arguments aren't compatible).  */
>> -          for (desc = altivec_overloaded_builtins;
>> -               desc->code && desc->code != fcode; desc++)
>> -            continue;
>> -
>> -          for (; desc->code == fcode; desc++)
>> -            if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
>> -                && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
>> -                                                    desc->op2)))
>> -              {
>> -                tree ret_type = rs6000_builtin_type (desc->ret_type);
>> -                if (TYPE_MODE (ret_type) == V2DImode)
>> -                  /* Type-based aliasing analysis thinks vector long
>> -                     and vector long long are different and will put them
>> -                     in distinct alias classes.  Force our return type
>> -                     to be a may-alias type to avoid this.  */
>> -                  ret_type
>> -                    = build_pointer_type_for_mode (ret_type, Pmode,
>> -                                                   true/*can_alias_all*/);
>> -                else
>> -                  ret_type = build_pointer_type (ret_type);
>> -                aligned = build1 (NOP_EXPR, ret_type, aligned);
>> -                tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
>> -                return ret_val;
>> -              }
>> -        }
>> -    }
>>
>> -  /* Similarly for stvx.  */
>>    if (fcode == ALTIVEC_BUILTIN_VEC_ST
>>        && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>        && nargs == 3)
>>      {
>>        tree arg0 = (*arglist)[0];
>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>> index 1338371..1fb5f44 100644
>> --- a/gcc/config/rs6000/rs6000.c
>> +++ b/gcc/config/rs6000/rs6000.c
>> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>>        res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
>>        gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>        update_call_from_tree (gsi, res);
>>        return true;
>>      }
>> +    /* Vector loads.  */
>> +    case ALTIVEC_BUILTIN_LVX_V16QI:
>> +    case ALTIVEC_BUILTIN_LVX_V8HI:
>> +    case ALTIVEC_BUILTIN_LVX_V4SI:
>> +    case ALTIVEC_BUILTIN_LVX_V4SF:
>> +    case ALTIVEC_BUILTIN_LVX_V2DI:
>> +    case ALTIVEC_BUILTIN_LVX_V2DF:
>> +      {
>> +        gimple *g;
>> +        arg0 = gimple_call_arg (stmt, 0);  // offset
>> +        arg1 = gimple_call_arg (stmt, 1);  // address
>> +
>> +        lhs = gimple_call_lhs (stmt);
>> +        location_t loc = gimple_location (stmt);
>> +
>> +        tree arg1_type = TREE_TYPE (arg1);
>> +        tree lhs_type = TREE_TYPE (lhs);
>> +
>> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.
>> +           Create the tree using the value from arg0.  The resulting type
>> +           will match the type of arg1.  */
>> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
>> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
>> +        gimple_set_location (g, loc);
>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
>> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
>> +                                 temp_offset);
>> +        gimple_set_location (g, loc);
>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>> +
>> +        /* Mask off any lower bits from the address.  */
>> +        tree alignment_mask = build_int_cst (arg1_type, -16);
>> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
>> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
>> +                                 temp_addr, alignment_mask);
>> +        gimple_set_location (g, loc);
>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>
> You could use
>
>   gimple_seq stmts = NULL;
>   tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
>   tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR,
>                                  arg1_type, arg1, temp_offset);
>   tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR,
>                                     arg1_type, temp_addr,
>                                     build_int_cst (arg1_type, -16));
>   gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>
>> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF
>> +           could also take an offset, but since we've already incorporated
>> +           the offset above, here we just pass in a zero.  */
>> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
>> +                                              build_int_cst (arg1_type, 0)));
>
> are you sure about arg1_type here?  I'm sure not.  For
>
>   ... foo (struct S *p)
>   {
>     return __builtin_lvx_v2df (4, (double *)p);
>   }
>
> you'd end up with p as arg1 and thus struct S * as arg1_type and thus
> TBAA using 'struct S' to access the memory.
Hm, is that so?  Wouldn't arg1_type be double *, since arg1 is (double *)p?
Will, you should probably test this example and see, but I'm pretty confident
about this (see below).

> I think if the builtins have any TBAA constraints you need to build those
> explicitly; if not, you should use ptr_type_node, aka no TBAA.

The type signatures are constrained during parsing, so we should only see
allowed pointer types on arg1 by the time we get to gimple folding.  I think
that using arg1_type should work, but I am probably missing something subtle,
so please feel free to whack me on the temple until I get it. :-)

Bill

> Richard.
>
>> +        gimple_set_location (g, loc);
>> +        gsi_replace (gsi, g, true);
>> +
>> +        return true;
>> +
>> +      }
>> +
>>      default:
>>        if (TARGET_DEBUG_BUILTIN)
>>          fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>>                   fn_code, fn_name1, fn_name2);
>>        break;
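
For reference, a minimal sketch of the ptr_type_node variant Richard
describes, assuming the lvx builtins are meant to carry no type-based
aliasing constraints.  The type of the MEM_REF's second (constant offset)
operand is what the alias machinery consults, so a zero of type
ptr_type_node puts the access in alias set 0 (may alias anything); the
names g, lhs, lhs_type, aligned_addr, loc, and gsi are from the patch
context above:

      /* Sketch only: build the dereference with ptr_type_node as the type
         of the zero offset, so TBAA does not use arg1's pointee type.  */
      g = gimple_build_assign (lhs,
                               build2 (MEM_REF, lhs_type, aligned_addr,
                                       build_int_cst (ptr_type_node, 0)));
      gimple_set_location (g, loc);
      gsi_replace (gsi, g, true);

Whether this, or an explicitly built may-alias pointer type as used for the
V2DImode case in the removed rs6000-c.c code, is appropriate depends on what
aliasing guarantees vec_ld is meant to provide.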