Ilya Leoshkevich <i...@linux.ibm.com> writes: >> Am 26.08.2019 um 15:17 schrieb Ilya Leoshkevich <i...@linux.ibm.com>: >> >>> Am 26.08.2019 um 15:06 schrieb Richard Biener <richard.guent...@gmail.com>: >>> >>> On Mon, Aug 26, 2019 at 1:54 PM Ilya Leoshkevich <i...@linux.ibm.com> wrote: >>>> >>>>> Am 26.08.2019 um 10:49 schrieb Richard Biener >>>>> <richard.guent...@gmail.com>: >>>>> >>>>> On Fri, Aug 23, 2019 at 1:35 PM Ilya Leoshkevich <i...@linux.ibm.com> >>>>> wrote: >>>>>> >>>>>>> Am 23.08.2019 um 13:24 schrieb Richard Biener >>>>>>> <richard.guent...@gmail.com>: >>>>>>> >>>>>>> On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford >>>>>>> <richard.sandif...@arm.com> wrote: >>>>>>>> >>>>>>>> Ilya Leoshkevich <i...@linux.ibm.com> writes: >>>>>>>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, >>>>>>>>> machine_mode mode, >>>>>>>>> return 0; >>>>>>>>> } >>>>>>>>> >>>>>>>>> +/* can_vector_compare_p presents fake rtx binary operations to the >>>>>>>>> the back-end >>>>>>>>> + in order to determine its capabilities. In order to avoid >>>>>>>>> creating fake >>>>>>>>> + operations on each call, values from previous calls are cached in >>>>>>>>> a global >>>>>>>>> + cached_binops hash_table. It contains rtxes, which can be looked >>>>>>>>> up using >>>>>>>>> + binop_keys. */ >>>>>>>>> + >>>>>>>>> +struct binop_key { >>>>>>>>> + enum rtx_code code; /* Operation code. */ >>>>>>>>> + machine_mode value_mode; /* Result mode. */ >>>>>>>>> + machine_mode cmp_op_mode; /* Operand mode. */ >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> { >>>>>>>>> + typedef rtx value_type; >>>>>>>>> + typedef binop_key compare_type; >>>>>>>>> + >>>>>>>>> + static hashval_t >>>>>>>>> + hash (enum rtx_code code, machine_mode value_mode, machine_mode >>>>>>>>> cmp_op_mode) >>>>>>>>> + { >>>>>>>>> + inchash::hash hstate (0); >>>>>>>>> + hstate.add_int (code); >>>>>>>>> + hstate.add_int (value_mode); >>>>>>>>> + hstate.add_int (cmp_op_mode); >>>>>>>>> + return hstate.end (); >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> + static hashval_t >>>>>>>>> + hash (const rtx &ref) >>>>>>>>> + { >>>>>>>>> + return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP >>>>>>>>> (ref, 0))); >>>>>>>>> + } >>>>>>>>> + >>>>>>>>> + static bool >>>>>>>>> + equal (const rtx &ref1, const binop_key &ref2) >>>>>>>>> + { >>>>>>>>> + return (GET_CODE (ref1) == ref2.code) >>>>>>>>> + && (GET_MODE (ref1) == ref2.value_mode) >>>>>>>>> + && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode); >>>>>>>>> + } >>>>>>>>> +}; >>>>>>>>> + >>>>>>>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops; >>>>>>>>> + >>>>>>>>> +static rtx >>>>>>>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode, >>>>>>>>> + machine_mode cmp_op_mode) >>>>>>>>> +{ >>>>>>>>> + if (!cached_binops) >>>>>>>>> + cached_binops = hash_table<binop_hasher>::create_ggc (1024); >>>>>>>>> + binop_key key = { code, value_mode, cmp_op_mode }; >>>>>>>>> + hashval_t hash = binop_hasher::hash (code, value_mode, >>>>>>>>> cmp_op_mode); >>>>>>>>> + rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT); >>>>>>>>> + if (!*slot) >>>>>>>>> + *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx >>>>>>>>> (cmp_op_mode), >>>>>>>>> + gen_reg_rtx (cmp_op_mode)); >>>>>>>>> + return *slot; >>>>>>>>> +} >>>>>>>> >>>>>>>> Sorry, I didn't mean anything this complicated. I just meant that >>>>>>>> we should have a single cached rtx that we can change via PUT_CODE and >>>>>>>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each >>>>>>>> time. >>>>>>>> >>>>>>>> Something like: >>>>>>>> >>>>>>>> static GTY ((cache)) rtx cached_binop; >>>>>>>> >>>>>>>> rtx >>>>>>>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode >>>>>>>> op_mode) >>>>>>>> { >>>>>>>> if (cached_binop) >>>>>>>> { >>>>>>>> PUT_CODE (cached_binop, code); >>>>>>>> PUT_MODE_RAW (cached_binop, mode); >>>>>>>> PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode); >>>>>>>> PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode); >>>>>>>> } >>>>>>>> else >>>>>>>> { >>>>>>>> rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1); >>>>>>>> rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2); >>>>>>>> cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2); >>>>>>>> } >>>>>>>> return cached_binop; >>>>>>>> } >>>>>>> >>>>>>> Hmm, maybe we need auto_rtx (code) that constructs such >>>>>>> RTX on the stack instead of wasting a GC root (and causing >>>>>>> issues for future threading of GCC ;)). >>>>>> >>>>>> Do you mean something like this? >>>>>> >>>>>> union { >>>>>> char raw[rtx_code_size[code]]; >>>>>> rtx rtx; >>>>>> } binop; >>>>>> >>>>>> Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show >>>>>> anything useful), or should I implement this? >>>>> >>>>> It doesn't exist AFAIK, I thought about using alloca like >>>>> >>>>> rtx tem; >>>>> rtx_alloca (tem, PLUS); >>>>> >>>>> and due to using alloca rtx_alloca has to be a macro like >>>>> >>>>> #define rtx_alloca(r, code) r = (rtx)alloca (RTX_CODE_SIZE(code)); >>>>> memset (r, 0, RTX_HDR_SIZE); PUT_CODE (r, code); >>>>> >>>>> maybe C++ can help making this prettier but of course >>>>> since we use alloca we have to avoid opening new scopes. >>>>> >>>>> I guess templates like with auto_vec doesn't work unless >>>>> we can make RTX_CODE_SIZE constant-evaluated. >>>>> >>>>> Richard. >>>> >>>> I ended up with the following change: >>>> >>>> diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c >>>> index a667cdab94e..97aa2144e95 100644 >>>> --- a/gcc/emit-rtl.c >>>> +++ b/gcc/emit-rtl.c >>>> @@ -466,17 +466,25 @@ set_mode_and_regno (rtx x, machine_mode mode, >>>> unsigned int regno) >>>> set_regno_raw (x, regno, nregs); >>>> } >>>> >>>> -/* Generate a new REG rtx. Make sure ORIGINAL_REGNO is set properly, and >>>> +/* Initialize a REG rtx. Make sure ORIGINAL_REGNO is set properly, and >>>> don't attempt to share with the various global pieces of rtl (such as >>>> frame_pointer_rtx). */ >>>> >>>> -rtx >>>> -gen_raw_REG (machine_mode mode, unsigned int regno) >>>> +void >>>> +init_raw_REG (rtx x, machine_mode mode, unsigned int regno) >>>> { >>>> - rtx x = rtx_alloc (REG MEM_STAT_INFO); >>>> set_mode_and_regno (x, mode, regno); >>>> REG_ATTRS (x) = NULL; >>>> ORIGINAL_REGNO (x) = regno; >>>> +} >>>> + >>>> +/* Generate a new REG rtx. */ >>>> + >>>> +rtx >>>> +gen_raw_REG (machine_mode mode, unsigned int regno) >>>> +{ >>>> + rtx x = rtx_alloc (REG MEM_STAT_INFO); >>>> + init_raw_REG (x, mode, regno); >>>> return x; >>>> } >>>> >>>> diff --git a/gcc/gengenrtl.c b/gcc/gengenrtl.c >>>> index 5c78fabfb50..bb2087da258 100644 >>>> --- a/gcc/gengenrtl.c >>>> +++ b/gcc/gengenrtl.c >>>> @@ -231,8 +231,7 @@ genmacro (int idx) >>>> puts (")"); >>>> } >>>> >>>> -/* Generate the code for the function to generate RTL whose >>>> - format is FORMAT. */ >>>> +/* Generate the code for functions to generate RTL whose format is >>>> FORMAT. */ >>>> >>>> static void >>>> gendef (const char *format) >>>> @@ -240,22 +239,18 @@ gendef (const char *format) >>>> const char *p; >>>> int i, j; >>>> >>>> - /* Start by writing the definition of the function name and the types >>>> + /* Write the definition of the init function name and the types >>>> of the arguments. */ >>>> >>>> - printf ("static inline rtx\ngen_rtx_fmt_%s_stat (RTX_CODE code, >>>> machine_mode mode", format); >>>> + puts ("static inline void"); >>>> + printf ("init_rtx_fmt_%s (rtx rt, machine_mode mode", format); >>>> for (p = format, i = 0; *p != 0; p++) >>>> if (*p != '0') >>>> printf (",\n\t%sarg%d", type_from_format (*p), i++); >>>> + puts (")"); >>>> >>>> - puts (" MEM_STAT_DECL)"); >>>> - >>>> - /* Now write out the body of the function itself, which allocates >>>> - the memory and initializes it. */ >>>> + /* Now write out the body of the init function itself. */ >>>> puts ("{"); >>>> - puts (" rtx rt;"); >>>> - puts (" rt = rtx_alloc (code PASS_MEM_STAT);\n"); >>>> - >>>> puts (" PUT_MODE_RAW (rt, mode);"); >>>> >>>> for (p = format, i = j = 0; *p ; ++p, ++i) >>>> @@ -266,16 +261,56 @@ gendef (const char *format) >>>> else >>>> printf (" %s (rt, %d) = arg%d;\n", accessor_from_format (*p), i, >>>> j++); >>>> >>>> - puts ("\n return rt;\n}\n"); >>>> + puts ("}\n"); >>>> + >>>> + /* Write the definition of the gen function name and the types >>>> + of the arguments. */ >>>> + >>>> + puts ("static inline rtx"); >>>> + printf ("gen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", >>>> format); >>>> + for (p = format, i = 0; *p != 0; p++) >>>> + if (*p != '0') >>>> + printf (",\n\t%sarg%d", type_from_format (*p), i++); >>>> + puts (" MEM_STAT_DECL)"); >>>> + >>>> + /* Now write out the body of the function itself, which allocates >>>> + the memory and initializes it. */ >>>> + puts ("{"); >>>> + puts (" rtx rt;\n"); >>>> + >>>> + puts (" rt = rtx_alloc (code PASS_MEM_STAT);"); >>>> + printf (" init_rtx_fmt_%s (rt, mode", format); >>>> + for (p = format, i = 0; *p != 0; p++) >>>> + if (*p != '0') >>>> + printf (", arg%d", i++); >>>> + puts (");\n"); >>>> + >>>> + puts (" return rt;\n}\n"); >>>> + >>>> + /* Write the definition of gen macro. */ >>>> + >>>> printf ("#define gen_rtx_fmt_%s(c, m", format); >>>> for (p = format, i = 0; *p != 0; p++) >>>> if (*p != '0') >>>> - printf (", p%i",i++); >>>> - printf (")\\\n gen_rtx_fmt_%s_stat (c, m", format); >>>> + printf (", arg%d", i++); >>>> + printf (") \\\n gen_rtx_fmt_%s_stat ((c), (m)", format); >>>> for (p = format, i = 0; *p != 0; p++) >>>> if (*p != '0') >>>> - printf (", p%i",i++); >>>> + printf (", (arg%d)", i++); >>>> printf (" MEM_STAT_INFO)\n\n"); >>>> + >>>> + /* Write the definition of alloca macro. */ >>>> + >>>> + printf ("#define alloca_rtx_fmt_%s(rt, c, m", format); >>>> + for (p = format, i = 0; *p != 0; p++) >>>> + if (*p != '0') >>>> + printf (", arg%d", i++); >>>> + printf (") \\\n rtx_alloca ((rt), (c)); \\\n"); >>>> + printf (" init_rtx_fmt_%s ((rt), (m)", format); >>>> + for (p = format, i = 0; *p != 0; p++) >>>> + if (*p != '0') >>>> + printf (", (arg%d)", i++); >>>> + printf (")\n\n"); >>>> } >>>> >>>> /* Generate the documentation header for files we write. */ >>>> diff --git a/gcc/rtl.h b/gcc/rtl.h >>>> index efb9b3ce40d..44733d8a39e 100644 >>>> --- a/gcc/rtl.h >>>> +++ b/gcc/rtl.h >>>> @@ -2933,6 +2933,10 @@ extern HOST_WIDE_INT get_stack_check_protect (void); >>>> >>>> /* In rtl.c */ >>>> extern rtx rtx_alloc (RTX_CODE CXX_MEM_STAT_INFO); >>>> +#define rtx_alloca(rt, code) \ >>>> + (rt) = (rtx) alloca (RTX_CODE_SIZE ((code))); \ >>>> + memset ((rt), 0, RTX_HDR_SIZE); \ >>>> + PUT_CODE ((rt), (code)); >>>> extern rtx rtx_alloc_stat_v (RTX_CODE MEM_STAT_DECL, int); >>>> #define rtx_alloc_v(c, SZ) rtx_alloc_stat_v (c MEM_STAT_INFO, SZ) >>>> #define const_wide_int_alloc(NWORDS) \ >>>> @@ -3797,7 +3801,11 @@ gen_rtx_INSN (machine_mode mode, rtx_insn >>>> *prev_insn, rtx_insn *next_insn, >>>> extern rtx gen_rtx_CONST_INT (machine_mode, HOST_WIDE_INT); >>>> extern rtx gen_rtx_CONST_VECTOR (machine_mode, rtvec); >>>> extern void set_mode_and_regno (rtx, machine_mode, unsigned int); >>>> +extern void init_raw_REG (rtx, machine_mode, unsigned int); >>>> extern rtx gen_raw_REG (machine_mode, unsigned int); >>>> +#define alloca_raw_REG(rt, mode, regno) \ >>>> + rtx_alloca ((rt), REG); \ >>>> + init_raw_REG ((rt), (mode), (regno)) >>>> extern rtx gen_rtx_REG (machine_mode, unsigned int); >>>> extern rtx gen_rtx_SUBREG (machine_mode, rtx, poly_uint64); >>>> extern rtx gen_rtx_MEM (machine_mode, rtx); >>>> >>>> which now allows me to write: >>>> >>>> rtx reg1, reg2, test; >>>> alloca_raw_REG (reg1, cmp_op_mode, LAST_VIRTUAL_REGISTER + 1); >>>> alloca_raw_REG (reg2, cmp_op_mode, LAST_VIRTUAL_REGISTER + 2); >>>> alloca_rtx_fmt_ee (test, code, value_mode, reg1, reg2); >>>> >>>> If that looks ok, I'll resend the series. >>> >>> that looks OK to me - please leave Richard S. time to comment. Also while >>> I'd like to see >>> >>> rtx reg1 = alloca_raw_REG (cmp_op_mode, LAST_VIRTUAL_REGISTER + 1); >>> >>> I don't really see a way to write that portably (or at all), do you all >>> agree? >>> GCC doesn't seem to convert alloca() calls to __builtin_stack_save/restore >>> nor place CLOBBERs to end their lifetime. But is it guaranteed that the >>> alloca result is valid until frame termination? >> >> Hmm, the alloca man page says: >> >> The alloca() function allocates size bytes of space in the stack >> frame of the caller. This temporary space is automatically freed >> when the function that called alloca() returns to its caller. >> ... >> The space allocated by alloca() is not automatically deallocated if >> the pointer that refers to it simply goes out of scope. >> >> A quick experiment with gcc and clang confirms this. I think this means >> I can make alloca_raw_REG macro return the allocated pointer using the >> return-from-block GNU extension. > > What do you think about the following approach? > > extern rtx rtx_init (rtx, RTX_CODE); > #define rtx_alloca(code) \ > rtx_init ((rtx) alloca (RTX_CODE_SIZE ((code))), (code)) > > ... > > rtx > rtx_init (rtx rt, RTX_CODE code) > { > /* We want to clear everything up to the FLD array. Normally, this > is one int, but we don't want to assume that and it isn't very > portable anyway; this is. */ > memset (rt, 0, RTX_HDR_SIZE);
Might as well remove this comment while you're there. RTX_HDR_SIZE has long ceased to be sizeof (int) for any host. > PUT_CODE (rt, code); > return rt; > } > > with similar changes to alloca_raw_REG and gengenrtl. This way we don't > even need a GNU extension. Sounds good to me, but rtx_init should probably be an inline function. Thanks, Richard