Hi Richard,
  It's a good point. I will test it. Thanks a lot.

Thanks
Gui Haochen

在 2024/8/15 17:50, Richard Sandiford 写道:
> HAO CHEN GUI <guih...@linux.ibm.com> writes:
>> Hi,
>>   This patch adds const0 move checking for CLEAR_BY_PIECES. The original
>> vec_duplicate handles duplicates of non-constant inputs. But 0 is a
>> constant. So even a platform doesn't support vec_duplicate, it could
>> still do clear by pieces if it supports const0 move by that mode.
>>
>>   Compared to the previous version, the main changes are to create a
>> new class for clear by pieces and add an additional argument to
>> indicate if the object is constant in pieces_addr.
>> https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658337.html
> 
> Rather than add the additional argument, could we instead provide a
> constfn that always returns zero?  ISTM that, under the current pieces_addr
> framework, clear by pieces is essentially a memcpy from arbitrarily many
> zeros.  E.g.:
> 
>   clear_by_pieces_d (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
>     : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true,
>                     read_zero, NULL, len, align, false, CLEAR_BY_PIECES)
> 
> where read_zero is something like:
> 
> static rtx
> read_zero (void *, void *, HOST_WIDE_INT, fixed_size_mode mode)
> {
>   return CONST0_RTX (mode);
> }
> 
> (completely untested).
> 
> The changes to by_pieces_mode_supported_p look good.
> 
> Thanks,
> Richard
> 
>>   I didn't convert const0 move predicate check to an assertion as it
>> caused ICEs on i386. On i386, some modes (V8QI V4HI V2SI V1DI) have
>> move expand defined but their predicates don't include const0.
>>
>>   Bootstrapped and tested on powerpc64-linux BE and LE with no
>> regressions.
>>
>>   On i386, it got several regressions. One issue is the predicate of
>> V16QI move expand doesn't include const0. Thus V16QI mode can't be used
>> for clear by pieces with the patch. The second issue is the const0 is
>> passed directly to the move expand with the patch. Originally it is
>> forced to a pseudo and i386 can leverage the previous data to do
>> optimization.
>>
>>   The patch also raises several regressions on aarch64. The V2x8QImode
>> replaces TImode to do 16-byte clear by pieces as V2x8QImode move expand
>> supports const0 and vector mode is preferable. I drafted a patch to
>> address the issue. It will be sent for review in a separate email.
>> Another problem is V8QImode replaces DImode to do 8-byte clear by pieces.
>> It seems cause different sequences of instructions but the actually
>> instructions are the same.
>>
>>
>> ChangeLog
>> expand: Add const0 move checking for CLEAR_BY_PIECES optabs
>>
>> vec_duplicate handles duplicates of non-constant inputs.  The 0 is a
>> constant.  So even a platform doesn't support vec_duplicate, it could
>> still do clear by pieces if it supports const0 move.  This patch adds
>> the checking.
>>
>> gcc/
>>      * expr.cc (by_pieces_mode_supported_p): Add const0 move checking
>>      for CLEAR_BY_PIECES.
>>      (pieces_addr::pieces_addr): Add fifth argument is_const to
>>      indicate if object is a constant.  Do not set m_addr_inc if object
>>      is a constant.
>>      (op_by_pieces_d::op_by_pieces_d): Initialize m_from by setting
>>      is_const to true for CLEAR_BY_PIECES.
>>      (class clear_by_pieces_d): New.
>>      (clear_by_pieces_d::prepare_mode): New.
>>      (clear_by_pieces_d::generate): New.
>>      (clear_by_pieces): Replace store_by_pieces_d with clear_by_pieces_d.
>>
>> patch.diff
>> diff --git a/gcc/expr.cc b/gcc/expr.cc
>> index 9f66d479445..abf69c8d698 100644
>> --- a/gcc/expr.cc
>> +++ b/gcc/expr.cc
>> @@ -1014,14 +1014,20 @@ can_use_qi_vectors (by_pieces_operation op)
>>  static bool
>>  by_pieces_mode_supported_p (fixed_size_mode mode, by_pieces_operation op)
>>  {
>> -  if (optab_handler (mov_optab, mode) == CODE_FOR_nothing)
>> +  enum insn_code icode = optab_handler (mov_optab, mode);
>> +  if (icode == CODE_FOR_nothing)
>>      return false;
>>
>> -  if ((op == SET_BY_PIECES || op == CLEAR_BY_PIECES)
>> +  if (op == SET_BY_PIECES
>>        && VECTOR_MODE_P (mode)
>>        && optab_handler (vec_duplicate_optab, mode) == CODE_FOR_nothing)
>>      return false;
>>
>> +  if (op == CLEAR_BY_PIECES
>> +      && VECTOR_MODE_P (mode)
>> +      && !insn_operand_matches (icode, 1, CONST0_RTX (mode)))
>> +   return false;
>> +
>>    if (op == COMPARE_BY_PIECES
>>        && !can_compare_p (EQ, mode, ccp_jump))
>>      return false;
>> @@ -1184,7 +1190,7 @@ class pieces_addr
>>    by_pieces_constfn m_constfn;
>>    void *m_cfndata;
>>  public:
>> -  pieces_addr (rtx, bool, by_pieces_constfn, void *);
>> +  pieces_addr (rtx, bool, by_pieces_constfn, void *, bool = false);
>>    rtx adjust (fixed_size_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
>>    void increment_address (HOST_WIDE_INT);
>>    void maybe_predec (HOST_WIDE_INT);
>> @@ -1204,7 +1210,7 @@ public:
>>     memory load.  */
>>
>>  pieces_addr::pieces_addr (rtx obj, bool is_load, by_pieces_constfn constfn,
>> -                      void *cfndata)
>> +                      void *cfndata, bool is_const)
>>    : m_obj (obj), m_is_load (is_load), m_constfn (constfn), m_cfndata 
>> (cfndata)
>>  {
>>    m_addr_inc = 0;
>> @@ -1228,7 +1234,9 @@ pieces_addr::pieces_addr (rtx obj, bool is_load, 
>> by_pieces_constfn constfn,
>>    else
>>      {
>>        m_addr = NULL_RTX;
>> -      if (!is_load)
>> +      if (is_const)
>> +    ;
>> +      else if (!is_load)
>>      {
>>        m_auto = true;
>>        if (STACK_GROWS_DOWNWARD)
>> @@ -1391,7 +1399,7 @@ op_by_pieces_d::op_by_pieces_d (unsigned int 
>> max_pieces, rtx to,
>>                              unsigned int align, bool push,
>>                              by_pieces_operation op)
>>    : m_to (to, to_load, NULL, NULL),
>> -    m_from (from, from_load, from_cfn, from_cfn_data),
>> +    m_from (from, from_load, from_cfn, from_cfn_data, op == 
>> CLEAR_BY_PIECES),
>>      m_len (len), m_max_size (max_pieces + 1),
>>      m_push (push), m_op (op)
>>  {
>> @@ -1724,6 +1732,48 @@ store_by_pieces_d::finish_retmode (memop_ret retmode)
>>    return m_to.adjust (QImode, m_offset);
>>  }
>>
>> +/* Derived class from op_by_pieces_d, providing support for block clear
>> +   operations.  */
>> +
>> +class clear_by_pieces_d : public op_by_pieces_d
>> +{
>> +  insn_gen_fn m_gen_fun;
>> +
>> +  void generate (rtx, rtx, machine_mode) final override;
>> +  bool prepare_mode (machine_mode, unsigned int) final override;
>> +
>> + public:
>> +  clear_by_pieces_d (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
>> +    : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, false, NULL,
>> +                  NULL, len, align, false, CLEAR_BY_PIECES)
>> +  {
>> +  }
>> +};
>> +
>> +/* Return true if MODE can be used for a set of stores, given an
>> +   alignment ALIGN.  Prepare whatever data is necessary for later
>> +   calls to generate.  */
>> +
>> +bool
>> +clear_by_pieces_d::prepare_mode (machine_mode mode, unsigned int align)
>> +{
>> +  insn_code icode = optab_handler (mov_optab, mode);
>> +  m_gen_fun = GEN_FCN (icode);
>> +  return icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode);
>> +}
>> +
>> +/* A callback used when iterating for a clear_by_pieces_operation.
>> +   The input op1 should be NULL for clear_by_pieces and set it to
>> +   const0 of that mode.  */
>> +
>> +void
>> +clear_by_pieces_d::generate (rtx op0, rtx op1, machine_mode mode)
>> +{
>> +  gcc_assert (!op1);
>> +  op1 = CONST0_RTX (mode);
>> +  emit_insn (m_gen_fun (op0, op1));
>> +}
>> +
>>  /* Determine whether the LEN bytes generated by CONSTFUN can be
>>     stored to memory using several move instructions.  CONSTFUNDATA is
>>     a pointer which will be passed as argument in every CONSTFUN call.
>> @@ -1849,10 +1899,7 @@ clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, 
>> unsigned int align)
>>    if (len == 0)
>>      return;
>>
>> -  /* Use builtin_memset_read_str to support vector mode broadcast.  */
>> -  char c = 0;
>> -  store_by_pieces_d data (to, builtin_memset_read_str, &c, len, align,
>> -                      CLEAR_BY_PIECES);
>> +  clear_by_pieces_d data (to, len, align);
>>    data.run ();
>>  }

Reply via email to