I sent the new version and added you as a co-author.

Thanks,
Mariam

On Fri, Aug 16, 2024 at 7:25 PM Mariam Arutunian <mariamarutun...@gmail.com>
wrote:

>
>
> On Fri, Aug 9, 2024 at 7:22 PM Richard Sandiford <
> richard.sandif...@arm.com> wrote:
>
>> Sorry again for the slow review. :(
>>
>> I only really looked at the unreversed version earlier, on the basis
>> that the comments would apply to both versions.  But I've got a couple
>> of comments about the reversed version below:
>>
>> Mariam Arutunian <mariamarutun...@gmail.com> writes:
>> > [...]
>> > diff --git a/gcc/config/aarch64/aarch64.cc
>> b/gcc/config/aarch64/aarch64.cc
>> > index ee12d8897a8..546a379fd74 100644
>> > --- a/gcc/config/aarch64/aarch64.cc
>> > +++ b/gcc/config/aarch64/aarch64.cc
>> > @@ -30265,6 +30265,126 @@ aarch64_retrieve_sysreg (const char *regname,
>> bool write_p, bool is128op)
>> >    return sysreg->encoding;
>> >  }
>> >
>> > +/* Generate assembly to calculate CRC
>> > +   using carry-less multiplication instruction.
>> > +   OPERANDS[1] is input CRC,
>> > +   OPERANDS[2] is data (message),
>> > +   OPERANDS[3] is the polynomial without the leading 1.  */
>> > +
>> > +void
>> > +aarch64_expand_crc_using_pmull (scalar_mode crc_mode,
>> > +                             scalar_mode data_mode,
>> > +                             rtx *operands)
>> > +{
>> > +  /* Check and keep arguments.  */
>> > +  gcc_assert (!CONST_INT_P (operands[0]));
>> > +  gcc_assert (CONST_INT_P (operands[3]));
>> > +  rtx crc = operands[1];
>> > +  rtx data = operands[2];
>> > +  rtx polynomial = operands[3];
>> > +
>> > +  unsigned HOST_WIDE_INT crc_size = GET_MODE_BITSIZE (crc_mode);
>> > +  unsigned HOST_WIDE_INT data_size = GET_MODE_BITSIZE (data_mode);
>> > +  gcc_assert (crc_size <= 32);
>> > +  gcc_assert (data_size <= crc_size);
>> > +
>> > +  /* Calculate the quotient.  */
>> > +  unsigned HOST_WIDE_INT
>> > +      q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
>> > +  /* CRC calculation's main part.  */
>> > +  if (crc_size > data_size)
>> > +    crc = expand_shift (RSHIFT_EXPR, DImode, crc, crc_size - data_size,
>> > +                     NULL_RTX, 1);
>> > +
>> > +  rtx t0 = force_reg (DImode, gen_int_mode (q, DImode));
>> > +  polynomial = simplify_gen_unary (ZERO_EXTEND, DImode, polynomial,
>> > +                                GET_MODE (polynomial));
>> > +  rtx t1 = force_reg (DImode, polynomial);
>> > +
>> > +  rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
>> > +                      OPTAB_WIDEN);
>> > +
>> > +  rtx clmul_res = gen_reg_rtx (TImode);
>> > +  emit_insn (gen_aarch64_crypto_pmulldi (clmul_res, a0, t0));
>> > +  a0 = gen_lowpart (DImode, clmul_res);
>> > +
>> > +  a0 = expand_shift (RSHIFT_EXPR, DImode, a0, crc_size, NULL_RTX, 1);
>> > +
>> > +  emit_insn (gen_aarch64_crypto_pmulldi (clmul_res, a0, t1));
>> > +  a0 = gen_lowpart (DImode, clmul_res);
>> > +
>> > +  if (crc_size > data_size)
>> > +    {
>> > +      rtx crc_part = expand_shift (LSHIFT_EXPR, DImode, operands[1],
>> data_size,
>> > +                                NULL_RTX, 0);
>> > +      a0 = expand_binop (DImode, xor_optab, a0, crc_part, NULL_RTX, 1,
>> > +                      OPTAB_DIRECT);
>> > +    }
>> > +
>> > +  /* Zero upper bits beyond crc_size.  */
>>
>> The comment no longer applies.  Otherwise this function looks good to me.
>>
>>
> Ok.)
>
>
>> > +  aarch64_emit_move (operands[0], gen_lowpart (crc_mode, a0));
>> > +}
>> > +
>> > +/* Generate assembly to calculate reversed CRC
>> > +   using carry-less multiplication instruction.
>> > +   OPERANDS[1] is input CRC,
>> > +   OPERANDS[2] is data,
>> > +   OPERANDS[3] is the polynomial without the leading 1.  */
>> > +
>> > +void
>> > +aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode,
>> > +                                      scalar_mode data_mode,
>> > +                                      rtx *operands)
>> > +{
>> > +  /* Check and keep arguments.  */
>> > +  gcc_assert (!CONST_INT_P (operands[0]));
>> > +  gcc_assert (CONST_INT_P (operands[3]));
>> > +  rtx crc = operands[1];
>> > +  rtx data = operands[2];
>> > +  rtx polynomial = operands[3];
>> > +
>> > +  unsigned HOST_WIDE_INT crc_size = GET_MODE_BITSIZE (crc_mode);
>> > +  unsigned HOST_WIDE_INT data_size = GET_MODE_BITSIZE (data_mode);
>> > +  gcc_assert (crc_size <= 32);
>> > +  gcc_assert (data_size <= crc_size);
>> > +
>> > +  /* Calculate the quotient.  */
>> > +  unsigned HOST_WIDE_INT
>> > +      q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
>> > +  /* Reflect the calculated quotient.  */
>> > +  q = reflect (q);
>> > +  rtx t0 = force_reg (DImode, gen_int_mode (q >> (data_size - 4),
>> DImode));
>> > +
>> > +  /* Reflect the polynomial.  */
>> > +  unsigned HOST_WIDE_INT ref_polynomial = reflect (UINTVAL
>> (polynomial));
>>
>> It looks like reflect() autodetects the bitwidth based on the assumption
>> that the upper half will be nonzero.  But that might not be true for all
>> possible polynomials (when the implicit leading coefficient is absent).
>> E.g. it looks like the 64-bit HDLC CRC polynomial is 0x1b (just the
>> lowest byte nonzero), and although we don't support 64-bit polynomials
>> here, the approach wouldn't work for it.
>>
>> I think it'd be safer to pass the bitwidth as an explicit parameter.
>> Also, maybe it could go in hwint.* instead of expr.* and be called
>> something like reflect_hwi.
>>
>
> Yes. In the CRC loop recognition part, I only support those polynomials
> whose upper half is nonzero, i.e. the storage required for the polynomial
> and for the CRC is the same size. The reason for this is that, as the
> polynomial's leading bit is omitted, it is not possible to determine
> precisely what CRC is being calculated (e.g., 64, 32, etc.), because there
> are some implementations where more memory than needed is used to store
> the CRC.
> For example, in some implementations, a 64-bit memory may be used to
> calculate a 32-bit CRC.
> But, I agree, it's safer to add an explicit parameter.
> I'll make the suggested changes.
>
>
>> > +  rtx t1 = force_reg (DImode, gen_int_mode (ref_polynomial << 1,
>> DImode));
>> > +
>> > +  /* CRC calculation's main part.  */
>> > +  rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
>> > +                      OPTAB_WIDEN);
>> > +
>> > +  /* Perform carry-less multiplication and get low part.  */
>> > +  rtx clmul_res = gen_reg_rtx (TImode);
>> > +  emit_insn (gen_aarch64_crypto_pmulldi (clmul_res, a0, t0));
>> > +  a0 = gen_lowpart (DImode, clmul_res);
>> > +
>> > +  a0 = expand_shift (LSHIFT_EXPR, DImode, a0, 64 - crc_size - 3,
>> NULL_RTX, 0);
>>
>> I haven't really looked at this implementation strategy before, so this
>> is probably a silly question sorry :-) but is there a specific reason
>> for selecting the bias of 4 in the right shift above, cancelled out by
>> subtracting the same amount from this left shift?  It looks like we could
>> use any value in the range [1, crc_size - 1], is that right?
>>
>> Just asking out of curiosity though.  I agree it works.
>>
>>
> Sorry, I don’t remember exactly why I used a bias of 4. Upon reviewing it,
> I think it's related to the reflection process, where zeros are added to
> the quotient.
> For example, in the case of CRC-8, the quotient is 9 bits but is stored in
> a 16-bit value, so 7 bits are zeros.
> It might be better to use q >> (crc_size - 1) and then do the left shift
> with 64 - data_size instead of 64 - crc_size - 3.
> However, I noticed that you provided a better solution. Thank you very
> much.)
>
>> > +
>> > +  /* Perform carry-less multiplication and get high part.  */
>> > +  emit_insn (gen_aarch64_crypto_pmulldi (clmul_res, a0, t1));
>> > +  a0 = gen_highpart (DImode, clmul_res);
>>
>> Although this works, it's taking a DImode highpart of a single TImode
>> register, which requires a spill from the register allocator.  So we get
>> something like:
>>
>>         pmull   v31.1q, v31.1d, v14.1d
>>         str     q31, [sp, 48]
>>         ldr     d31, [sp, 48]
>>         shl     d31, d31, 53
>>         pmull   v31.1q, v31.1d, v15.1d
>>         str     q31, [sp, 48]
>>         ldrb    w0, [sp, 56]
>>
>> (I think the first STR and LDR come from reusing the same pseudo register
>> for the temporary results; it's the STR+LDRB that causes the spill.)
>>
>> One way around that would be:
>>
>>   emit_insn (gen_aarch64_crypto_pmulldi (clmul_res, a0, t1));
>>   a0 = gen_reg_rtx (DImode);
>>   rtx v2di_res = gen_lowpart (V2DImode, clmul_res);
>>   rtx high_index = BYTES_BIG_ENDIAN ? const0_rtx : const1_rtx;
>>   emit_insn (gen_aarch64_get_lanev2di (a0, v2di_res, high_index));
>>
>> That generates:
>>
>>         pmull   v30.1q, v29.1d, v30.1d
>>         shl     d30, d30, 53
>>         pmull   v30.1q, v30.1d, v31.1d
>>         umov    x0, v30.d[1]
>>
>> which is pretty nice.
>>
>
>> There again, it might not be too bad if we instead used the flipped
>> version of the unreversed approach, such as the following minor adaption:
>>
>>   /* Calculate the quotient.  */
>>   unsigned HOST_WIDE_INT
>>       q = gf2n_poly_long_div_quotient (UINTVAL (polynomial), crc_size);
>>   /* Reflect the calculated quotient.  */
>>   q = reflect_hwi (q, crc_size + 1);
>>   rtx t0 = force_reg (DImode, gen_int_mode (q, DImode));
>>
>>   /* Reflect the polynomial.  */
>>   unsigned HOST_WIDE_INT ref_polynomial = reflect_hwi (UINTVAL
>> (polynomial),
>>                                                        crc_size);
>>   /* An unshifted multiplier would require the final result to be
>> extracted
>>      using a shift right by DATA_SIZE - 1 bits.  Shift the multiplier left
>>      so that the shift right can be by CRC_SIZE bits instead.  */
>>   ref_polynomial <<= crc_size - data_size + 1;
>>   rtx t1 = force_reg (DImode, gen_int_mode (ref_polynomial, DImode));
>>
>>   /* CRC calculation's main part.  */
>>   rtx a0 = expand_binop (DImode, xor_optab, crc, data, NULL_RTX, 1,
>>                          OPTAB_WIDEN);
>>
>>   /* Perform carry-less multiplication and get low part.  */
>>   rtx clmul_res = gen_reg_rtx (TImode);
>>   emit_insn (gen_aarch64_crypto_pmulldi (clmul_res, a0, t0));
>>   a0 = gen_lowpart (DImode, clmul_res);
>>
>>   a0 = expand_binop (DImode, and_optab, a0,
>>                      gen_int_mode (GET_MODE_MASK (data_mode), DImode),
>>                      NULL_RTX, 1, OPTAB_WIDEN);
>>
>>   /* Perform carry-less multiplication.  */
>>   emit_insn (gen_aarch64_crypto_pmulldi (clmul_res, a0, t1));
>>
>>   /* Perform a shift right by CRC_SIZE as an extraction of lane 1.  */
>>   machine_mode crc_vmode = aarch64_vq_mode (crc_mode).require ();
>>   a0 = (crc_size > data_size ? gen_reg_rtx (crc_mode) : operands[0]);
>>   emit_insn (gen_aarch64_get_lane (crc_vmode, a0,
>>                                    gen_lowpart (crc_vmode, clmul_res),
>>                                    aarch64_endian_lane_rtx (crc_vmode,
>> 1)));
>>
>>   if (crc_size > data_size)
>>     {
>>       rtx crc_part = expand_shift (RSHIFT_EXPR, crc_mode, crc, data_size,
>>                                    NULL_RTX, 1);
>>       a0 = expand_binop (crc_mode, xor_optab, a0, crc_part, operands[0],
>> 1,
>>                          OPTAB_WIDEN);
>>       aarch64_emit_move (operands[0], a0);
>>     }
>>
>> This gives:
>>
>>         pmull   v30.1q, v30.1d, v31.1d
>>         movi    v31.2d, 0xff
>>         and     v30.8b, v30.8b, v31.8b
>>         pmull   v30.1q, v30.1d, v31.1d
>>         umov    w0, v30.b[1]
>>
>> or, with SVE enabled:
>>
>>         pmull   v30.1q, v31.1d, v30.1d
>>         and     z30.d, z30.d, #255
>>         pmull   v30.1q, v30.1d, v31.1d
>>         umov    w0, v30.b[1]
>>
>> This is preferable since ANDs are generally cheaper than shifts.
>>
>> That's just a suggestion though; the original version is ok too.
>>
>>
> Thanks for the suggestions. I don't have much experience with gcc, so this
> is a huge help for me.
> I'll apply all the changes and send the new version.
>
>
>> Minor comment: it'd be good to use "pmull_res" instead of "clmul_res"
>> for the variables: :)
>>
>>
> Ok)
>
>
>> LGTM otherwise.
>>
>>
> Thanks,
> Mariam
>
>
>> Thanks,
>> Richard
>>
>> > +
>> > +  if (crc_size > data_size)
>> > +    {
>> > +      rtx crc_part = expand_shift (RSHIFT_EXPR, DImode, crc, data_size,
>> > +                                NULL_RTX, 1);
>> > +      a0 = expand_binop (DImode, xor_optab, a0, crc_part, NULL_RTX, 1,
>> > +                      OPTAB_DIRECT);
>> > +    }
>> > +
>> > +  aarch64_emit_move (operands[0], gen_lowpart (crc_mode, a0));
>> > +}
>> > +
>> >  /* Target-specific selftests.  */
>> >
>> >  #if CHECKING_P
>> > diff --git a/gcc/config/aarch64/aarch64.md
>> b/gcc/config/aarch64/aarch64.md
>> > index 9dff2d7a2b0..08c588bc475 100644
>> > --- a/gcc/config/aarch64/aarch64.md
>> > +++ b/gcc/config/aarch64/aarch64.md
>> > @@ -4543,6 +4543,63 @@
>> >    [(set_attr "type" "crc")]
>> >  )
>> >
>> > +;; Reversed CRC
>> > +(define_expand "crc_rev<ALLI:mode><ALLX:mode>4"
>> > +  [;; return value (calculated CRC)
>> > +   (match_operand:ALLX 0 "register_operand" "=r")
>> > +   ;; initial CRC
>> > +   (match_operand:ALLX 1 "register_operand" "r")
>> > +   ;; data
>> > +   (match_operand:ALLI 2 "register_operand" "r")
>> > +   ;; polynomial without leading 1
>> > +   (match_operand:ALLX 3)]
>> > +  ""
>> > +  {
>> > +    /* If the polynomial is the same as the polynomial of crc32c*
>> instruction,
>> > +       put that instruction.  crc32c uses iSCSI polynomial.  */
>> > +    if (TARGET_CRC32 && INTVAL (operands[3]) == 0x1EDC6F41
>> > +     && <ALLX:MODE>mode == SImode)
>> > +      emit_insn (gen_aarch64_crc32c<ALLI:crc_data_type> (operands[0],
>> > +                                                      operands[1],
>> > +                                                      operands[2]));
>> > +    /* If the polynomial is the same as the polynomial of crc32*
>> instruction,
>> > +     put that instruction.  crc32 uses HDLC etc.  polynomial.  */
>> > +    else if (TARGET_CRC32 && INTVAL (operands[3]) == 0x04C11DB7
>> > +          && <ALLX:MODE>mode == SImode)
>> > +      emit_insn (gen_aarch64_crc32<ALLI:crc_data_type> (operands[0],
>> > +                                                     operands[1],
>> > +                                                     operands[2]));
>> > +    else if (TARGET_AES && <ALLI:sizen> <= <ALLX:sizen>)
>> > +      aarch64_expand_reversed_crc_using_pmull (<ALLX:MODE>mode,
>> > +                                            <ALLI:MODE>mode,
>> > +                                            operands);
>> > +    else
>> > +      /* Otherwise, generate table-based CRC.  */
>> > +      expand_reversed_crc_table_based (operands[0], operands[1],
>> operands[2],
>> > +                                    operands[3], <ALLI:MODE>mode,
>> > +                                    generate_reflecting_code_standard);
>> > +    DONE;
>> > +  }
>> > +)
>> > +
>> > +;; Bit-forward CRC
>> > +(define_expand "crc<ALLI:mode><ALLX:mode>4"
>> > +  [;; return value (calculated CRC)
>> > +   (match_operand:ALLX 0 "register_operand" "=r")
>> > +   ;; initial CRC
>> > +   (match_operand:ALLX 1 "register_operand" "r")
>> > +   ;; data
>> > +   (match_operand:ALLI 2 "register_operand" "r")
>> > +   ;; polynomial without leading 1
>> > +   (match_operand:ALLX 3)]
>> > +  "TARGET_AES && <ALLI:sizen> <= <ALLX:sizen>"
>> > +  {
>> > +    aarch64_expand_crc_using_pmull (<ALLX:MODE>mode, <ALLI:MODE>mode,
>> > +                                 operands);
>> > +    DONE;
>> > +  }
>> > +)
>> > +
>> >  (define_insn "*csinc2<mode>_insn"
>> >    [(set (match_operand:GPI 0 "register_operand" "=r")
>> >          (plus:GPI (match_operand 2 "aarch64_comparison_operation" "")
>> > diff --git a/gcc/config/aarch64/iterators.md
>> b/gcc/config/aarch64/iterators.md
>> > index 99cde46f1ba..86e4863d684 100644
>> > --- a/gcc/config/aarch64/iterators.md
>> > +++ b/gcc/config/aarch64/iterators.md
>> > @@ -1276,6 +1276,10 @@
>> >  ;; Map a mode to a specific constraint character.
>> >  (define_mode_attr cmode [(QI "q") (HI "h") (SI "s") (DI "d")])
>> >
>> > +;; Map a mode to a specific constraint character for calling
>> > +;; appropriate version of crc.
>> > +(define_mode_attr crc_data_type [(QI "b") (HI "h") (SI "w") (DI "x")])
>> > +
>> >  ;; Map modes to Usg and Usj constraints for SISD right shifts
>> >  (define_mode_attr cmode_simd [(SI "g") (DI "j")])
>> >
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-1-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-1-pmul.c
>> > new file mode 100644
>> > index 00000000000..4043251dbd8
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-1-pmul.c
>> > @@ -0,0 +1,8 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc -fdisable-tree-phiopt2 -fdisable-tree-phiopt3" } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-1.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > \ No newline at end of file
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-10-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-10-pmul.c
>> > new file mode 100644
>> > index 00000000000..dd866b38e83
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-10-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-10.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-12-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-12-pmul.c
>> > new file mode 100644
>> > index 00000000000..16d901eeaef
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-12-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc -fdisable-tree-phiopt2 -fdisable-tree-phiopt3" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-12.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-13-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-13-pmul.c
>> > new file mode 100644
>> > index 00000000000..5f7741fad0f
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-13-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-13.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-14-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-14-pmul.c
>> > new file mode 100644
>> > index 00000000000..cdedbbd3db1
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-14-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-14.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-17-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-17-pmul.c
>> > new file mode 100644
>> > index 00000000000..c219e49a2b1
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-17-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-17.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-18-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-18-pmul.c
>> > new file mode 100644
>> > index 00000000000..124900a979b
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-18-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-18.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-21-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-21-pmul.c
>> > new file mode 100644
>> > index 00000000000..3cae1a7f57b
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-21-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-21.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-22-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-22-pmul.c
>> > new file mode 100644
>> > index 00000000000..0ec2e312f8f
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-22-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-22.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-23-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-23-pmul.c
>> > new file mode 100644
>> > index 00000000000..0c4542adb40
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-23-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-23.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-4-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-4-pmul.c
>> > new file mode 100644
>> > index 00000000000..08f1d3b69d7
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-4-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-4.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-5-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-5-pmul.c
>> > new file mode 100644
>> > index 00000000000..91bf5e6353d
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-5-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -w -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-5.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > \ No newline at end of file
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-6-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-6-pmul.c
>> > new file mode 100644
>> > index 00000000000..4680eafe758
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-6-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-6.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > \ No newline at end of file
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-7-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-7-pmul.c
>> > new file mode 100644
>> > index 00000000000..655484d10d4
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-7-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-7.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-8-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-8-pmul.c
>> > new file mode 100644
>> > index 00000000000..6c2acc84c32
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-8-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-8.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-9-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-9-pmul.c
>> > new file mode 100644
>> > index 00000000000..e76f3c77b59
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-9-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-9.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-CCIT-data16-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-CCIT-data16-pmul.c
>> > new file mode 100644
>> > index 00000000000..21520474564
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-CCIT-data16-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-w -march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-CCIT-data16.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > \ No newline at end of file
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-CCIT-data8-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-CCIT-data8-pmul.c
>> > new file mode 100644
>> > index 00000000000..3dcc92320f3
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-CCIT-data8-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-w -march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto" } } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-CCIT-data8.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > \ No newline at end of file
>> > diff --git
>> a/gcc/testsuite/gcc.target/aarch64/crc-coremark-16bitdata-pmul.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-coremark-16bitdata-pmul.c
>> > new file mode 100644
>> > index 00000000000..e5196aaafef
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-coremark-16bitdata-pmul.c
>> > @@ -0,0 +1,9 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-w -march=armv8-a+crypto -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include "../../gcc.dg/torture/crc-coremark16-data16.c"
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC
>> code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "pmull" "dfinish"} } */
>> > \ No newline at end of file
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-crc32-data16.c
>> b/gcc/testsuite/gcc.target/aarch64/crc-crc32-data16.c
>> > new file mode 100644
>> > index 00000000000..e82cb04fcc3
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-crc32-data16.c
>> > @@ -0,0 +1,53 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crc -O2 -fdump-rtl-dfinish
>> -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include <stdint.h>
>> > +#include <stdlib.h>
>> > +
>> > +__attribute__ ((noinline,optimize(0)))
>> > +uint32_t _crc32_O0 (uint32_t crc, uint16_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 8; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0xEDB88320;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +uint32_t _crc32 (uint32_t crc, uint16_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 8; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0xEDB88320;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +int main ()
>> > +{
>> > +  uint32_t crc = 0x0D800D80;
>> > +  for (uint16_t i = 0; i < 0xffff; i++)
>> > +    {
>> > +      uint32_t res1 = _crc32_O0 (crc, i);
>> > +      uint32_t res2 = _crc32 (crc, i);
>> > +      if (res1 != res2)
>> > +      abort ();
>> > +      crc = res1;
>> > +    }
>> > +}
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "UNSPEC_CRC32" "dfinish"} } */
>> > +/* { dg-final { scan-rtl-dump-times "pmull" 0 "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-crc32-data32.c b/gcc/testsuite/gcc.target/aarch64/crc-crc32-data32.c
>> > new file mode 100644
>> > index 00000000000..a7564a7e28a
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-crc32-data32.c
>> > @@ -0,0 +1,52 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crc -O2 -fdump-rtl-dfinish -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include <stdint.h>
>> > +#include <stdlib.h>
>> > +__attribute__ ((noinline,optimize(0)))
>> > +uint32_t _crc32_O0 (uint32_t crc, uint32_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 32; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0xEDB88320;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +uint32_t _crc32 (uint32_t crc, uint32_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 32; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0xEDB88320;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +int main ()
>> > +{
>> > +  uint32_t crc = 0x0D800D80;
>> > +  for (uint8_t i = 0; i < 0xff; i++)
>> > +    {
>> > +      uint32_t res1 = _crc32_O0 (crc, i);
>> > +      uint32_t res2 = _crc32 (crc, i);
>> > +      if (res1 != res2)
>> > +      abort ();
>> > +      crc = res1;
>> > +    }
>> > +}
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "UNSPEC_CRC32" "dfinish"} } */
>> > +/* { dg-final { scan-rtl-dump-times "pmull" 0 "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-crc32-data8.c b/gcc/testsuite/gcc.target/aarch64/crc-crc32-data8.c
>> > new file mode 100644
>> > index 00000000000..c88cafadedc
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-crc32-data8.c
>> > @@ -0,0 +1,53 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crc -O2 -fdump-rtl-dfinish -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include <stdint.h>
>> > +#include <stdlib.h>
>> > +
>> > +__attribute__ ((noinline,optimize(0)))
>> > +uint32_t _crc32_O0 (uint32_t crc, uint8_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 8; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0xEDB88320;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +uint32_t _crc32 (uint32_t crc, uint8_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 8; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0xEDB88320;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +int main ()
>> > +{
>> > +  uint32_t crc = 0x0D800D80;
>> > +  for (uint8_t i = 0; i < 0xff; i++)
>> > +    {
>> > +      uint32_t res1 = _crc32_O0 (crc, i);
>> > +      uint32_t res2 = _crc32 (crc, i);
>> > +      if (res1 != res2)
>> > +      abort ();
>> > +      crc = res1;
>> > +    }
>> > +}
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "UNSPEC_CRC32" "dfinish"} } */
>> > +/* { dg-final { scan-rtl-dump-times "pmull" 0 "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data16.c b/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data16.c
>> > new file mode 100644
>> > index 00000000000..d82e6252603
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data16.c
>> > @@ -0,0 +1,53 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crc -O2 -fdump-rtl-dfinish -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include <stdint.h>
>> > +#include <stdlib.h>
>> > +
>> > +__attribute__ ((noinline,optimize(0)))
>> > +uint32_t _crc32_O0 (uint32_t crc, uint16_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 8; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0x82F63B78;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +uint32_t _crc32 (uint32_t crc, uint16_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 8; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0x82F63B78;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +int main ()
>> > +{
>> > +  uint32_t crc = 0x0D800D80;
>> > +  for (uint16_t i = 0; i < 0xffff; i++)
>> > +    {
>> > +      uint32_t res1 = _crc32_O0 (crc, i);
>> > +      uint32_t res2 = _crc32 (crc, i);
>> > +      if (res1 != res2)
>> > +      abort ();
>> > +      crc = res1;
>> > +    }
>> > +}
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "UNSPEC_CRC32C" "dfinish"} } */
>> > +/* { dg-final { scan-rtl-dump-times "pmull" 0 "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data32.c b/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data32.c
>> > new file mode 100644
>> > index 00000000000..7acb6fc239c
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data32.c
>> > @@ -0,0 +1,52 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crc -O2 -fdump-rtl-dfinish -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include <stdint.h>
>> > +#include <stdlib.h>
>> > +__attribute__ ((noinline,optimize(0)))
>> > +uint32_t _crc32_O0 (uint32_t crc, uint32_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 32; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0x82F63B78;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +uint32_t _crc32 (uint32_t crc, uint32_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 32; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0x82F63B78;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +int main ()
>> > +{
>> > +  uint32_t crc = 0x0D800D80;
>> > +  for (uint8_t i = 0; i < 0xff; i++)
>> > +    {
>> > +      uint32_t res1 = _crc32_O0 (crc, i);
>> > +      uint32_t res2 = _crc32 (crc, i);
>> > +      if (res1 != res2)
>> > +      abort ();
>> > +      crc = res1;
>> > +    }
>> > +}
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "UNSPEC_CRC32C" "dfinish"} } */
>> > +/* { dg-final { scan-rtl-dump-times "pmull" 0 "dfinish"} } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data8.c b/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data8.c
>> > new file mode 100644
>> > index 00000000000..e8a8901e453
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/crc-crc32c-data8.c
>> > @@ -0,0 +1,53 @@
>> > +/* { dg-do run } */
>> > +/* { dg-options "-march=armv8-a+crc -O2 -fdump-rtl-dfinish -fdump-tree-crc" } */
>> > +/* { dg-skip-if "" { *-*-* } { "-flto"} } */
>> > +
>> > +#include <stdint.h>
>> > +#include <stdlib.h>
>> > +
>> > +__attribute__ ((noinline,optimize(0)))
>> > +uint32_t _crc32_O0 (uint32_t crc, uint8_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 8; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0x82F63B78;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +uint32_t _crc32 (uint32_t crc, uint8_t data) {
>> > +  int i;
>> > +  crc = crc ^ data;
>> > +
>> > +  for (i = 0; i < 8; i++) {
>> > +      if (crc & 1)
>> > +     crc = (crc >> 1) ^ 0x82F63B78;
>> > +      else
>> > +     crc = (crc >> 1);
>> > +    }
>> > +
>> > +  return crc;
>> > +}
>> > +
>> > +int main ()
>> > +{
>> > +  uint32_t crc = 0x0D800D80;
>> > +  for (uint8_t i = 0; i < 0xff; i++)
>> > +    {
>> > +      uint32_t res1 = _crc32_O0 (crc, i);
>> > +      uint32_t res2 = _crc32 (crc, i);
>> > +      if (res1 != res2)
>> > +      abort ();
>> > +      crc = res1;
>> > +    }
>> > +}
>> > +
>> > +/* { dg-final { scan-tree-dump "calculates CRC!" "crc"} } */
>> > +/* { dg-final { scan-tree-dump-times "Couldn't generate faster CRC code." 0 "crc"} } */
>> > +/* { dg-final { scan-rtl-dump "UNSPEC_CRC32C" "dfinish"} } */
>> > +/* { dg-final { scan-rtl-dump-times "pmull" 0 "dfinish"} } */
>>
>

Reply via email to