I couldn't find the patch in my Gmail inbox, so I'll reply here.
>@@ -268,6 +269,7 @@ enum processor_features
> FEATURE_USER_MSR,
> FEATURE_AVX10_1 = 114,
> FEATURE_AVX10_2 = 116,
>+ FEATURE_AVX512BMM,
> FEATURE_AMX_AVX512,
> FEATURE_AMX_TF32,
The newly added feature should be inserted at the end (before CPU_FEATURE_MAX), so that the values of the existing entries that follow it do not change.
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_vbitrevb_epi8 (__mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A,
+ (__v64qi)
+ _mm512_setzero_epi8 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_vbitrevb_epi8 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A,
+ (__v64qi)
+ _mm512_undefined_epi8 (),
You can use _mm512_undefined_epi32 and _mm512_setzero_epi32 directly,
so the new definitions of _mm512_setzero_epi8/_mm512_undefined_epi8 are
not needed.
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_maskz_vbitrevb_epi8 (__mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A,
+ (__v16qi)
+ _mm128_setzero_epi8 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_vbitrevb_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A,
+ (__v16qi)
+ _mm128_undefined_epi8 (),
+ (__mmask16) -1);
+}
The same applies here.
+# AVX512BMM builtins
+DEF_FUNCTION_TYPE (V16QI, V16QI, UHI)
+DEF_FUNCTION_TYPE (V32QI, V32QI, USI)
+DEF_FUNCTION_TYPE (V64QI, V64QI, UDI)
+
I didn't see any builtin defined with V16QI_FTYPE_V16QI_UHI, so is it redundant?
+ case V16QI_FTYPE_V16QI_UHI:
+ case V32QI_FTYPE_V32QI_USI:
+ case V64QI_FTYPE_V64QI_UDI:
+ nargs = 2;
Ditto.
+(define_mode_iterator VI1_AVX512BMM_HI
+ [V32HI (V16HI "TARGET_AVX512VL")])
It would be better to name this VI2_256_512_AVX512VL.
+(define_mode_iterator VI1_AVX512BMM_QI
+ [V64QI (V32QI "TARGET_AVX512VL") (V16QI "TARGET_AVX512VL")])
You can use the existing VI1_AVX512VL; there is no need to define a new mode iterator.
Otherwise, the implementation looks OK.
--
BR,
Hongtao