Hi, this patch adds support for the AMD znver6 processor. For the avx512bmm instructions, it adds intrinsics support. Code-generation and runtime tests will be added in future patches.
Bootstrapped and tested on x86. Thank you, Umesh >From f8b45a4d969f2003fa798b987c95879bbf6c3e0b Mon Sep 17 00:00:00 2001 From: Umesh Kalvakuntla <[email protected]> Date: Sat, 6 Dec 2025 22:26:51 +0530 Subject: [PATCH] x86: Add AMD znver6 processor support For avx512bmm instructions, this patch adds the intrinsics support. Code-generation and runtime tests will be added in the future patches. gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_amd_cpu): Add znver6 1Ah family model numbers. (get_available_features): Set feature AVX512BMM. * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AVX512BMM_SET): New macro. (OPTION_MASK_ISA2_AVX512BMM_UNSET): New macro. (OPTION_MASK_ISA2_AVX512BW_UNSET): Unset AVX512BMM. (ix86_handle_option): Likewise. * common/config/i386/i386-cpuinfo.h (enum processor_subtypes): Add AMDFAM1AH_ZNVER6. (enum processor_features): Add FEATURE_AVX512BMM. * common/config/i386/i386-isas.h: Likewise. * config.gcc: Add avx512bmmintrin.h, avx512bmmvlintrin.h, znver6. * config/i386/cpuid.h (bit_AVX512BMM): New macro. * config/i386/driver-i386.cc (host_detect_local_cpu): Likewise. * config/i386/i386-builtin-types.def (V16QI): New builtin type. (V32QI): Likewise. (V64QI): Likewise. * config/i386/i386-builtin.def (BDESC): Add AVX512BMM builtins. * config/i386/i386-c.cc (ix86_target_macros_internal): Likewise. * config/i386/i386-expand.cc (ix86_expand_args_builtin): Likewise. * config/i386/i386-isa.def (AVX512BMM): Likewise. * config/i386/i386-options.cc (m_ZNVER6): New macro. (m_ZNVER): Add m_ZNVER6. (ix86_valid_target_attribute_inner_p): Likewise. * config/i386/i386.cc (ix86_reassociation_width): Likewise. * config/i386/i386.h (enum processor_type): Likewise. * config/i386/i386.md: Likewise. * config/i386/i386.opt: Likewise. * config/i386/i386.opt.urls: Likewise. * config/i386/immintrin.h: Likewise. * config/i386/sse.md (avx512bmm_vbmacor16x16x16_<mode>): New define_insn. (avx512bmm_vbmacxor16x16x16_<mode>): Likewise. (avx512bmm_vbitrevb_<mode>_mask): Likewise. 
(avx512bmm_vbitrevb_<mode>): Likewise. * config/i386/x86-tune-costs.h (struct processor_costs): Add znver6_cost table (replicates znver5_cost table for now). * config/i386/x86-tune-sched.cc (ix86_issue_rate): Set issue rate to 8. (ix86_adjust_cost): Likewise. * config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add m_ZNVER6. (X86_TUNE_FUSE_MOV_AND_ALU): Likewise. (X86_TUNE_USE_SCATTER_2PARTS): Likewise. (X86_TUNE_USE_SCATTER_4PARTS): Likewise. (X86_TUNE_USE_SCATTER_8PARTS): Likewise. (X86_TUNE_AVOID_256FMA_CHAINS): Likewise. (X86_TUNE_AVOID_512FMA_CHAINS): Likewise. (X86_TUNE_AVX512_MOVE_BY_PIECES): Likewise. * doc/extend.texi: Document avx512bmm. * doc/invoke.texi: Document znver6 and -mavx512bmm. * config/i386/avx512bmmintrin.h: New file. * config/i386/avx512bmmvlintrin.h: New file. gcc/testsuite/ChangeLog: * g++.target/i386/mv29.C: Add znver6. * gcc.target/i386/funcspec-56.inc: Add avx512bmm. * gcc.target/i386/avx512bmm-1.c: New test. * gcc.target/i386/avx512bmm-builtin.c: New test. * gcc.target/i386/avx512bmmvl-1.c: New test. * gcc.target/i386/avx512bmmvl-builtin.c: New test. 
--- gcc/common/config/i386/cpuinfo.h | 25 +++++++ gcc/common/config/i386/i386-common.cc | 26 ++++++- gcc/common/config/i386/i386-cpuinfo.h | 2 + gcc/common/config/i386/i386-isas.h | 1 + gcc/config.gcc | 14 +++- gcc/config/i386/avx512bmmintrin.h | 105 +++++++++++++++++++++++++++ gcc/config/i386/avx512bmmvlintrin.h | 140 ++++++++++++++++++++++++++++++++++++ gcc/config/i386/cpuid.h | 3 + gcc/config/i386/driver-i386.cc | 5 ++ gcc/config/i386/i386-builtin-types.def | 5 ++ gcc/config/i386/i386-builtin.def | 8 +++ gcc/config/i386/i386-c.cc | 9 +++ gcc/config/i386/i386-expand.cc | 5 ++ gcc/config/i386/i386-isa.def | 1 + gcc/config/i386/i386-options.cc | 8 ++- gcc/config/i386/i386.cc | 2 +- gcc/config/i386/i386.h | 3 + gcc/config/i386/i386.md | 2 +- gcc/config/i386/i386.opt | 4 ++ gcc/config/i386/i386.opt.urls | 2 + gcc/config/i386/immintrin.h | 4 ++ gcc/config/i386/sse.md | 60 ++++++++++++++++ gcc/config/i386/x86-tune-costs.h | 154 ++++++++++++++++++++++++++++++++++++++++ gcc/config/i386/x86-tune-sched.cc | 5 ++ gcc/config/i386/x86-tune.def | 18 ++--- gcc/doc/extend.texi | 3 + gcc/doc/invoke.texi | 13 +++- gcc/testsuite/g++.target/i386/mv29.C | 6 ++ gcc/testsuite/gcc.target/i386/avx512bmm-1.c | 26 +++++++ gcc/testsuite/gcc.target/i386/avx512bmm-builtin.c | 26 +++++++ gcc/testsuite/gcc.target/i386/avx512bmmvl-1.c | 35 +++++++++ gcc/testsuite/gcc.target/i386/avx512bmmvl-builtin.c | 34 +++++++++ gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 + 33 files changed, 738 insertions(+), 18 deletions(-) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index dbad4a1dba6..b72eba8fec6 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -319,6 +319,14 @@ get_amd_cpu (struct __processor_model *cpu_model, CHECK___builtin_cpu_is ("znver5"); cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5; } + else if ((model >= 0x50 && model <= 0x5f) || + (model >= 0x80 && model <= 0xcf) || + (model >= 0xd8 && model <= 0xe7)) + { + cpu = 
"znver6"; + CHECK___builtin_cpu_is ("znver6"); + cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER6; + } else if (has_cpu_feature (cpu_model, cpu_features2, FEATURE_AVX512VP2INTERSECT)) { @@ -326,6 +334,13 @@ get_amd_cpu (struct __processor_model *cpu_model, CHECK___builtin_cpu_is ("znver5"); cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5; } + else if (has_cpu_feature (cpu_model, cpu_features2, + FEATURE_AVX512BMM)) + { + cpu = "znver6"; + CHECK___builtin_cpu_is ("znver6"); + cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER6; + } break; default: break; @@ -1049,6 +1064,16 @@ get_available_features (struct __processor_model *cpu_model, } } + /* Get Advanced Features at level 0x21 (eax = 0x21). */ + if (max_cpuid_level >= 0x21) + { + __cpuid (0x21, eax, ebx, ecx, edx); + if (eax & bit_AVX512BMM) + { + set_feature (FEATURE_AVX512BMM); + } + } + /* Get Advanced Features at level 0x24 (eax = 0x24, ecx = 0). */ if (avx10_set && max_cpuid_level >= 0x24) { diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 96136c5f41f..f919ad51de7 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -87,6 +87,7 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA_AVX512BITALG_SET \ (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512BW_SET) #define OPTION_MASK_ISA2_AVX512BF16_SET OPTION_MASK_ISA2_AVX512BF16 +#define OPTION_MASK_ISA2_AVX512BMM_SET OPTION_MASK_ISA2_AVX512BMM #define OPTION_MASK_ISA_RTM_SET OPTION_MASK_ISA_RTM #define OPTION_MASK_ISA_PRFCHW_SET OPTION_MASK_ISA_PRFCHW #define OPTION_MASK_ISA_RDSEED_SET OPTION_MASK_ISA_RDSEED @@ -272,6 +273,7 @@ along with GCC; see the file COPYING3. 
If not see #define OPTION_MASK_ISA_AVX512VPOPCNTDQ_UNSET OPTION_MASK_ISA_AVX512VPOPCNTDQ #define OPTION_MASK_ISA_AVX512BITALG_UNSET OPTION_MASK_ISA_AVX512BITALG #define OPTION_MASK_ISA2_AVX512BF16_UNSET OPTION_MASK_ISA2_AVX512BF16 +#define OPTION_MASK_ISA2_AVX512BMM_UNSET OPTION_MASK_ISA2_AVX512BMM #define OPTION_MASK_ISA_RTM_UNSET OPTION_MASK_ISA_RTM #define OPTION_MASK_ISA_PRFCHW_UNSET OPTION_MASK_ISA_PRFCHW #define OPTION_MASK_ISA_RDSEED_UNSET OPTION_MASK_ISA_RDSEED @@ -393,7 +395,8 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA2_AVX512BW_UNSET \ (OPTION_MASK_ISA2_AVX512BF16_UNSET \ - | OPTION_MASK_ISA2_AVX512FP16_UNSET) + | OPTION_MASK_ISA2_AVX512FP16_UNSET \ + | OPTION_MASK_ISA2_AVX512BMM_UNSET) /* Set 1 << value as value of -malign-FLAG option. */ @@ -938,6 +941,21 @@ ix86_handle_option (struct gcc_options *opts, } return true; + case OPT_mavx512bmm: + if (value) + { + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512BMM_SET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BMM_SET; + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW_SET; + opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_SET; + } + else + { + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AVX512BMM_UNSET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BMM_UNSET; + } + return true; + case OPT_mavxvnni: if (value) { @@ -2151,7 +2169,8 @@ const char *const processor_names[] = "znver2", "znver3", "znver4", - "znver5" + "znver5", + "znver6" }; /* Guarantee that the array is aligned with enum processor_type. 
*/ @@ -2410,6 +2429,9 @@ const pta processor_alias_table[] = {"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5, PTA_ZNVER5, M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F}, + {"znver6", PROCESSOR_ZNVER6, CPU_ZNVER6, + PTA_ZNVER6, + M_CPU_SUBTYPE (AMDFAM1AH_ZNVER6), P_PROC_AVX512F}, {"btver1", PROCESSOR_BTVER1, CPU_GENERIC, PTA_BTVER1, M_CPU_TYPE (AMD_BTVER1), P_PROC_SSE4_A}, diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h index 63357da9bd4..e5e09ae0738 100644 --- a/gcc/common/config/i386/i386-cpuinfo.h +++ b/gcc/common/config/i386/i386-cpuinfo.h @@ -104,6 +104,7 @@ enum processor_subtypes INTEL_COREI7_PANTHERLAKE, ZHAOXIN_FAM7H_YONGFENG, AMDFAM1AH_ZNVER5, + AMDFAM1AH_ZNVER6, ZHAOXIN_FAM7H_SHIJIDADAO, INTEL_COREI7_DIAMONDRAPIDS, INTEL_COREI7_NOVALAKE, @@ -268,6 +269,7 @@ enum processor_features FEATURE_USER_MSR, FEATURE_AVX10_1 = 114, FEATURE_AVX10_2 = 116, + FEATURE_AVX512BMM, FEATURE_AMX_AVX512, FEATURE_AMX_TF32, FEATURE_AMX_FP8 = 120, diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h index fcd3ab280f5..a8511103795 100644 --- a/gcc/common/config/i386/i386-isas.h +++ b/gcc/common/config/i386/i386-isas.h @@ -185,6 +185,7 @@ ISA_NAMES_TABLE_START ISA_NAMES_TABLE_ENTRY("usermsr", FEATURE_USER_MSR, P_NONE, "-musermsr") ISA_NAMES_TABLE_ENTRY("avx10.1", FEATURE_AVX10_1, P_AVX10_1, "-mavx10.1") ISA_NAMES_TABLE_ENTRY("avx10.2", FEATURE_AVX10_2, P_NONE, "-mavx10.2") + ISA_NAMES_TABLE_ENTRY("avx512bmm", FEATURE_AVX512BMM, P_NONE, "-mavx512bmm") ISA_NAMES_TABLE_ENTRY("amx-avx512", FEATURE_AMX_AVX512, P_NONE, "-mamx-avx512") ISA_NAMES_TABLE_ENTRY("amx-tf32", FEATURE_AMX_TF32, P_NONE, "-mamx-tf32") diff --git a/gcc/config.gcc b/gcc/config.gcc index fb465dac147..858d6612eff 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -444,8 +444,8 @@ i[34567]86-*-* | x86_64-*-*) avx512vbmiintrin.h avx512vbmivlintrin.h avx512vpopcntdqintrin.h clwbintrin.h mwaitxintrin.h clzerointrin.h pkuintrin.h sgxintrin.h cetintrin.h 
- gfniintrin.h cet.h avx512vbmi2intrin.h - avx512vbmi2vlintrin.h avx512vnniintrin.h + gfniintrin.h cet.h avx512vbmi2intrin.h avx512bmmintrin.h + avx512bmmvlintrin.h avx512vbmi2vlintrin.h avx512vnniintrin.h avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h avx512vpopcntdqvlintrin.h avx512bitalgintrin.h avx512bitalgvlintrin.h pconfigintrin.h wbnoinvdintrin.h @@ -722,7 +722,7 @@ c7 esther" # 64-bit x86 processors supported by --with-arch=. Each processor # MUST be separated by exactly one space. x86_64_archs="amdfam10 athlon64 athlon64-sse3 barcelona bdver1 bdver2 \ -bdver3 bdver4 znver1 znver2 znver3 znver4 znver5 btver1 btver2 k8 k8-sse3 \ +bdver3 bdver4 znver1 znver2 znver3 znver4 znver5 znver6 btver1 btver2 k8 k8-sse3 \ opteron opteron-sse3 nocona core2 corei7 corei7-avx core-avx-i core-avx2 \ atom slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \ silvermont skylake-avx512 cannonlake icelake-client icelake-server \ @@ -3829,6 +3829,10 @@ case ${target} in arch=znver5 cpu=znver5 ;; + znver6-*) + arch=znver6 + cpu=znver6 + ;; bdver4-*) arch=bdver4 cpu=bdver4 @@ -3974,6 +3978,10 @@ case ${target} in arch=znver5 cpu=znver5 ;; + znver6-*) + arch=znver6 + cpu=znver6 + ;; bdver4-*) arch=bdver4 cpu=bdver4 diff --git a/gcc/config/i386/avx512bmmintrin.h b/gcc/config/i386/avx512bmmintrin.h new file mode 100644 index 00000000000..2436b7aa7a6 --- /dev/null +++ b/gcc/config/i386/avx512bmmintrin.h @@ -0,0 +1,105 @@ +/* Copyright (C) 2025 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use <avx512bmmintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512BMMINTRIN_H_INCLUDED +#define _AVX512BMMINTRIN_H_INCLUDED + +#ifndef __AVX512BMM__ +#pragma GCC push_options +#pragma GCC target("avx512bmm") +#define __DISABLE_AVX512BMM__ +#endif /* __AVX512BMM__ */ + +#define _mm512_undefined_epi8 _mm512_undefined_epi32 + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_epi8 (void) +{ + return __extension__ (__m512i)(__v64qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_vbmacor16x16x16_epi16 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vbmacor16x16x16_v32hi ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_vbmacxor16x16x16_epi16 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vbmacxor16x16x16_v32hi ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __C); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_vbitrevb_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) 
__builtin_ia32_vbitrevb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_vbitrevb_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_epi8 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_vbitrevb_epi8 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A, + (__v64qi) + _mm512_undefined_epi8 (), + (__mmask64) -1); +} + +#ifdef __DISABLE_AVX512BMM__ +#undef __DISABLE_AVX512BMM__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BMM__ */ + +#endif /* _AVX512BMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avx512bmmvlintrin.h b/gcc/config/i386/avx512bmmvlintrin.h new file mode 100644 index 00000000000..b49ce476cb7 --- /dev/null +++ b/gcc/config/i386/avx512bmmvlintrin.h @@ -0,0 +1,140 @@ +/* Copyright (C) 2025 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use <avx512bmmvlintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512BMMVLINTRIN_H_INCLUDED +#define _AVX512BMMVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512BMM__) +#pragma GCC push_options +#pragma GCC target("avx512bmm,avx512vl") +#define __DISABLE_AVX512BMMVL__ +#endif /* __AVX512BMM__ */ + +#define _mm128_undefined_epi8 _mm_avx512_undefined_si128 + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm128_setzero_epi8 (void) +{ + return __extension__ (__m128i)(__v16qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +#define _mm256_undefined_epi8 _mm256_avx512_undefined_si256 + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_epi8 (void) +{ + return __extension__ (__m256i)(__v32qi) { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_vbmacor16x16x16_epi16 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vbmacor16x16x16_v16hi ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_vbmacxor16x16x16_epi16 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vbmacxor16x16x16_v16hi ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __C); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm128_mask_vbitrevb_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm128_maskz_vbitrevb_epi8 (__mmask16 __U, 
__m128i __A) +{ + return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A, + (__v16qi) + _mm128_setzero_epi8 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm128_vbitrevb_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A, + (__v16qi) + _mm128_undefined_epi8 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_vbitrevb_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_vbitrevb_epi8 (__mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A, + (__v32qi) + _mm256_setzero_epi8 (), + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_vbitrevb_epi8 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A, + (__v32qi) + _mm256_undefined_epi8 (), + (__mmask32) -1); +} + +#ifdef __DISABLE_AVX512BMMVL__ +#undef __DISABLE_AVX512BMMVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BMMVL__ */ + +#endif /* _AVX512BMMVLINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 04149c1640e..c03d71979b6 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -167,6 +167,9 @@ #define bit_AESKLE ( 1<<0 ) #define bit_WIDEKL ( 1<<2 ) +/* Sub leaf (%eax == 0x21) */ +#define bit_AVX512BMM ( 1<<23 ) + /* AMX sub leaf (%eax == 0x1e, %ecx == 1) */ /* %eax */ #define bit_AMX_FP8 (1 << 4) diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index b54f0af0a2a..abfcb26ac8c 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -466,6 +466,8 
@@ const char *host_detect_local_cpu (int argc, const char **argv) processor = PROCESSOR_GEODE; else if (has_feature (FEATURE_MOVBE) && family == 22) processor = PROCESSOR_BTVER2; + else if (has_feature (FEATURE_AVX512BMM)) + processor = PROCESSOR_ZNVER6; else if (has_feature (FEATURE_AVX512VP2INTERSECT)) processor = PROCESSOR_ZNVER5; else if (has_feature (FEATURE_AVX512F)) @@ -830,6 +832,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_ZNVER5: cpu = "znver5"; break; + case PROCESSOR_ZNVER6: + cpu = "znver6"; + break; case PROCESSOR_BTVER1: cpu = "btver1"; break; diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 64bde021d11..2f31cdf0069 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1398,6 +1398,11 @@ DEF_FUNCTION_TYPE (V8SF, PCV16HF) DEF_FUNCTION_TYPE (V4SF, PCV8BF) DEF_FUNCTION_TYPE (V8SF, PCV16BF) +# AVX512BMM builtins +DEF_FUNCTION_TYPE (V16QI, V16QI, UHI) +DEF_FUNCTION_TYPE (V32QI, V32QI, USI) +DEF_FUNCTION_TYPE (V64QI, V64QI, UDI) + # CMPccXADD builtins DEF_FUNCTION_TYPE (INT, PINT, INT, INT, INT) DEF_FUNCTION_TYPE (LONGLONG, PLONGLONG, LONGLONG, LONGLONG, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index fe42c64364f..e443c6acb47 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2881,6 +2881,14 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_d BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI) BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, "__builtin_ia32_cvtbf2sf", IX86_BUILTIN_CVTBF2SF, UNKNOWN, (int) FLOAT_FTYPE_BFLOAT16) +/* AVX512BMM. 
*/ +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacor16x16x16_v16hi, "__builtin_ia32_vbmacor16x16x16_v16hi", IX86_BUILTIN_VBMACORV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI) +BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacor16x16x16_v32hi, "__builtin_ia32_vbmacor16x16x16_v32hi", IX86_BUILTIN_VBMACORV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacxor16x16x16_v16hi, "__builtin_ia32_vbmacxor16x16x16_v16hi", IX86_BUILTIN_VBMACXORV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI) +BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacxor16x16x16_v32hi, "__builtin_ia32_vbmacxor16x16x16_v32hi", IX86_BUILTIN_VBMACXORV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v16qi_mask, "__builtin_ia32_vbitrevb128_mask", IX86_BUILTIN_VBITREV16_MASK, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v32qi_mask, "__builtin_ia32_vbitrevb256_mask", IX86_BUILTIN_VBITREV32_MASK, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v64qi_mask, "__builtin_ia32_vbitrevb512_mask", IX86_BUILTIN_VBITREV64_MASK, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI) /* AVX512FP16. 
*/ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index 2d92cee458c..72f9e924712 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -140,6 +140,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__znver5"); def_or_undef (parse_in, "__znver5__"); break; + case PROCESSOR_ZNVER6: + def_or_undef (parse_in, "__znver6"); + def_or_undef (parse_in, "__znver6__"); + break; case PROCESSOR_BTVER1: def_or_undef (parse_in, "__btver1"); def_or_undef (parse_in, "__btver1__"); @@ -386,6 +390,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_ZNVER5: def_or_undef (parse_in, "__tune_znver5__"); break; + case PROCESSOR_ZNVER6: + def_or_undef (parse_in, "__tune_znver6__"); + break; case PROCESSOR_BTVER1: def_or_undef (parse_in, "__tune_btver1__"); break; @@ -537,6 +544,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, ; } + if (isa_flag2 & OPTION_MASK_ISA2_AVX512BMM) + def_or_undef (parse_in, "__AVX512BMM__"); if (isa_flag2 & OPTION_MASK_ISA2_WBNOINVD) def_or_undef (parse_in, "__WBNOINVD__"); if (isa_flag2 & OPTION_MASK_ISA2_AVX512VP2INTERSECT) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index fd9bcaa8541..9a5ab2539d6 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -12623,6 +12623,11 @@ ix86_expand_args_builtin (const struct builtin_description *d, nargs = 2; nargs_constant = 1; break; + case V16QI_FTYPE_V16QI_UHI: + case V32QI_FTYPE_V32QI_USI: + case V64QI_FTYPE_V64QI_UDI: + nargs = 2; + break; case V16QI_FTYPE_V16QI_V16QI_V16QI: case V8SF_FTYPE_V8SF_V8SF_V8SF: case V4DF_FTYPE_V4DF_V4DF_V4DF: diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def index a1d994c66fa..143e2e38f9e 100644 --- 
a/gcc/config/i386/i386-isa.def +++ b/gcc/config/i386/i386-isa.def @@ -120,6 +120,7 @@ DEF_PTA(APX_F) DEF_PTA(USER_MSR) DEF_PTA(AVX10_1) DEF_PTA(AVX10_2) +DEF_PTA(AVX512BMM) DEF_PTA(AMX_AVX512) DEF_PTA(AMX_TF32) DEF_PTA(AMX_FP8) diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 35064d83a00..fd55650c1e6 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -177,11 +177,12 @@ along with GCC; see the file COPYING3. If not see #define m_ZNVER3 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER3) #define m_ZNVER4 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER4) #define m_ZNVER5 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER5) +#define m_ZNVER6 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER6) #define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1) #define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2) #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4) #define m_BTVER (m_BTVER1 | m_BTVER2) -#define m_ZNVER (m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5) +#define m_ZNVER (m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_ZNVER6) #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \ | m_ZNVER) @@ -263,6 +264,7 @@ static struct ix86_target_opts isa2_opts[] = { "-musermsr", OPTION_MASK_ISA2_USER_MSR }, { "-mavx10.1", OPTION_MASK_ISA2_AVX10_1 }, { "-mavx10.2", OPTION_MASK_ISA2_AVX10_2 }, + { "-mavx512bmm", OPTION_MASK_ISA2_AVX512BMM }, { "-mamx-avx512", OPTION_MASK_ISA2_AMX_AVX512 }, { "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 }, { "-mamx-fp8", OPTION_MASK_ISA2_AMX_FP8 }, @@ -811,7 +813,8 @@ static const struct processor_costs *processor_cost_table[] = &znver2_cost, /* PROCESSOR_ZNVER2. */ &znver3_cost, /* PROCESSOR_ZNVER3. */ &znver4_cost, /* PROCESSOR_ZNVER4. */ - &znver5_cost /* PROCESSOR_ZNVER5. */ + &znver5_cost, /* PROCESSOR_ZNVER5. */ + &znver6_cost /* PROCESSOR_ZNVER6. */ }; /* Guarantee that the array is aligned with enum processor_type. 
*/ @@ -1122,6 +1125,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("usermsr", OPT_musermsr), IX86_ATTR_ISA ("avx10.1", OPT_mavx10_1), IX86_ATTR_ISA ("avx10.2", OPT_mavx10_2), + IX86_ATTR_ISA ("avx512bmm", OPT_mavx512bmm), IX86_ATTR_ISA ("amx-avx512", OPT_mamx_avx512), IX86_ATTR_ISA ("amx-tf32", OPT_mamx_tf32), IX86_ATTR_ISA ("amx-fp8", OPT_mamx_fp8), diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index db43045753b..dc8f01b09c1 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -25543,7 +25543,7 @@ ix86_reassociation_width (unsigned int op, machine_mode mode) return 1; /* Znver5 can do 2 integer multiplications per cycle with latency of 3. */ - if (ix86_tune == PROCESSOR_ZNVER5 + if ((ix86_tune == PROCESSOR_ZNVER5 || ix86_tune == PROCESSOR_ZNVER6) && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS) width = 6; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index b93411796af..2355f40f2dc 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2377,6 +2377,7 @@ enum processor_type PROCESSOR_ZNVER3, PROCESSOR_ZNVER4, PROCESSOR_ZNVER5, + PROCESSOR_ZNVER6, PROCESSOR_max }; @@ -2522,6 +2523,8 @@ constexpr wide_int_bitmask PTA_ZNVER4 = PTA_ZNVER3 | PTA_AVX512F | PTA_AVX512DQ | PTA_AVX512VNNI | PTA_AVX512BITALG | PTA_AVX512VPOPCNTDQ; constexpr wide_int_bitmask PTA_ZNVER5 = PTA_ZNVER4 | PTA_AVXVNNI | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_PREFETCHI; +constexpr wide_int_bitmask PTA_ZNVER6 = PTA_ZNVER5 | PTA_AVXVNNIINT8 + | PTA_AVXNECONVERT | PTA_AVX512BMM | PTA_AVXIFMA | PTA_AVX512FP16; constexpr wide_int_bitmask PTA_BTVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_LZCNT | PTA_POPCNT diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index df7135f84d4..11cbd547452 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -530,7 +530,7 @@ (define_attr "cpu" 
"none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem, atom,slm,glm,haswell,generic,lujiazui,yongfeng,amdfam10,bdver1, bdver2,bdver3,bdver4,btver2,znver1,znver2,znver3,znver4, - znver5" + znver5,znver6" (const (symbol_ref "ix86_schedule"))) ;; A basic instruction type. Refinements due to arguments to be diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index c0093ef1243..dbc07a8435b 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1353,6 +1353,10 @@ Target Mask(ISA2_AVX10_2) Var(ix86_isa_flags2) Save Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX10.1 and AVX10.2 built-in functions and code generation. +mavx512bmm +Target Mask(ISA2_AVX512BMM) Var(ix86_isa_flags2) Save +Support AVX512BMM built-in functions and code generation. + mamx-avx512 Target Mask(ISA2_AMX_AVX512) Var(ix86_isa_flags2) Save Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, diff --git a/gcc/config/i386/i386.opt.urls b/gcc/config/i386/i386.opt.urls index 129d91f0c28..831cc889fa4 100644 --- a/gcc/config/i386/i386.opt.urls +++ b/gcc/config/i386/i386.opt.urls @@ -614,3 +614,5 @@ UrlSuffix(gcc/x86-Options.html#index-mmovrs) mamx-movrs UrlSuffix(gcc/x86-Options.html#index-mamx-movrs) +mavx512bmm +UrlSuffix(gcc/x86-Options.html#index-mavx512bmm) diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index f5a11ff4765..643397ac3a4 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -70,6 +70,10 @@ #include <avx512ifmavlintrin.h> +#include <avx512bmmintrin.h> + +#include <avx512bmmvlintrin.h> + #include <avx512vbmiintrin.h> #include <avx512vbmivlintrin.h> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 0be898c789e..aa44f760937 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -251,6 +251,11 @@ UNSPEC_MINMAXBF16 UNSPEC_MINMAX + ;; For AVX512BMM support + UNSPEC_VBMACOR + UNSPEC_VBMACXOR + UNSPEC_VBITREV + ;; For MOVRS suppport UNSPEC_VMOVRS ]) @@ 
-33136,3 +33141,58 @@ (set_attr "prefix" "evex") (set_attr "memory" "load") (set_attr "mode" "<sseinsnmode>")]) + +(define_mode_iterator VI1_AVX512BMM_HI + [V32HI (V16HI "TARGET_AVX512VL")]) + +(define_insn "avx512bmm_vbmacor16x16x16_<mode>" + [(set (match_operand:VI1_AVX512BMM_HI 0 "register_operand" "=v") + (unspec:VI1_AVX512BMM_HI + [(match_operand:VI1_AVX512BMM_HI 1 "register_operand" "0") + (match_operand:VI1_AVX512BMM_HI 2 "register_operand" "v") + (match_operand:VI1_AVX512BMM_HI 3 "nonimmediate_operand" "vm")] + UNSPEC_VBMACOR))] + "TARGET_AVX512BMM" + "vbmacor16x16x16\t{%3, %2, %0|%0, %2, %3}" + [(set_attr ("prefix") ("evex")) + (set_attr "mode" "<sseinsnmode>")]) + + +(define_insn "avx512bmm_vbmacxor16x16x16_<mode>" + [(set (match_operand:VI1_AVX512BMM_HI 0 "register_operand" "=v") + (unspec:VI1_AVX512BMM_HI + [(match_operand:VI1_AVX512BMM_HI 1 "register_operand" "0") + (match_operand:VI1_AVX512BMM_HI 2 "register_operand" "v") + (match_operand:VI1_AVX512BMM_HI 3 "nonimmediate_operand" "vm")] + UNSPEC_VBMACXOR))] + "TARGET_AVX512BMM" + "vbmacxor16x16x16\t{%3, %2, %0|%0, %2, %3}" + [(set_attr ("prefix") ("evex")) + (set_attr "mode" "<sseinsnmode>")]) + +(define_mode_iterator VI1_AVX512BMM_QI + [V64QI (V32QI "TARGET_AVX512VL") (V16QI "TARGET_AVX512VL")]) + +(define_insn "avx512bmm_vbitrevb_<mode>_mask" + [(set (match_operand:VI1_AVX512BMM_QI 0 "register_operand" "=v") + (vec_merge:VI1_AVX512BMM_QI + (unspec:VI1_AVX512BMM_QI + [(match_operand:VI1_AVX512BMM_QI 1 "nonimmediate_operand" "vm")] + UNSPEC_VBITREV) + (match_operand:VI1_AVX512BMM_QI 2 "reg_or_0_operand" "0C") + (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))] + "TARGET_AVX512BMM" + "vbitrevb\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512bmm_vbitrevb_<mode>" + [(set (match_operand:VI1_AVX512BMM_QI 0 "register_operand" "=v") + (unspec:VI1_AVX512BMM_QI + [(match_operand:VI1_AVX512BMM_QI 1 "nonimmediate_operand" 
"vm")] + UNSPEC_VBITREV) + )] + "TARGET_AVX512BMM" + "vbitrevb\t{%1, %0|%0, %1}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index c7a0f6805ca..942ef748a31 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -2402,6 +2402,160 @@ struct processor_costs znver5_cost = { COSTS_N_INSNS (2), /* Branch mispredict scale. */ }; +/* This table currently replicates znver5_cost table. */ +struct processor_costs znver6_cost = { + { + /* Start of register allocator costs. integer->integer move cost is 2. */ + + /* reg-reg moves are done by renaming and thus they are even cheaper than + 1 cycle. Because reg-reg move cost is 2 and following tables correspond + to doubles of latencies, we do not model this correctly. It does not + seem to make practical difference to bump prices up even more. */ + 6, /* cost for loading QImode using + movzbl. */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer + registers. */ + 2, /* cost of reg,reg fld/fst. */ + {14, 14, 17}, /* cost of loading fp registers + in SFmode, DFmode and XFmode. */ + {12, 12, 16}, /* cost of storing fp registers + in SFmode, DFmode and XFmode. */ + 2, /* cost of moving MMX register. */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode. */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode. */ + 2, 2, 3, /* cost of moving XMM,YMM,ZMM + register. */ + {6, 6, 10, 10, 12}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit. */ + {8, 8, 8, 12, 12}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit. */ + 6, 8, /* SSE->integer and integer->SSE + moves. */ + 8, 8, /* mask->integer and integer->mask moves */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. 
*/ + {8, 8, 8}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. */ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + /* TODO: Lea with 3 components has cost 2. */ + COSTS_N_INSNS (1), /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (3), /* SI. */ + COSTS_N_INSNS (3), /* DI. */ + COSTS_N_INSNS (3)}, /* other. */ + 0, /* cost of multiply per each bit + set. */ + {COSTS_N_INSNS (10), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (11), /* HI. */ + COSTS_N_INSNS (13), /* SI. */ + COSTS_N_INSNS (16), /* DI. */ + COSTS_N_INSNS (16)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 9, /* MOVE_RATIO. */ + 6, /* CLEAR_RATIO */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer + registers. */ + {6, 6, 10, 10, 12}, /* cost of loading SSE registers + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {8, 8, 8, 12, 12}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 10, 10, 12}, /* cost of unaligned loads. */ + {8, 8, 8, 12, 12}, /* cost of unaligned stores. */ + 2, 2, 2, /* cost of moving XMM,YMM,ZMM + register. */ + 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ + /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops, + throughput 5. Approx 7 uops do not depend on vector size and every load + is 5 uops. */ + 14, 10, /* Gather load static, per_elt. */ + 14, 20, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 1024, /* size of l2 cache. */ + 64, /* size of prefetch block. 
*/ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches. */ + 3, /* Branch cost. */ + COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (7), /* cost of FMUL instruction. */ + /* Latency of fdiv is 8-15. */ + COSTS_N_INSNS (15), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + /* Latency of fsqrt is 4-10. */ + COSTS_N_INSNS (25), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (3), /* cost of MULSS instruction. */ + COSTS_N_INSNS (3), /* cost of MULSD instruction. */ + COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ + /* 9-13. */ + COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */ + COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */ + COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */ + COSTS_N_INSNS (5), /* cost of 512bit VCVTPS2PD etc. */ + COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */ + COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */ + COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ + COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ + /* Zen5 can execute: + - integer ops: 6 per cycle, at most 3 multiplications. + latency 1 for additions, 3 for multiplications (pipelined) + + Setting width of 9 for multiplication is probably excessive + for register pressure. 
+ - fp ops: 2 additions per cycle, latency 2-3 + 2 multiplicaitons per cycle, latency 3 + - vector intger ops: 4 additions, latency 1 + 2 multiplications, latency 4 + We increase width to 6 for multiplications + in ix86_reassociation_width. */ + 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ + znver2_memcpy, + znver2_memset, + COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ + "16", /* Loop alignment. */ + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ + 4, /* Small unroll limit. */ + 2, /* Small unroll factor. */ + COSTS_N_INSNS (2), /* Branch mispredict scale. */ +}; + /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ static stringop_algs skylake_memcpy[2] = { {libcall, diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index 11b33382ecb..772f7af6541 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -113,6 +113,10 @@ ix86_issue_rate (void) case PROCESSOR_NOVALAKE: return 8; + /* Issue rate we are changing to 8 considering the Dispatch width */ + case PROCESSOR_ZNVER6: + return 8; + default: return 1; } @@ -438,6 +442,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, case PROCESSOR_ZNVER3: case PROCESSOR_ZNVER4: case PROCESSOR_ZNVER5: + case PROCESSOR_ZNVER6: /* Stack engine allows to execute push&pop instructions in parall. 
*/ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index dcd26d59351..c5c0f40358d 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -147,13 +147,14 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND, There is also limitation for immediate and displacement supported. */ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER3 | m_ZNVER4 | m_ZNVER5) + m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 + | m_ZNVER6) /* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov and the destination is used by alu. alu must be one of ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */ DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", - m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D) + m_ZNVER5 | m_ZNVER6 | m_GRANITERAPIDS | m_GRANITERAPIDS_D) /* X86_TUNE_FUSE_AND_BRANCH_MEM: Fuse alu with a subsequent conditional jump instruction when alu contains memory operand. @@ -519,7 +520,7 @@ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2 elements. */ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts", - ~(m_ZNVER4 | m_ZNVER5)) + ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6)) /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4 elements. */ @@ -530,7 +531,7 @@ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4 elements. 
*/ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", - ~(m_ZNVER4 | m_ZNVER5)) + ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6)) /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more elements. */ @@ -541,7 +542,7 @@ DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more elements. */ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", - ~(m_ZNVER4 | m_ZNVER5)) + ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6)) /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or smaller FMA chain. */ @@ -551,13 +552,14 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or smaller FMA chain. */ DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", - m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID + m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_ZNVER6 | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_GRANITERAPIDS | m_GRANITERAPIDS_D | m_DIAMONDRAPIDS | m_CORE_ATOM | m_GENERIC) /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or smaller FMA chain. */ -DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5) +DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5 + | m_ZNVER6) /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd for v2df vector reduction. */ @@ -622,7 +624,7 @@ DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces", /* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit AVX instructions. */ DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces", - m_ZNVER4 | m_ZNVER5) + m_ZNVER4 | m_ZNVER5 | m_ZNVER6) /* X86_TUNE_AVX512_TWO_EPILOGUES: Use two vector epilogues for 512-bit vectorized loops. 
*/ diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 11f6b02db36..0854f7bc683 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -29068,6 +29068,9 @@ AMD Family 19h Zen version 4. @item znver5 AMD Family 1ah Zen version 5. + +@item znver6 +AMD Family 1ah Zen version 6. @end table Here is an example: diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index a6d2b54cc7f..b95a1a125bc 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -1554,7 +1554,7 @@ See RS/6000 and PowerPC Options. -mnoreturn-no-callee-saved-registers -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx -mavx2 -mavx512f -mavx512cd -mavx512vl --mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi -msha -maes +-mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi -mavx512bmm -msha -maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfma -mpconfig -mwbnoinvd -mptwrite -mclflushopt -mclwb -mxsavec -mxsaves -msse4a -m3dnow -m3dnowa -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop @@ -35829,6 +35829,17 @@ AVX512BW, AVX512VL, AVX512BF16, AVX512VBMI, AVX512VBMI2, AVX512VNNI, AVX512BITALG, AVX512VPOPCNTDQ, GFNI, AVXVNNI, MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, PREFETCHI and 64-bit instruction set extensions.) +@item znver6 +AMD Family 1ah core based CPUs with x86-64 instruction set support. (This +supersets BMI, BMI2, CLWB, F16C, FMA, FSGSBASE, AVX, AVX2, ADCX, RDSEED, +MWAITX, SHA, CLZERO, AES, PCLMUL, CX16, MOVBE, MMX, SSE, SSE2, SSE3, SSE4A, +SSSE3, SSE4.1, SSE4.2, ABM, XSAVEC, XSAVES, CLFLUSHOPT, POPCNT, RDPID, +WBNOINVD, PKU, VPCLMULQDQ, VAES, AVX512F, AVX512DQ, AVX512IFMA, AVX512CD, +AVX512BW, AVX512VL, AVX512BF16, AVX512VBMI, AVX512VBMI2, AVX512VNNI, +AVX512BITALG, AVX512VPOPCNTDQ, GFNI, AVXVNNI, MOVDIRI, MOVDIR64B, +AVX512VP2INTERSECT, AVXNECONVERT, AVX512BMM, PREFETCHI and +64-bit instruction set extensions.) + @item btver1 CPUs based on AMD Family 14h cores with x86-64 instruction set support. 
(This supersets MMX, SSE, SSE2, SSE3, SSSE3, SSE4A, CX16, ABM and 64-bit diff --git a/gcc/testsuite/g++.target/i386/mv29.C b/gcc/testsuite/g++.target/i386/mv29.C index ab229534edd..e0abc2a0f91 100644 --- a/gcc/testsuite/g++.target/i386/mv29.C +++ b/gcc/testsuite/g++.target/i386/mv29.C @@ -57,6 +57,10 @@ int __attribute__ ((target("arch=znver5"))) foo () { return 11; } +int __attribute__ ((target("arch=znver6"))) foo () { + return 12; +} + int main () { int val = foo (); @@ -83,6 +87,8 @@ int main () assert (val == 10); else if (__builtin_cpu_is ("znver5")) assert (val == 11); + else if (__builtin_cpu_is ("znver6")) + assert (val == 12); else assert (val == 0); diff --git a/gcc/testsuite/gcc.target/i386/avx512bmm-1.c b/gcc/testsuite/gcc.target/i386/avx512bmm-1.c new file mode 100644 index 00000000000..79b3d50089a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bmm-1.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bmm -O2" } */ +/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +volatile __m512i x,y,z; +volatile __mmask64 m; + +void extern +avx512bmm_test (void) +{ + x = _mm512_vbmacor16x16x16_epi16 (x, y, z); + + x = _mm512_vbmacxor16x16x16_epi16 (x, y, z); + + x = _mm512_vbitrevb_epi8 (x); + + x = _mm512_mask_vbitrevb_epi8 (m, x, y); + + x = _mm512_maskz_vbitrevb_epi8 (m, x); +} diff --git 
a/gcc/testsuite/gcc.target/i386/avx512bmm-builtin.c b/gcc/testsuite/gcc.target/i386/avx512bmm-builtin.c new file mode 100644 index 00000000000..94296480ddf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bmm-builtin.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bmm -O2" } */ +/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ + +typedef char v64qi __attribute__ ((__vector_size__ (64))); +typedef short v32hi __attribute__ ((__vector_size__ (64))); + +v32hi +f1 (v32hi a, v32hi b, v32hi c) +{ + return __builtin_ia32_vbmacor16x16x16_v32hi (a, b, c); +} + +v32hi +f2 (v32hi a, v32hi b, v32hi c) +{ + return __builtin_ia32_vbmacxor16x16x16_v32hi (a, b, c); +} + +v64qi +f3 (v64qi a, v64qi b) +{ + return __builtin_ia32_vbitrevb512_mask (a, b, 3); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bmmvl-1.c b/gcc/testsuite/gcc.target/i386/avx512bmmvl-1.c new file mode 100644 index 00000000000..9128a2588f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bmmvl-1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bmm -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } 
*/ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ + + +#include <immintrin.h> + +volatile __m256i x,y,z; +volatile __m128i x_,y_,z_; +volatile __mmask32 m; +volatile __mmask16 m_; + +void extern +avx512bmm_test (void) +{ + x = _mm256_vbmacor16x16x16_epi16 (x, y, z); + + x = _mm256_vbmacxor16x16x16_epi16 (x, y, z); + + x = _mm256_mask_vbitrevb_epi8 (m, x, y); + x_ = _mm128_mask_vbitrevb_epi8 (m_, x_, y_); + + x = _mm256_maskz_vbitrevb_epi8 (m, y); + x_ = _mm128_maskz_vbitrevb_epi8 (m_, y_); + + x = _mm256_vbitrevb_epi8 (x); + x_ = _mm128_vbitrevb_epi8 (x_); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bmmvl-builtin.c b/gcc/testsuite/gcc.target/i386/avx512bmmvl-builtin.c new file mode 100644 index 00000000000..eed78007429 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bmmvl-builtin.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bmm -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ + +typedef char v32qi __attribute__ ((__vector_size__ (32))); +typedef char v16qi __attribute__ 
((__vector_size__ (16))); +typedef short v16hi __attribute__ ((__vector_size__ (32))); + +v16hi +f1 (v16hi a, v16hi b, v16hi c) +{ + return __builtin_ia32_vbmacor16x16x16_v16hi (a, b, c); +} + +v16hi +f2 (v16hi a, v16hi b, v16hi c) +{ + return __builtin_ia32_vbmacxor16x16x16_v16hi (a, b, c); +} + +v32qi +f3 (v32qi a, v32qi b) +{ + return __builtin_ia32_vbitrevb256_mask (a, b, 3); +} + +v16qi +f4 (v16qi a, v16qi b) +{ + return __builtin_ia32_vbitrevb128_mask (a, b, 3); +} diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc index f56b344b6c8..aa395185bc7 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc @@ -238,6 +238,7 @@ extern void test_arch_znver2 (void) __attribute__((__target__("arch= extern void test_arch_znver3 (void) __attribute__((__target__("arch=znver3"))); extern void test_arch_znver4 (void) __attribute__((__target__("arch=znver4"))); extern void test_arch_znver5 (void) __attribute__((__target__("arch=znver5"))); +extern void test_arch_znver6 (void) __attribute__((__target__("arch=znver6"))); extern void test_tune_nocona (void) __attribute__((__target__("tune=nocona"))); extern void test_tune_core2 (void) __attribute__((__target__("tune=core2"))); @@ -265,6 +266,7 @@ extern void test_tune_znver2 (void) __attribute__((__target__("tune= extern void test_tune_znver3 (void) __attribute__((__target__("tune=znver3"))); extern void test_tune_znver4 (void) __attribute__((__target__("tune=znver4"))); extern void test_tune_znver5 (void) __attribute__((__target__("tune=znver5"))); +extern void test_tune_znver6 (void) __attribute__((__target__("tune=znver6"))); extern void test_fpmath_sse (void) __attribute__((__target__("sse2,fpmath=sse"))); extern void test_fpmath_387 (void) __attribute__((__target__("sse2,fpmath=387"))); -- 2.48.1
