[PATCH] D159250: [X86][RFC] Add new option `-m[no-]evex512` to disable ZMM and 64-bit mask instructions for AVX512 features

Phoebe Wang via Phabricator via cfe-commits Wed, 30 Aug 2023 23:50:46 -0700

pengfei created this revision.
pengfei added reviewers: RKSimon, skan, jyknight, e-kud.
Herald added a subscriber: hiraditya.
Herald added a project: All.
pengfei requested review of this revision.
Herald added projects: clang, LLVM.
Herald added subscribers: llvm-commits, cfe-commits.


This is an alternative of D157485 <https://reviews.llvm.org/D157485> and a 
pre-feature to support AVX10.

AVX10 Architecture Specification: 
https://cdrdv2.intel.com/v1/dl/getContent/784267
AVX10 Technical Paper: https://cdrdv2.intel.com/v1/dl/getContent/784343
RFC: https://discourse.llvm.org/t/rfc-design-for-avx10-feature-support/72661

Based on the feedbacks from LLVM and GCC community, we have agreed to
start from supporting `-m[no-]evex512` on existing AVX512 features.
The option `-mno-evex512` can be used with `-mavx512xxx` to build
binaries that can run on both legacy AVX512 targets and AVX10-256.

There're still arguments about what's the expected behavior when this
option as well as `-mavx512xxx` used together with `-mavx10.1-256`. We
decided to defer the support of `-mavx10.1` after we made consensus.
Or furthermore, we start from supporting AVX10.2 and not providing any
AVX10.1 options.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D159250

Files:
  clang/docs/ReleaseNotes.rst
  clang/include/clang/Driver/Options.td
  clang/lib/Basic/Targets/X86.cpp
  clang/lib/Basic/Targets/X86.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/lib/CodeGen/Targets/X86.cpp
  clang/test/CodeGen/X86/avx512-error.c
  clang/test/CodeGen/attr-cpuspecific.c
  clang/test/CodeGen/attr-target-x86.c
  clang/test/CodeGen/regcall2.c
  clang/test/CodeGen/target-avx-abi-diag.c
  clang/test/Driver/x86-target-features.c
  clang/test/Preprocessor/x86_target_features.c
  llvm/include/llvm/TargetParser/X86TargetParser.def
  llvm/lib/IR/Verifier.cpp
  llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
  llvm/lib/Target/X86/X86.td
  llvm/lib/Target/X86/X86InstrInfo.td
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/lib/Target/X86/X86Subtarget.cpp
  llvm/lib/Target/X86/X86Subtarget.h
  llvm/lib/TargetParser/X86TargetParser.cpp
  llvm/test/CodeGen/X86/avx512bwvl-arith.ll
  llvm/test/CodeGen/X86/avx512vl-arith.ll

Index: llvm/test/CodeGen/X86/avx512vl-arith.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512vl-arith.ll
+++ llvm/test/CodeGen/X86/avx512vl-arith.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl,-evex512 --show-mc-encoding| FileCheck %s
 
 ; 256-bit
 
Index: llvm/test/CodeGen/X86/avx512bwvl-arith.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512bwvl-arith.ll
+++ llvm/test/CodeGen/X86/avx512bwvl-arith.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s
 
 ; 256-bit
 
Index: llvm/lib/TargetParser/X86TargetParser.cpp
===================================================================
--- llvm/lib/TargetParser/X86TargetParser.cpp
+++ llvm/lib/TargetParser/X86TargetParser.cpp
@@ -72,7 +72,7 @@
 constexpr FeatureBitset FeaturesX86_64_V3 =
     FeaturesX86_64_V2 | FeatureAVX2 | FeatureBMI | FeatureBMI2 | FeatureF16C |
     FeatureFMA | FeatureLZCNT | FeatureMOVBE | FeatureXSAVE;
-constexpr FeatureBitset FeaturesX86_64_V4 = FeaturesX86_64_V3 |
+constexpr FeatureBitset FeaturesX86_64_V4 = FeaturesX86_64_V3 | FeatureEVEX512 |
                                             FeatureAVX512BW | FeatureAVX512CD |
                                             FeatureAVX512DQ | FeatureAVX512VL;
 
@@ -96,8 +96,8 @@
 // Intel Knights Landing and Knights Mill
 // Knights Landing has feature parity with Broadwell.
 constexpr FeatureBitset FeaturesKNL =
-    FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureAVX512CD |
-    FeatureAVX512ER | FeatureAVX512PF | FeaturePREFETCHWT1;
+    FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureEVEX512 |
+    FeatureAVX512CD | FeatureAVX512ER | FeatureAVX512PF | FeaturePREFETCHWT1;
 constexpr FeatureBitset FeaturesKNM = FeaturesKNL | FeatureAVX512VPOPCNTDQ;
 
 // Intel Skylake processors.
@@ -107,9 +107,9 @@
 // SkylakeServer inherits all SkylakeClient features except SGX.
 // FIXME: That doesn't match gcc.
 constexpr FeatureBitset FeaturesSkylakeServer =
-    (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureAVX512CD |
-    FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureCLWB |
-    FeaturePKU;
+    (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureEVEX512 |
+    FeatureAVX512CD | FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL |
+    FeatureCLWB | FeaturePKU;
 constexpr FeatureBitset FeaturesCascadeLake =
     FeaturesSkylakeServer | FeatureAVX512VNNI;
 constexpr FeatureBitset FeaturesCooperLake =
@@ -117,9 +117,9 @@
 
 // Intel 10nm processors.
 constexpr FeatureBitset FeaturesCannonlake =
-    FeaturesSkylakeClient | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ |
-    FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI |
-    FeaturePKU | FeatureSHA;
+    FeaturesSkylakeClient | FeatureAVX512F | FeatureEVEX512 | FeatureAVX512CD |
+    FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA |
+    FeatureAVX512VBMI | FeaturePKU | FeatureSHA;
 constexpr FeatureBitset FeaturesICLClient =
     FeaturesCannonlake | FeatureAVX512BITALG | FeatureAVX512VBMI2 |
     FeatureAVX512VNNI | FeatureAVX512VPOPCNTDQ | FeatureGFNI | FeatureRDPID |
@@ -230,11 +230,11 @@
                                                 FeatureINVPCID | FeaturePKU |
                                                 FeatureVAES | FeatureVPCLMULQDQ;
 static constexpr FeatureBitset FeaturesZNVER4 =
-    FeaturesZNVER3 | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ |
-    FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI |
-    FeatureAVX512VBMI2 | FeatureAVX512VNNI | FeatureAVX512BITALG |
-    FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 | FeatureGFNI |
-    FeatureSHSTK;
+    FeaturesZNVER3 | FeatureAVX512F | FeatureEVEX512 | FeatureAVX512CD |
+    FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA |
+    FeatureAVX512VBMI | FeatureAVX512VBMI2 | FeatureAVX512VNNI |
+    FeatureAVX512BITALG | FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 |
+    FeatureGFNI | FeatureSHSTK;
 
 // D151696 tranplanted Mangling and OnlyForCPUDispatchSpecific from
 // X86TargetParser.def to here. They are assigned by following ways:
@@ -542,6 +542,7 @@
 constexpr FeatureBitset ImpliedFeaturesSSE4_2 = FeatureSSE4_1;
 constexpr FeatureBitset ImpliedFeaturesAVX = FeatureSSE4_2;
 constexpr FeatureBitset ImpliedFeaturesAVX2 = FeatureAVX;
+constexpr FeatureBitset ImpliedFeaturesEVEX512 = {};
 constexpr FeatureBitset ImpliedFeaturesAVX512F =
     FeatureAVX2 | FeatureF16C | FeatureFMA;
 
Index: llvm/lib/Target/X86/X86Subtarget.h
===================================================================
--- llvm/lib/Target/X86/X86Subtarget.h
+++ llvm/lib/Target/X86/X86Subtarget.h
@@ -264,7 +264,8 @@
   // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
   // disable them in the legalizer.
   bool useAVX512Regs() const {
-    return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
+    return hasAVX512() && hasEVEX512() &&
+           (canExtendTo512DQ() || RequiredVectorWidth > 256);
   }
 
   bool useLight256BitInstructions() const {
Index: llvm/lib/Target/X86/X86Subtarget.cpp
===================================================================
--- llvm/lib/Target/X86/X86Subtarget.cpp
+++ llvm/lib/Target/X86/X86Subtarget.cpp
@@ -268,6 +268,17 @@
   if (!FS.empty())
     FullFS = (Twine(FullFS) + "," + FS).str();
 
+  // Attach EVEX512 feature when we have AVX512 features and EVEX512 is not set.
+  size_t posNoEVEX512 = FS.rfind("-evex512");
+  size_t posEVEX512 = FS.rfind("+evex512");
+  size_t posAVX512 = FS.rfind("+avx512");
+
+  if (posAVX512 != StringRef::npos) {
+    if ((posNoEVEX512 == StringRef::npos && posEVEX512 == StringRef::npos) ||
+        (posNoEVEX512 != StringRef::npos && posAVX512 > posNoEVEX512))
+    FullFS += ",+evex512";
+  }
+
   // Parse features string and set the CPU.
   ParseSubtargetFeatures(CPU, TuneCPU, FullFS);
 
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1030,7 +1030,14 @@
   bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
       VirtReg, Order, Hints, MF, VRM, Matrix);
 
-  if (RC.getID() != X86::TILERegClassID)
+  unsigned ID = RC.getID();
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+  if ((ID == X86::VK64RegClassID || ID == X86::VK64WMRegClassID) &&
+      Subtarget.hasAVX512() && !Subtarget.hasEVEX512())
+    report_fatal_error(
+        "64-bit mask registers are not supported without EVEX512");
+
+  if (ID != X86::TILERegClassID)
     return BaseImplRetVal;
 
   ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
Index: llvm/lib/Target/X86/X86InstrInfo.td
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.td
+++ llvm/lib/Target/X86/X86InstrInfo.td
@@ -903,6 +903,7 @@
 def HasAVX       : Predicate<"Subtarget->hasAVX()">;
 def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
 def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasEVEX512   : Predicate<"Subtarget->hasEVEX512()">;
 def HasAVX512    : Predicate<"Subtarget->hasAVX512()">;
 def UseAVX       : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
 def UseAVX2      : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
Index: llvm/lib/Target/X86/X86.td
===================================================================
--- llvm/lib/Target/X86/X86.td
+++ llvm/lib/Target/X86/X86.td
@@ -119,6 +119,8 @@
 def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
                        "Support 16-bit floating point conversion instructions",
                        [FeatureAVX]>;
+def FeatureEVEX512  : SubtargetFeature<"evex512", "HasEVEX512", "true",
+                        "Support ZMM and 64-bit mask instructions">;
 def FeatureAVX512   : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512",
                                       "Enable AVX-512 instructions",
                                       [FeatureAVX2, FeatureFMA, FeatureF16C]>;
@@ -817,6 +819,7 @@
   ];
 
   list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
+    FeatureEVEX512,
     FeatureBWI,
     FeatureCDI,
     FeatureDQI,
@@ -940,6 +943,7 @@
                                                   FeatureXSAVES,
                                                   FeatureCLFLUSHOPT,
                                                   FeatureAVX512,
+                                                  FeatureEVEX512,
                                                   FeatureCDI,
                                                   FeatureDQI,
                                                   FeatureBWI,
@@ -982,6 +986,7 @@
 
   // Cannonlake
   list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
+                                                  FeatureEVEX512,
                                                   FeatureCDI,
                                                   FeatureDQI,
                                                   FeatureBWI,
@@ -1262,6 +1267,7 @@
                                         FeatureF16C,
                                         FeatureFSGSBase,
                                         FeatureAVX512,
+                                        FeatureEVEX512,
                                         FeatureERI,
                                         FeatureCDI,
                                         FeaturePFI,
@@ -1471,6 +1477,7 @@
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
   list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
+                                                  FeatureEVEX512,
                                                   FeatureCDI,
                                                   FeatureDQI,
                                                   FeatureBWI,
Index: llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
===================================================================
--- llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -285,6 +285,7 @@
                             SmallVectorImpl<char> &CB) const;
 
   PrefixKind emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
+                                 const MCSubtargetInfo &STI,
                                  SmallVectorImpl<char> &CB) const;
 
   void emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI,
@@ -841,7 +842,7 @@
   // REX prefix is optional, but if used must be immediately before the opcode
   // Encoding type for this instruction.
   return (TSFlags & X86II::EncodingMask)
-             ? emitVEXOpcodePrefix(MemoryOperand, MI, CB)
+             ? emitVEXOpcodePrefix(MemoryOperand, MI, STI, CB)
              : emitOpcodePrefix(MemoryOperand, MI, STI, CB);
 }
 
@@ -860,6 +861,7 @@
 /// \returns the used prefix.
 PrefixKind
 X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
+                                      const MCSubtargetInfo &STI,
                                       SmallVectorImpl<char> &CB) const {
   const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
   uint64_t TSFlags = Desc.TSFlags;
@@ -919,6 +921,9 @@
 
   Prefix.setL(TSFlags & X86II::VEX_L);
   Prefix.setL2(TSFlags & X86II::EVEX_L2);
+  if ((TSFlags & X86II::EVEX_L2) && STI.hasFeature(X86::FeatureAVX512) &&
+      !STI.hasFeature(X86::FeatureEVEX512))
+    report_fatal_error("ZMM registers are not supported without EVEX512");
   switch (TSFlags & X86II::OpPrefixMask) {
   case X86II::PD:
     Prefix.setPP(0x1); // 66
Index: llvm/lib/IR/Verifier.cpp
===================================================================
--- llvm/lib/IR/Verifier.cpp
+++ llvm/lib/IR/Verifier.cpp
@@ -2030,6 +2030,17 @@
               "' does not apply to function return values",
           V);
 
+  unsigned MaxParameterWidth = 0;
+  auto GetMaxParameterWidth = [&MaxParameterWidth](Type *Ty) {
+    if (Ty->isVectorTy()) {
+      if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+        unsigned Size = VT->getPrimitiveSizeInBits().getFixedValue();
+        if (Size > MaxParameterWidth)
+          MaxParameterWidth = Size;
+      }
+    }
+  };
+  GetMaxParameterWidth(FT->getReturnType());
   verifyParameterAttrs(RetAttrs, FT->getReturnType(), V);
 
   // Verify parameter attributes.
@@ -2048,6 +2059,7 @@
     }
 
     verifyParameterAttrs(ArgAttrs, Ty, V);
+    GetMaxParameterWidth(Ty);
 
     if (ArgAttrs.hasAttribute(Attribute::Nest)) {
       Check(!SawNest, "More than one parameter has attribute nest!", V);
@@ -2203,6 +2215,16 @@
       CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V);
   }
 
+  // Check EVEX512 feature.
+  if (MaxParameterWidth >= 512 && Attrs.hasFnAttr("target-features")) {
+    Triple T(M.getTargetTriple());
+    if (T.isX86()) {
+      StringRef TF = Attrs.getFnAttr("target-features").getValueAsString();
+      Check(!TF.contains("+avx512f") || !TF.contains("-evex512"),
+            "512-bit vector arguments require 'evex512' for AVX512", V);
+    }
+  }
+
   checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-prefix", V);
   checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-entry", V);
   checkUnsignedBaseTenFuncAttr(Attrs, "warn-stack-size", V);
Index: llvm/include/llvm/TargetParser/X86TargetParser.def
===================================================================
--- llvm/include/llvm/TargetParser/X86TargetParser.def
+++ llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -240,6 +240,7 @@
 X86_FEATURE       (SM3,             "sm3")
 X86_FEATURE       (SM4,             "sm4")
 X86_FEATURE       (AVXVNNIINT16,    "avxvnniint16")
+X86_FEATURE       (EVEX512,         "evex512")
 // These features aren't really CPU features, but the frontend can set them.
 X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK,    "retpoline-external-thunk")
 X86_FEATURE       (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
Index: clang/test/Preprocessor/x86_target_features.c
===================================================================
--- clang/test/Preprocessor/x86_target_features.c
+++ clang/test/Preprocessor/x86_target_features.c
@@ -64,6 +64,7 @@
 // AVX512F: #define __AVX2__ 1
 // AVX512F: #define __AVX512F__ 1
 // AVX512F: #define __AVX__ 1
+// AVX512F: #define __EVEX512__ 1
 // AVX512F: #define __SSE2_MATH__ 1
 // AVX512F: #define __SSE2__ 1
 // AVX512F: #define __SSE3__ 1
@@ -79,6 +80,7 @@
 // AVX512CD: #define __AVX512CD__ 1
 // AVX512CD: #define __AVX512F__ 1
 // AVX512CD: #define __AVX__ 1
+// AVX512CD: #define __EVEX512__ 1
 // AVX512CD: #define __SSE2_MATH__ 1
 // AVX512CD: #define __SSE2__ 1
 // AVX512CD: #define __SSE3__ 1
@@ -94,6 +96,7 @@
 // AVX512ER: #define __AVX512ER__ 1
 // AVX512ER: #define __AVX512F__ 1
 // AVX512ER: #define __AVX__ 1
+// AVX512ER: #define __EVEX512__ 1
 // AVX512ER: #define __SSE2_MATH__ 1
 // AVX512ER: #define __SSE2__ 1
 // AVX512ER: #define __SSE3__ 1
@@ -109,6 +112,7 @@
 // AVX512PF: #define __AVX512F__ 1
 // AVX512PF: #define __AVX512PF__ 1
 // AVX512PF: #define __AVX__ 1
+// AVX512PF: #define __EVEX512__ 1
 // AVX512PF: #define __SSE2_MATH__ 1
 // AVX512PF: #define __SSE2__ 1
 // AVX512PF: #define __SSE3__ 1
@@ -124,6 +128,7 @@
 // AVX512DQ: #define __AVX512DQ__ 1
 // AVX512DQ: #define __AVX512F__ 1
 // AVX512DQ: #define __AVX__ 1
+// AVX512DQ: #define __EVEX512__ 1
 // AVX512DQ: #define __SSE2_MATH__ 1
 // AVX512DQ: #define __SSE2__ 1
 // AVX512DQ: #define __SSE3__ 1
@@ -139,6 +144,7 @@
 // AVX512BW: #define __AVX512BW__ 1
 // AVX512BW: #define __AVX512F__ 1
 // AVX512BW: #define __AVX__ 1
+// AVX512BW: #define __EVEX512__ 1
 // AVX512BW: #define __SSE2_MATH__ 1
 // AVX512BW: #define __SSE2__ 1
 // AVX512BW: #define __SSE3__ 1
@@ -154,6 +160,7 @@
 // AVX512VL: #define __AVX512F__ 1
 // AVX512VL: #define __AVX512VL__ 1
 // AVX512VL: #define __AVX__ 1
+// AVX512VL: #define __EVEX512__ 1
 // AVX512VL: #define __SSE2_MATH__ 1
 // AVX512VL: #define __SSE2__ 1
 // AVX512VL: #define __SSE3__ 1
@@ -168,6 +175,7 @@
 // AVX512F2: #define __AVX2__ 1
 // AVX512F2-NOT: #define __AVX512F__ 1
 // AVX512F2-NOT: #define __AVX512PF__ 1
+// AVX512F2-NOT: #define __EVEX512__ 1
 // AVX512F2: #define __AVX__ 1
 // AVX512F2: #define __SSE2_MATH__ 1
 // AVX512F2: #define __SSE2__ 1
@@ -184,6 +192,7 @@
 // AVX512IFMA: #define __AVX512F__ 1
 // AVX512IFMA: #define __AVX512IFMA__ 1
 // AVX512IFMA: #define __AVX__ 1
+// AVX512IFMA: #define __EVEX512__ 1
 // AVX512IFMA: #define __SSE2_MATH__ 1
 // AVX512IFMA: #define __SSE2__ 1
 // AVX512IFMA: #define __SSE3__ 1
@@ -200,6 +209,7 @@
 // AVX512VBMI: #define __AVX512F__ 1
 // AVX512VBMI: #define __AVX512VBMI__ 1
 // AVX512VBMI: #define __AVX__ 1
+// AVX512VBMI: #define __EVEX512__ 1
 // AVX512VBMI: #define __SSE2_MATH__ 1
 // AVX512VBMI: #define __SSE2__ 1
 // AVX512VBMI: #define __SSE3__ 1
@@ -216,6 +226,7 @@
 // AVX512BITALG: #define __AVX512BW__ 1
 // AVX512BITALG: #define __AVX512F__ 1
 // AVX512BITALG: #define __AVX__ 1
+// AVX512BITALG: #define __EVEX512__ 1
 // AVX512BITALG: #define __SSE2_MATH__ 1
 // AVX512BITALG: #define __SSE2__ 1
 // AVX512BITALG: #define __SSE3__ 1
@@ -230,6 +241,7 @@
 
 // AVX512VBMINOAVX512BW-NOT: #define __AVX512BW__ 1
 // AVX512VBMINOAVX512BW-NOT: #define __AVX512VBMI__ 1
+// AVX512VBMINOAVX512BW: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512vbmi2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512VBMI2 %s
 
@@ -238,6 +250,7 @@
 // AVX512VBMI2: #define __AVX512F__ 1
 // AVX512VBMI2: #define __AVX512VBMI2__ 1
 // AVX512VBMI2: #define __AVX__ 1
+// AVX512VBMI2: #define __EVEX512__ 1
 // AVX512VBMI2: #define __SSE2_MATH__ 1
 // AVX512VBMI2: #define __SSE2__ 1
 // AVX512VBMI2: #define __SSE3__ 1
@@ -251,11 +264,13 @@
 
 // AVX512VBMI2NOAVX512BW-NOT: #define __AVX512BW__ 1
 // AVX512VBMI2NOAVX512BW-NOT: #define __AVX512VBMI2__ 1
+// AVX512VBMI2NOAVX512BW: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bitalg -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512BITALGNOAVX512BW %s
 
 // AVX512BITALGNOAVX512BW-NOT: #define __AVX512BITALG__ 1
 // AVX512BITALGNOAVX512BW-NOT: #define __AVX512BW__ 1
+// AVX512BITALGNOAVX512BW: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -msse4.2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE42POPCNT %s
 
@@ -467,24 +482,29 @@
 // AVX512BF16: #define __AVX512BF16__ 1
 // AVX512BF16: #define __AVX512BW__ 1
 // AVX512BF16-NOT: #define __AVX512VL__ 1
+// AVX512BF16: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bf16 -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512BF16_NOAVX512BW %s
 
 // AVX512BF16_NOAVX512BW-NOT: #define __AVX512BF16__ 1
+// AVX512BF16_NOAVX512BW: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bf16 -mno-avx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512BF16_NOAVX512VL %s
 
 // AVX512BF16_NOAVX512VL: #define __AVX512BF16__ 1
+// AVX512BF16_NOAVX512VL: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavx512vp2intersect -x c -E -dM -o - %s | FileCheck  -check-prefix=VP2INTERSECT %s
 
 // VP2INTERSECT: #define __AVX512F__ 1
 // VP2INTERSECT: #define __AVX512VP2INTERSECT__ 1
+// VP2INTERSECT: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avx512vp2intersect -x c -E -dM -o - %s | FileCheck  -check-prefix=NOVP2INTERSECT %s
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavx512vp2intersect -mno-avx512f -x c -E -dM -o - %s | FileCheck  -check-prefix=NOVP2INTERSECT %s
 
 // NOVP2INTERSECT-NOT: #define __AVX512VP2INTERSECT__ 1
+// NOVP2INTERSECT-NOT: #define __EVEX512__ 1
 
 
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mkl -x c -E -dM -o - %s | FileCheck  -check-prefix=KEYLOCKER %s
@@ -591,21 +611,37 @@
 // AVX512FP16: #define __AVX512DQ__ 1
 // AVX512FP16: #define __AVX512FP16__ 1
 // AVX512FP16: #define __AVX512VL__ 1
+// AVX512FP16: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512VL %s
 
 // AVX512FP16NOAVX512VL-NOT: #define __AVX512FP16__ 1
 // AVX512FP16NOAVX512VL-NOT: #define __AVX512VL__ 1
+// AVX512FP16NOAVX512VL: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512BW %s
 
 // AVX512FP16NOAVX512BW-NOT: #define __AVX512BW__ 1
 // AVX512FP16NOAVX512BW-NOT: #define __AVX512FP16__ 1
+// AVX512FP16NOAVX512BW: #define __EVEX512__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512DQ %s
 
 // AVX512FP16NOAVX512DQ-NOT: #define __AVX512DQ__ 1
 // AVX512FP16NOAVX512DQ-NOT: #define __AVX512FP16__ 1
+// AVX512FP16NOAVX512DQ: #define __EVEX512__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
+// NOEVEX512-NOT: #define __AVX512F__ 1
+// NOEVEX512-NOT: #define __EVEX512__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
+// AVX512NOEVEX512: #define __AVX512F__ 1
+// AVX512NOEVEX512-NOT: #define __EVEX512__ 1
 
 // RUN: %clang -target x86_64-unknown-linux-gnu -march=atom -mcmpccxadd -x c -E -dM -o - %s | FileCheck  -check-prefix=CMPCCXADD %s
 
Index: clang/test/Driver/x86-target-features.c
===================================================================
--- clang/test/Driver/x86-target-features.c
+++ clang/test/Driver/x86-target-features.c
@@ -369,6 +369,11 @@
 // AVXVNNIINT16: "-target-feature" "+avxvnniint16"
 // NO-AVXVNNIINT16: "-target-feature" "-avxvnniint16"
 
+// RUN: %clang --target=i386 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EVEX512 %s
+// RUN: %clang --target=i386 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-EVEX512 %s
+// EVEX512: "-target-feature" "+evex512"
+// NO-EVEX512: "-target-feature" "-evex512"
+
 // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s
 // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s
 // CRC32: "-target-feature" "+crc32"
Index: clang/test/CodeGen/target-avx-abi-diag.c
===================================================================
--- clang/test/CodeGen/target-avx-abi-diag.c
+++ clang/test/CodeGen/target-avx-abi-diag.c
@@ -1,6 +1,8 @@
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -verify=no256,no512 -o - -S
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx -verify=no512 -o - -S
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -verify=both -o - -S
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature +evex512 -verify=both -o - -S
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -DAVX512_256 -verify=avx512-256 -o - -S
 // REQUIRES: x86-registered-target
 
 // both-no-diagnostics
@@ -16,6 +18,7 @@
 void variadic(int i, ...);
 __attribute__((target("avx512f"))) void variadic_err(int i, ...);
 
+#ifndef AVX512_256
 // If neither side has an attribute, warn.
 void call_warn(void) {
   avx256Type t1;
@@ -27,15 +30,18 @@
   variadic(1, t1); // no256-warning {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}}
   variadic(3, t2); // no512-warning {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
 }
+#endif
 
 // If only 1 side has an attribute, error.
 void call_errors(void) {
   avx256Type t1;
   takesAvx256(t1); // no256-error {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}}
   avx512fType t2;
+  // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
   takesAvx512(t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
 
   variadic_err(1, t1); // no256-error {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}}
+  // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
   variadic_err(3, t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
 }
 
Index: clang/test/CodeGen/regcall2.c
===================================================================
--- clang/test/CodeGen/regcall2.c
+++ clang/test/CodeGen/regcall2.c
@@ -21,8 +21,8 @@
 // FIXME: Do we need to change for Windows?
 // Win: define dso_local x86_regcallcc void @__regcall3__foo(ptr noalias sret(%struct.__sVector) align 64 %agg.result, i32 noundef %a) #0
 // Win: define dso_local x86_regcallcc double @__regcall3__bar(ptr noundef %a) #0
-// Win: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" }
+// Win: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" }
 
 // Lin: define dso_local x86_regcallcc %struct.__sVector @__regcall3__foo(i32 noundef %a) #0
 // Lin: define dso_local x86_regcallcc double @__regcall3__bar([4 x <8 x double>] %a.coerce0, [4 x <16 x float>] %a.coerce1) #0
-// Lin: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="512" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" }
+// Lin: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="512" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" }
Index: clang/test/CodeGen/attr-target-x86.c
===================================================================
--- clang/test/CodeGen/attr-target-x86.c
+++ clang/test/CodeGen/attr-target-x86.c
@@ -69,4 +69,4 @@
 // CHECK: "target-cpu"="x86-64-v3"
 // CHECK-SAME: "target-features"="+avx,+avx2,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave"
 // CHECK: "target-cpu"="x86-64-v4"
-// CHECK-SAME: "target-features"="+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave"
+// CHECK-SAME: "target-features"="+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave"
Index: clang/test/CodeGen/attr-cpuspecific.c
===================================================================
--- clang/test/CodeGen/attr-cpuspecific.c
+++ clang/test/CodeGen/attr-cpuspecific.c
@@ -353,7 +353,7 @@
 
 // CHECK: attributes #[[S]] = {{.*}}"target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
 // CHECK-SAME: "tune-cpu"="ivybridge"
-// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
+// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
 // CHECK-SAME: "tune-cpu"="knl"
 // CHECK: attributes #[[O]] = {{.*}}"target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87"
 // CHECK-SAME: "tune-cpu"="atom"
Index: clang/test/CodeGen/X86/avx512-error.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/X86/avx512-error.c
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512dq -target-feature -evex512 -emit-llvm -verify
+
+#include <immintrin.h>
+
+__m512d test_mm512_sqrt_pd(__m512d a)
+{
+  // CHECK-LABEL: @test_mm512_sqrt_pd
+  return __builtin_ia32_sqrtpd512(a, _MM_FROUND_CUR_DIRECTION); // expected-error {{'__builtin_ia32_sqrtpd512' needs target feature evex512}}
+}
Index: clang/lib/CodeGen/Targets/X86.cpp
===================================================================
--- clang/lib/CodeGen/Targets/X86.cpp
+++ clang/lib/CodeGen/Targets/X86.cpp
@@ -1508,6 +1508,24 @@
   return false;
 }
 
+static bool checkAVX512ParamFeature(DiagnosticsEngine &Diag,
+                                    SourceLocation CallLoc,
+                                    const llvm::StringMap<bool> &CallerMap,
+                                    const llvm::StringMap<bool> &CalleeMap,
+                                    QualType Ty, bool IsArgument) {
+  bool Caller256 = CallerMap.lookup("avx512f") && !CallerMap.lookup("evex512");
+  bool Callee256 = CallerMap.lookup("avx512f") && !CallerMap.lookup("evex512");
+
+  // Forbid 512-bit or larger vector pass or return when we disabled ZMM
+  // instructions.
+  if (Caller256 || Callee256)
+    return Diag.Report(CallLoc, diag::err_avx_calling_convention)
+           << IsArgument << Ty << "evex512";
+
+  return checkAVXParamFeature(Diag, CallLoc, CallerMap, CalleeMap, Ty,
+                              "avx512f", IsArgument);
+}
+
 static bool checkAVXParam(DiagnosticsEngine &Diag, ASTContext &Ctx,
                           SourceLocation CallLoc,
                           const llvm::StringMap<bool> &CallerMap,
@@ -1515,8 +1533,8 @@
                           bool IsArgument) {
   uint64_t Size = Ctx.getTypeSize(Ty);
   if (Size > 256)
-    return checkAVXParamFeature(Diag, CallLoc, CallerMap, CalleeMap, Ty,
-                                "avx512f", IsArgument);
+    return checkAVX512ParamFeature(Diag, CallLoc, CallerMap, CalleeMap, Ty,
+                                   IsArgument);
 
   if (Size > 128)
     return checkAVXParamFeature(Diag, CallLoc, CallerMap, CalleeMap, Ty, "avx",
Index: clang/lib/CodeGen/CodeGenFunction.h
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.h
+++ clang/lib/CodeGen/CodeGenFunction.h
@@ -4067,6 +4067,8 @@
 
   void checkTargetFeatures(const CallExpr *E, const FunctionDecl *TargetDecl);
   void checkTargetFeatures(SourceLocation Loc, const FunctionDecl *TargetDecl);
+  void checkTargetVectorWidth(const CallExpr *E, const FunctionDecl *TargetDecl,
+                              unsigned VectorWidth);
 
   llvm::CallInst *EmitRuntimeCall(llvm::FunctionCallee callee,
                                   const Twine &name = "");
Index: clang/lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.cpp
+++ clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2573,6 +2573,20 @@
   return checkTargetFeatures(E->getBeginLoc(), TargetDecl);
 }
 
+// Emits an error if the builtin's vector width >= 512 and evex512 feature is
+// not set.
+void CodeGenFunction::checkTargetVectorWidth(const CallExpr *E,
+                                             const FunctionDecl *TargetDecl,
+                                             unsigned VectorWidth) {
+  if (!getTarget().getTriple().isX86() || VectorWidth < 512)
+    return;
+  llvm::StringMap<bool> FeatureMap;
+  CGM.getContext().getFunctionFeatureMap(FeatureMap, TargetDecl);
+  if (FeatureMap.lookup("avx512f") && !FeatureMap.lookup("evex512"))
+    CGM.getDiags().Report(E->getBeginLoc(), diag::err_builtin_needs_feature)
+        << TargetDecl->getDeclName() << "evex512";
+}
+
 // Emits an error if we don't have a valid set of target features for the
 // called function.
 void CodeGenFunction::checkTargetFeatures(SourceLocation Loc,
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -5471,8 +5471,11 @@
   // can move this up to the beginning of the function.
   checkTargetFeatures(E, FD);
 
-  if (unsigned VectorWidth = getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID))
+  if (unsigned VectorWidth =
+          getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID)) {
+    checkTargetVectorWidth(E, FD, VectorWidth);
     LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth);
+  }
 
   // See if we have a target specific intrinsic.
   StringRef Name = getContext().BuiltinInfo.getName(BuiltinID);
Index: clang/lib/Basic/Targets/X86.h
===================================================================
--- clang/lib/Basic/Targets/X86.h
+++ clang/lib/Basic/Targets/X86.h
@@ -95,6 +95,7 @@
   bool HasLWP = false;
   bool HasFMA = false;
   bool HasF16C = false;
+  bool HasEVEX512 = false;
   bool HasAVX512CD = false;
   bool HasAVX512VPOPCNTDQ = false;
   bool HasAVX512VNNI = false;
Index: clang/lib/Basic/Targets/X86.cpp
===================================================================
--- clang/lib/Basic/Targets/X86.cpp
+++ clang/lib/Basic/Targets/X86.cpp
@@ -119,6 +119,7 @@
     setFeatureEnabled(Features, F, true);
 
   std::vector<std::string> UpdatedFeaturesVec;
+  bool HasEVEX512 = false;
   for (const auto &Feature : FeaturesVec) {
     // Expand general-regs-only to -x86, -mmx and -sse
     if (Feature == "+general-regs-only") {
@@ -128,8 +129,17 @@
       continue;
     }
 
+    // AVX512F will enable EVEX512.
+    if (!HasEVEX512 && Feature.substr(0, 7) == "+avx512")
+      HasEVEX512 = true;
+    // Disable AVX512F will disable EVEX512.
+    if (HasEVEX512 && (Feature == "-avx512f" || Feature == "-evex512"))
+      HasEVEX512 = false;
+
     UpdatedFeaturesVec.push_back(Feature);
   }
+  if (HasEVEX512)
+    UpdatedFeaturesVec.push_back("+evex512");
 
   if (!TargetInfo::initFeatureMap(Features, Diags, CPU, UpdatedFeaturesVec))
     return false;
@@ -228,6 +238,8 @@
       HasF16C = true;
     } else if (Feature == "+gfni") {
       HasGFNI = true;
+    } else if (Feature == "+evex512") {
+      HasEVEX512 = true;
     } else if (Feature == "+avx512cd") {
       HasAVX512CD = true;
     } else if (Feature == "+avx512vpopcntdq") {
@@ -731,6 +743,8 @@
   if (HasGFNI)
     Builder.defineMacro("__GFNI__");
 
+  if (HasEVEX512)
+    Builder.defineMacro("__EVEX512__");
   if (HasAVX512CD)
     Builder.defineMacro("__AVX512CD__");
   if (HasAVX512VPOPCNTDQ)
@@ -986,6 +1000,7 @@
       .Case("crc32", true)
       .Case("cx16", true)
       .Case("enqcmd", true)
+      .Case("evex512", true)
       .Case("f16c", true)
       .Case("fma", true)
       .Case("fma4", true)
@@ -1093,6 +1108,7 @@
       .Case("cx8", HasCX8)
       .Case("cx16", HasCX16)
       .Case("enqcmd", HasENQCMD)
+      .Case("evex512", HasEVEX512)
       .Case("f16c", HasF16C)
       .Case("fma", HasFMA)
       .Case("fma4", XOPLevel >= FMA4)
@@ -1533,8 +1549,9 @@
       return Size <= 64;
     case 'z':
       // XMM0/YMM/ZMM0
-      if (hasFeatureEnabled(FeatureMap, "avx512f"))
-        // ZMM0 can be used if target supports AVX512F.
+      if (hasFeatureEnabled(FeatureMap, "avx512f") &&
+          hasFeatureEnabled(FeatureMap, "evex512"))
+        // ZMM0 can be used if target supports AVX512F and EVEX512 is set.
         return Size <= 512U;
       else if (hasFeatureEnabled(FeatureMap, "avx"))
         // YMM0 can be used if target supports AVX.
@@ -1553,8 +1570,10 @@
     break;
   case 'v':
   case 'x':
-    if (hasFeatureEnabled(FeatureMap, "avx512f"))
-      // 512-bit zmm registers can be used if target supports AVX512F.
+    if (hasFeatureEnabled(FeatureMap, "avx512f") &&
+        hasFeatureEnabled(FeatureMap, "evex512"))
+      // 512-bit zmm registers can be used if target supports AVX512F and
+      // EVEX512 is set.
       return Size <= 512U;
     else if (hasFeatureEnabled(FeatureMap, "avx"))
       // 256-bit ymm registers can be used if target supports AVX.
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -5733,6 +5733,8 @@
 def mno_cx16 : Flag<["-"], "mno-cx16">, Group<m_x86_Features_Group>;
 def menqcmd : Flag<["-"], "menqcmd">, Group<m_x86_Features_Group>;
 def mno_enqcmd : Flag<["-"], "mno-enqcmd">, Group<m_x86_Features_Group>;
+def mevex512 : Flag<["-"], "mevex512">, Group<m_x86_Features_Group>;
+def mno_evex512 : Flag<["-"], "mno-evex512">, Group<m_x86_Features_Group>;
 def mf16c : Flag<["-"], "mf16c">, Group<m_x86_Features_Group>;
 def mno_f16c : Flag<["-"], "mno-f16c">, Group<m_x86_Features_Group>;
 def mfma : Flag<["-"], "mfma">, Group<m_x86_Features_Group>;
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -264,6 +264,9 @@
 X86 Support
 ^^^^^^^^^^^
 
+- Added option ``-m[no-]evex512`` to disable ZMM and 64-bit mask instructions
+  for AVX512 features.
+
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D159250: [X86][RFC] Add new option `-m[no-]evex512` to disable ZMM and 64-bit mask instructions for AVX512 features

Reply via email to