llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-x86 Author: None (yubingex007-a11y) <details> <summary>Changes</summary> In https://reviews.llvm.org/D125075, we switched to use FastPreTileConfig in O0 and abandoned X86PreAMXConfigPass. --- Patch is 34.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69569.diff 9 Files Affected: - (modified) clang/docs/tools/clang-formatted-files.txt (-1) - (modified) llvm/include/llvm/CodeGen/Passes.h (-3) - (modified) llvm/lib/Target/X86/CMakeLists.txt (-1) - (modified) llvm/lib/Target/X86/X86.h (-1) - (removed) llvm/lib/Target/X86/X86PreAMXConfig.cpp (-415) - (modified) llvm/lib/Target/X86/X86TargetMachine.cpp (-1) - (removed) llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll (-178) - (modified) llvm/tools/opt/opt.cpp (-1) - (modified) llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn (-1) ``````````diff diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt index 16f84727117e28d..48cd800bffd0046 100644 --- a/clang/docs/tools/clang-formatted-files.txt +++ b/clang/docs/tools/clang-formatted-files.txt @@ -6813,7 +6813,6 @@ llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86LowerTileCopy.cpp -llvm/lib/Target/X86/X86PreAMXConfig.cpp llvm/lib/Target/X86/X86PreTileConfig.cpp llvm/lib/Target/X86/X86RegisterBankInfo.h llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 598c0b838c1b97d..8d14eef949e91b4 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -584,9 +584,6 @@ namespace llvm { /// or split the data to two <128 x i32>. FunctionPass *createX86LowerAMXTypePass(); - /// The pass insert tile config intrinsics for AMX fast register allocation. - FunctionPass *createX86PreAMXConfigPass(); - /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. FunctionPass *createX86LowerAMXIntrinsicsPass(); diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index c387d59ea981a52..0b7a98ad6341dde 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -33,7 +33,6 @@ set(sources X86DiscriminateMemOps.cpp X86LowerTileCopy.cpp X86LowerAMXType.cpp - X86PreAMXConfig.cpp X86LowerAMXIntrinsics.cpp X86TileConfig.cpp X86FastPreTileConfig.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 3c5ca0788498032..485afbc1dfbc241 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -194,7 +194,6 @@ void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); void initializeX86PartialReductionPass(PassRegistry &); -void initializeX86PreAMXConfigPassPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86PreAMXConfig.cpp b/llvm/lib/Target/X86/X86PreAMXConfig.cpp deleted file mode 100644 index 7872a64061d438c..000000000000000 --- a/llvm/lib/Target/X86/X86PreAMXConfig.cpp +++ /dev/null @@ -1,415 +0,0 @@ -//===- Target/X86/X86PreAMXConfig.cpp - ------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// Insert tilecfg for each area of key AMX intrinsic. -/// All the key AMX intrinsic's tile operand must come from tileload. And the -/// def tile of key AMX intrinsic must be tilestored. -/// take tdpbssd for example: -/// -------------------------------------------------------------------------- -/// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(...) key -/// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(...) | -/// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(...) amx -/// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(t1, t2, t3) | -/// call void @llvm.x86.tilestored64.internal(... td) area -/// -------------------------------------------------------------------------- -/// This pass will insert tilecfg before every key-amx-area, some like: -/// -------------------------------------------------------------------------- -/// %cfgmem = alloca <16 x i32>, align 4 * allocate mem -/// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init -/// ... -/// ... pre-config shape of %t1 * -/// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * -/// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config -/// ... * -/// ... pre-config shape of %t2 * shapes -/// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * -/// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * -/// ... -/// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * tile config -// -//===----------------------------------------------------------------------===// -// -#include "X86.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsX86.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; -using namespace PatternMatch; - -#define DEBUG_TYPE "pre-amx-config" - -static bool isAMXIntrinsic(IntrinsicInst *II) { - for (Value *Operand : II->operands()) - if (Operand->getType()->isX86_AMXTy()) - return true; - return II->getType()->isX86_AMXTy(); -} - -static bool isTileLoad(IntrinsicInst *II) { - return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal || - II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal; -} - -static bool isTileStore(IntrinsicInst *II) { - return II->getIntrinsicID() == Intrinsic::x86_tilestored64_internal; -} - -#ifndef NDEBUG -static bool onlyTileDef(IntrinsicInst *II) { - for (Value *Operand : II->operands()) - if (Operand->getType()->isX86_AMXTy()) - return false; - return II->getType()->isX86_AMXTy(); -} - -static bool brokenVolatile(Instruction *I) { - // Todo: it is weak to identify a normal call here. - if ((isa<CallInst>(I) && !isa<IntrinsicInst>(I)) || I->isTerminator()) - return true; - return false; -} -#endif - -namespace { -class X86PreAMXConfig { - using PosAndShapesMap = MapVector<Instruction *, SmallVector<Value *, 8>>; - - Function &F; - -public: - X86PreAMXConfig(Function &Func) : F(Func) {} - bool preTileConfig(); - void addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes); - bool findConfigShapes(PosAndShapesMap &PosAndShapes); - bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector<Value *, 8> &Shapes); - void preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder, - SmallVector<Value *, 8> &Shapes); - BasicBlock::iterator - getShapesAndConfigPosEnd(BasicBlock::iterator Iter, - SmallVector<Value *, 8> &Shapes); - bool checkVolatileModel(SmallSet<Value *, 4> &Loads, IntrinsicInst *Store, - IntrinsicInst *KeyAMX); -}; - -// Orderly write the shapes in tilecfg's mem. This maybe not right. -// Because the first shape may not corresponding to the first tmm register, -// so we need to handle at at X86FastTileConfig::materializeTileCfg() -// after register allocation. -// For example: -// -------------------------------------------------------------------------- -// zeroinitialize tilecfg's mem (of ldtilecfg) -// -------------------------------------------------------------------------- -// ... pre-config shape of %t1 * -// %amx.tmm.0.shape.row = getelementptr i8, i8* %mem, i64 48 * -// %amx.tmm.0.shape.col = getelementptr i16, i16* %mem, i64 16 * -// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * -// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config -// ... * -// ... pre-config shape of %t2 * -// %amx.tmm.1.shape.row = getelementptr i8, i8* %mem, i64 49 * -// %amx.tmm.1.shape.col = getelementptr i16, i16* %mem, i64 18 * -// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes -// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * -// ... * -// ... pre-config shape of %t3 * of -// %amx.tmm.2.shape.row = getelementptr i8, i8* %mem, i64 50 * -// %amx.tmm.2.shape.col = getelementptr i16, i16* %mem, i64 20 * -// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 * -// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 * -// ... * tiles -// ... pre-config shape of %td * -// %amx.tmm.3.shape.row = getelementptr i8, i8* %mem, i64 51 * -// %amx.tmm.3.shape.col = getelementptr i16, i16* %mem, i64 22 * -// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 * -// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 * -// -------------------------------------------------------------------------- -// call void @llvm.x86.ldtilecfg(i8* %mem) * tile config -// -------------------------------------------------------------------------- -// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key -// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) -// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx -// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) -// call void @llvm.x86.tilestored64.internal(... td) area -// -------------------------------------------------------------------------- -void X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder, - SmallVector<Value *, 8> &Shapes) { - LLVMContext &Ctx = Builder.getContext(); - Type *I8Ty = Type::getInt8Ty(Ctx); - Type *I16Ty = Type::getInt16Ty(Ctx); - - // TODO: Currently we defaultly set Palette = 1, it may be assigned to - // other value in the future. - Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0); - Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1); - Value *PalettePos = Builder.CreateGEP(I8Ty, I8Ptr, PaletteOffset); - Builder.CreateStore(PaletteValue, PalettePos); - - for (int I = 0, E = Shapes.size() / 2; I < E; I++) { - Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I); - Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2); - const std::string ShapeName = "amx.tmm." + itostr(I); - Value *RowPos = Builder.CreateGEP(I8Ty, I8Ptr, RowOffset, - ShapeName + ".shape.row"); - Value *ColPos = Builder.CreateGEP(I8Ty, I8Ptr, ColOffset); - ColPos = Builder.CreateBitCast(ColPos, PointerType::get(I16Ty, 0), - ShapeName + ".shape.col"); - Value *Row = Shapes[I * 2]; - Value *Col = Shapes[I * 2 + 1]; - Row = Builder.CreateTrunc(Row, I8Ty); - Builder.CreateStore(Row, RowPos); - Builder.CreateStore(Col, ColPos); - } -} - -void X86PreAMXConfig::addTileConfig(Instruction *ModelStart, - SmallVector<Value *, 8> &Shapes) { - Module *M = F.getParent(); - IRBuilder<> Builder(ModelStart); - const DataLayout &DL = M->getDataLayout(); - unsigned AddrSpace = DL.getAllocaAddrSpace(); - LLVMContext &Ctx = Builder.getContext(); - Type *V512Ty = VectorType::get(Builder.getInt32Ty(), 16, false); - Align Alignment = DL.getPrefTypeAlign(Type::getInt32Ty(Ctx)); - - AllocaInst *Addr = - new AllocaInst(V512Ty, AddrSpace, "", &F.getEntryBlock().front()); - Addr->setAlignment(Alignment); - Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy()); - - Builder.CreateAlignedStore(Constant::getNullValue(V512Ty), Addr, Alignment); - - preWriteTileCfg(I8Ptr, Builder, Shapes); - - Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, std::nullopt, - {I8Ptr}); -} - -// Todo: We may need to handle "more than one store" case in the future. -bool X86PreAMXConfig::checkVolatileModel(SmallSet<Value *, 4> &Loads, - IntrinsicInst *Store, - IntrinsicInst *KeyAMX) { - Value *ST = Store->getOperand(4); - - // Only has tileload and tilestore. - if (!KeyAMX) - return (Loads.size() == 1) && Loads.contains(ST); - - // All Loads should be operands of KeyAMX. - // All tile operands of KeyAMX should come from Loads. - for (Value *Op : KeyAMX->operands()) { - if (Op->getType()->isX86_AMXTy()) - if (!Loads.erase(Op)) - return false; - } - - // The def of KeyAMX should be stored into mem. - // Todo: is it key amx can be no def? - return Loads.empty() && (ST == cast<Value>(KeyAMX)); -} - -bool X86PreAMXConfig::getKeyAMXShapes(IntrinsicInst *KeyAMX, - SmallVector<Value *, 8> &Shapes) { - for (unsigned I = 0; I < KeyAMX->getNumOperands(); I++) { - Value *Op = KeyAMX->getOperand(I); - if (!Op->getType()->isX86_AMXTy()) - continue; - IntrinsicInst *TileDef = dyn_cast<IntrinsicInst>(Op); - assert((TileDef && isTileLoad(TileDef)) && - "All KeyAMX's tile definiation should comes from TileLoad!"); - Shapes.push_back(TileDef->getOperand(0)); - Shapes.push_back(TileDef->getOperand(1)); - } - if (!isTileStore(KeyAMX)) { - Shapes.push_back(KeyAMX->getOperand(0)); - Shapes.push_back(KeyAMX->getOperand(1)); - } - return Shapes.size() != 0; -} - -// Collect the shapes and skip the area of current key amx intrinsic. -// -// For example: -// ... -// -------------------------------------------------------------------------- -// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) record (m,k) -// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) record (m,k) -// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) record (m,k) -// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) -// call void @llvm.x86.tilestored64.internal(m, n,... td) <--PosEnd record (m,k) -// -------------------------------------------------------------------------- -BasicBlock::iterator -X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter, - SmallVector<Value *, 8> &Shapes) { - IntrinsicInst *KeyAMX = nullptr; - BasicBlock *BB = Iter->getParent(); - BasicBlock::iterator PosEnd = BB->end(); - SmallSet<Value *, 4> Loads; - - // See TileStore as "Config Position End" and check volatile model. - for (auto I = Iter, E = BB->end(); I != E; ++I) { - assert(!brokenVolatile(&*I) && "Not reach tile store!"); - IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I); - if (!II || !isAMXIntrinsic(II)) - continue; - - if (isTileLoad(II)) { - Loads.insert(II); - } else if (isTileStore(II)) { - if (!checkVolatileModel(Loads, II, KeyAMX)) - report_fatal_error("Not Volatile AMX Model!"); - PosEnd = I; - break; - } else { - assert(!KeyAMX && "Too many key amx intrinsic!"); - KeyAMX = II; - } - } - assert(PosEnd != BB->end() && "Not find TileStore!"); - - // See KeyAMX as TileStore if only TileLoad and TileStore. - if (!KeyAMX) - KeyAMX = dyn_cast<IntrinsicInst>(&*PosEnd); - - // Get Shapes in order. - assert(Shapes.empty() && "Shapes should be clean."); - getKeyAMXShapes(KeyAMX, Shapes); - - return PosEnd; -} - -// Record a key amx area's shapes with its position. -// Use the first tileload as its position. -// For example: -// ... -// -------------------------------------------------------------------------- -// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) <-- pos -// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) / -// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) shapes: -// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n) -// call void @llvm.x86.tilestored64.internal(m, n,... td) (m,n)(m,n) -// -------------------------------------------------------------------------- -bool X86PreAMXConfig::findConfigShapes(PosAndShapesMap &PosAndShapes) { - bool Find = false; - for (BasicBlock &BB : F) { - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I); - if (!II) - continue; - if (!isAMXIntrinsic(II)) - continue; - assert(onlyTileDef(II) && "Not volatile model for AMX at O0!"); - - I = getShapesAndConfigPosEnd(I, PosAndShapes[&*I]); - Find = true; - } - } - return Find; -} - -// Insert ldtilecfg and preconfig the shapes for each area of key AMX intrinsic. -// e.g. (key amx = tdpbssd) -// -------------------------------------------------------------------------- -// %cfgmem = alloca <16 x i32>, align 4 * allocate mem -// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init -// ... -// ... pre-config shape of %t1 * -// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * -// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config -// ... * -// ... pre-config shape of %t2 * -// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes -// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * -// ... * -// ... pre-config shape of %t3 * of -// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 * -// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 * -// ... * tiles -// ... pre-config shape of %td * -// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 * -// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 * -// -// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * pre-config -// -------------------------------------------------------------------------- -// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key -// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) -// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx -// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) -// call void @llvm.x86.tilestored64.internal(... td) area -// -------------------------------------------------------------------------- -bool X86PreAMXConfig::preTileConfig() { - PosAndShapesMap PosAndShapes; - bool NeedCfg = findConfigShapes(PosAndShapes); - if (!NeedCfg) - return false; - for (auto &IPAndShapes : PosAndShapes) - addTileConfig(IPAndShapes.first, IPAndShapes.second); - - return true; -} -} // anonymous namespace - -namespace { - -class X86PreAMXConfigPass : public FunctionPass { -public: - static char ID; - - X86PreAMXConfigPass() : FunctionPass(ID) { - initializeX86PreAMXConfigPassPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - bool C = false; - - // Prepare for fast register allocation at O0. - if (TM->getOptLevel() == CodeGenOptLevel::None) { - - // We pre-config each key AMX intrinsic at O0. - // In theory, one tile config can cover several AMX intrinsics, but - // it is very diffcult to classify the tile shapes at O0. So here we - // let thing be easy, pre-config every key AMX intrinsic. - X86PreAMXConfig PCFG(F); - C = PCFG.preTileConfig(); - } - - return C; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<TargetPassConfig>(); - } -}; - -} // anonymous namespace - -static const char PassName[] = "Pre AMX Tile Config"; -char X86PreAMXConfigPass::ID = 0; -INITIALIZE_PASS_BEGIN(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, fals... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/69569 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits