diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp new file mode 100644 index 0000000000000..56e275ce707b6 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -0,0 +1,744 @@ +//===- AMDGPUSplitModule.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Implements a module splitting algorithm designed to support the +/// FullLTO --lto-partitions option for parallel codegen. This is completely +/// different from the common SplitModule pass, as this system is designed with +/// AMDGPU in mind. +/// +/// The basic idea of this module splitting implementation is the same as +/// SplitModule: load-balance the module's functions across a set of N +/// partitions to allow parallel codegen. However, it does it very +/// differently than the target-agnostic variant: +/// - Kernels are used as the module's "roots". +/// They're known entry points on AMDGPU, and everything else is often +/// internal only. +/// - Each kernel has a set of dependencies, and when a kernel and its +/// dependencies is considered "big", we try to put it in a partition where +/// most dependencies are already imported, to avoid duplicating large +/// amounts of code. +/// - There's special care for indirect calls in order to ensure +/// AMDGPUResourceUsageAnalysis can work correctly. +/// +/// This file also includes a more elaborate logging system to enable +/// users to easily generate logs that (if desired) do not include any value +/// names, in order to not leak information about the source file. +/// Such logs are very helpful to understand and fix potential issues with +/// module splitting. 
+
+#include "AMDGPUSplitModule.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/SHA256.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-split-module"
+
+namespace {
+
+static cl::opt<float> LargeKernelFactor(
+    "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
+    cl::Hidden,
+    cl::desc(
+        "consider a kernel as large and needing special treatment when it "
+        "exceeds the average cost of a partition by this factor; e;g.
2.0 "
+        "means if the kernel and its dependencies is 2 times bigger than "
+        "an average partition; 0 disables large kernels handling entirely"));
+
+static cl::opt<float> LargeKernelOverlapForMerge(
+    "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
+    cl::Hidden,
+    cl::desc("defines how much overlap between two large kernel's dependencies "
+             "is needed to put them in the same partition"));
+
+static cl::opt<bool> NoExternalizeGlobals(
+    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
+    cl::desc("disables externalization of global variable with local linkage; "
+             "may cause globals to be duplicated which increases binary size"));
+
+static cl::opt<std::string>
+    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
+              cl::desc("output directory for AMDGPU module splitting logs"));
+
+static cl::opt<bool>
+    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
+               cl::desc("hash value names before printing them in the AMDGPU "
+                        "module splitting logs"));
+
+using CostType = InstructionCost::CostType;
+using PartitionID = unsigned;
+
+static bool isEntryPoint(const Function *F) {
+  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
+}
+
+static std::string getName(const Value &V) {
+  static bool HideNames;
+
+  static llvm::once_flag HideNameInitFlag;
+  llvm::call_once(HideNameInitFlag, [&]() {
+    if (LogPrivate.getNumOccurrences())
+      HideNames = LogPrivate;
+    else {
+      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
+      HideNames = (EV.value_or("0") != "0");
+    }
+  });
+
+  if (!HideNames)
+    return V.getName().str();
+  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
+               /*LowerCase=*/true);
+}
+
+/// Main logging helper.
+///
+/// Logging can be configured by the following environment variable.
+///   AMD_SPLIT_MODULE_LOG_DIR=<dir>
+///     If set, uses <dir> as the directory to write logfiles to
+///     each time module splitting is used.
+///   AMD_SPLIT_MODULE_LOG_PRIVATE
+///     If set to anything other than zero, all names are hidden.
+/// +/// Both environment variables have corresponding CL options which +/// takes priority over them. +/// +/// Any output printed to the log files is also printed to dbgs() when -debug is +/// used and LLVM_DEBUG is defined. +/// +/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic +/// cannot be removed from the code (by building without debug). This probably +/// has a small performance cost because if some computation/formatting is +/// needed for logging purpose, it may be done everytime only to be ignored +/// by the logger. +/// +/// As this pass only runs once and is not doing anything computationally +/// expensive, this is likely a reasonable trade-off. +/// +/// If some computation should really be avoided when unused, users of the class +/// can check whether any logging will occur by using the bool operator. +/// +/// \code +/// if (SML) { +/// // Executes only if logging to a file or if -debug is available and +/// used. +/// } +/// \endcode +class SplitModuleLogger { +public: + SplitModuleLogger(const Module &M) { + std::string LogDir = LogDirOpt; + if (LogDir.empty()) + LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or(""); + + // No log dir specified means we don't need to log to a file. + // We may still log to dbgs(), though. + if (LogDir.empty()) + return; + + // If a log directory is specified, create a new file with a unique name in + // that directory. 
+    int Fd;
+    SmallString<0> PathTemplate;
+    SmallString<0> RealPath;
+    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
+    if (auto Err =
+            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
+      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
+                             "': " + Err.message(),
+                         /*CrashDiag=*/false);
+    }
+
+    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
+  }
+
+  bool hasLogFile() const { return FileOS != nullptr; }
+
+  raw_ostream &logfile() {
+    assert(FileOS && "no logfile!");
+    return *FileOS;
+  }
+
+  /// \returns true if this SML will log anything either to a file or dbgs().
+  /// Can be used to avoid expensive computations that are ignored when logging
+  /// is disabled.
+  operator bool() const {
+    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
+  }
+
+private:
+  std::unique_ptr<raw_fd_ostream> FileOS;
+};
+
+template <typename Ty>
+static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
+  static_assert(
+      !std::is_same_v<Ty, Value>,
+      "do not print values to logs directly, use handleName instead!");
+  LLVM_DEBUG(dbgs() << Val);
+  if (SML.hasLogFile())
+    SML.logfile() << Val;
+  return SML;
+}
+
+/// Calculate the cost of each function in \p M
+/// \param SML Log Helper
+/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
+/// \param M Module to analyze.
+/// \param CostMap[out] Resulting Function -> Cost map.
+/// \return The module's total cost.
+static CostType
+calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
+                       Module &M,
+                       DenseMap<const Function *, CostType> &CostMap) {
+  CostType ModuleCost = 0;
+  CostType KernelCost = 0;
+
+  for (auto &Fn : M) {
+    if (Fn.isDeclaration())
+      continue;
+
+    CostType FnCost = 0;
+    TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);
+
+    for (const auto &BB : Fn) {
+      for (const auto &I : BB) {
+        auto Cost =
+            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+        assert(Cost != InstructionCost::getMax());
+        // Assume expensive if we can't tell the cost of an instruction.
+        CostType CostVal =
+            Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
+        assert((FnCost + CostVal) >= FnCost && "Overflow!");
+        FnCost += CostVal;
+      }
+    }
+
+    assert(FnCost != 0);
+
+    CostMap[&Fn] = FnCost;
+    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
+    ModuleCost += FnCost;
+
+    if (isEntryPoint(&Fn))
+      KernelCost += FnCost;
+  }
+
+  CostType FnCost = (ModuleCost - KernelCost);
+  SML << "=> Total Module Cost: " << ModuleCost << '\n'
+      << " => KernelCost: " << KernelCost << " ("
+      << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n"
+      << " => FnsCost: " << FnCost << " ("
+      << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n";
+
+  return ModuleCost;
+}
+
+static bool canBeIndirectlyCalled(const Function &F) {
+  if (F.isDeclaration() || isEntryPoint(&F))
+    return false;
+  return !F.hasLocalLinkage() ||
+         F.hasAddressTaken(/*PutOffender=*/nullptr,
+                           /*IgnoreCallbackUses=*/false,
+                           /*IgnoreAssumeLikeCalls=*/true,
+                           /*IgnoreLLVMUsed=*/true,
+                           /*IgnoreARCAttachedCall=*/false,
+                           /*IgnoreCastedDirectCall=*/true);
+}
+
+/// When a kernel or any of its callees performs an indirect call, this function
+/// takes over \ref addAllDependencies and adds all potentially callable
+/// functions to \p Fns so they can be counted as dependencies of the kernel.
+///
+/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
+/// presence of an indirect call, the function's resource usage is the same as
+/// the most expensive function in the module.
+/// \param M The module.
+/// \param Fns[out] Resulting list of functions.
+static void addAllIndirectCallDependencies(const Module &M,
+                                           DenseSet<const Function *> &Fns) {
+  for (const auto &Fn : M) {
+    if (canBeIndirectlyCalled(Fn))
+      Fns.insert(&Fn);
+  }
+}
+
+/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
+/// callee until all reachable functions have been gathered.
+///
+/// \param SML Log Helper
+/// \param CG Call graph for \p Fn's module.
+/// \param Fn Current function to look at.
+/// \param Fns[out] Resulting list of functions.
+/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
+/// point, either in \p Fn or in one of the function it calls. When that
+/// happens, we fall back to adding all callable functions inside \p Fn's module
+/// to \p Fns.
+static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
+                               const Function &Fn,
+                               DenseSet<const Function *> &Fns,
+                               bool &HadIndirectCall) {
+  assert(!Fn.isDeclaration());
+
+  const Module &M = *Fn.getParent();
+  SmallVector<const Function *> WorkList({&Fn});
+  while (!WorkList.empty()) {
+    const auto &CurFn = *WorkList.pop_back_val();
+    assert(!CurFn.isDeclaration());
+
+    // Scan for an indirect call. If such a call is found, we have to
+    // conservatively assume this can call all non-entrypoint functions in the
+    // module.
+
+    for (auto &CGEntry : *CG[&CurFn]) {
+      auto *CGNode = CGEntry.second;
+      auto *Callee = CGNode->getFunction();
+      if (!Callee) {
+        // Functions have an edge towards CallsExternalNode if they're external
+        // declarations, or if they do an indirect call. As we only process
+        // definitions here, we know this means the function has an indirect
+        // call. We then have to conservatively assume this can call all
+        // non-entrypoint functions in the module.
+        if (CGNode != CG.getCallsExternalNode())
+          continue; // this is another function-less node we don't care about.
+
+        SML << "Indirect call detected in " << getName(CurFn)
+            << " - treating all non-entrypoint functions as "
+               "potential dependencies\n";
+
+        // TODO: Print an ORE as well ?
+        addAllIndirectCallDependencies(M, Fns);
+        HadIndirectCall = true;
+        return;
+      }
+
+      if (Callee->isDeclaration())
+        continue;
+
+      auto [It, Inserted] = Fns.insert(Callee);
+      if (Inserted)
+        WorkList.push_back(Callee);
+    }
+  }
+}
+
+/// Contains information about a kernel and its dependencies.
+struct KernelWithDependencies {
+  KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
+                         const DenseMap<const Function *, CostType> &FnCosts,
+                         const Function *Fn)
+      : Fn(Fn) {
+    addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
+    TotalCost = FnCosts.at(Fn);
+    for (const auto *Dep : Dependencies) {
+      TotalCost += FnCosts.at(Dep);
+
+      // We cannot duplicate functions with external linkage, or functions that
+      // may be overriden at runtime.
+      HasNonDuplicatableDependecy |=
+          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
+    }
+  }
+
+  const Function *Fn = nullptr;
+  DenseSet<const Function *> Dependencies;
+  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
+  bool HasIndirectCall = false;
+  /// Whether any of \p Fn's dependencies cannot be duplicated.
+  bool HasNonDuplicatableDependecy = false;
+
+  CostType TotalCost = 0;
+
+  /// \returns true if this kernel and its dependencies can be considered large
+  /// according to \p Threshold.
+  bool isLarge(CostType Threshold) const {
+    return TotalCost > Threshold && !Dependencies.empty();
+  }
+};
+
+/// Calculates how much overlap there is between \p A and \p B.
+/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
+/// and B have no shared elements. Kernels do not count in overlap calculation.
+static float calculateOverlap(const DenseSet<const Function *> &A,
+                              const DenseSet<const Function *> &B) {
+  DenseSet<const Function *> Total;
+  for (const auto *F : A) {
+    if (!isEntryPoint(F))
+      Total.insert(F);
+  }
+
+  if (Total.empty())
+    return 0.0f;
+
+  unsigned NumCommon = 0;
+  for (const auto *F : B) {
+    if (isEntryPoint(F))
+      continue;
+
+    auto [It, Inserted] = Total.insert(F);
+    if (!Inserted)
+      ++NumCommon;
+  }
+
+  return static_cast<float>(NumCommon) / Total.size();
+}
+
+/// Performs all of the partitioning work on \p M.
+/// \param SML Log Helper
+/// \param M Module to partition.
+/// \param NumParts Number of partitions to create.
+/// \param ModuleCost Total cost of all functions in \p M.
+/// \param FnCosts Map of Function -> Cost
+/// \param WorkList Kernels and their dependencies to process in order.
+/// \returns The created partitions (a vector of size \p NumParts )
+static std::vector<DenseSet<const Function *>>
+doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
+               CostType ModuleCost,
+               const DenseMap<const Function *, CostType> &FnCosts,
+               const SmallVector<KernelWithDependencies> &WorkList) {
+
+  SML << "\n--Partitioning Starts--\n";
+
+  // Calculate a "large kernel threshold". When more than one kernel's total
+  // import cost exceeds this value, we will try to merge it with other,
+  // similarly large kernels.
+  //
+  // e.g. let two kernels X and Y have a import cost of ~10% of the module, we
+  // assign X to a partition as usual, but when we get to Y, we check if it's
+  // worth also putting it in Y's partition.
+  const CostType LargeKernelThreshold =
+      LargeKernelFactor ? ((ModuleCost / NumParts) * LargeKernelFactor)
+                        : std::numeric_limits<CostType>::max();
+
+  std::vector<DenseSet<const Function *>> Partitions;
+  Partitions.resize(NumParts);
+
+  // Assign a partition to each kernel, and try to keep the partitions more or
+  // less balanced. We do that through a priority queue sorted in reverse, so we
+  // can always look at the partition with the least content.
+  //
+  // There are some cases where we will be deliberately unbalanced though.
+  //  - Large kernels: we try to merge with existing partitions to reduce code
+  //    duplication.
+  //  - Kernels with indirect or external calls always go in the first partition
+  //    (P0).
+  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
+                              const std::pair<PartitionID, CostType> &b) {
+    // When two partitions have the same cost, assign to the one with the
+    // biggest ID first. This allows us to put things in P0 last, because P0 may
+    // have other stuff added later.
+    if (a.second == b.second)
+      return a.first < b.first;
+    return a.second > b.second;
+  };
+
+  // We can't use priority_queue here because we need to be able to access any
+  // element. This makes this a bit inefficient as we need to sort it again
+  // everytime we change it, but it's a very small array anyway (likely under 64
+  // partitions) so it's a cheap operation.
+  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
+  for (unsigned I = 0; I < NumParts; ++I)
+    BalancingQueue.push_back(std::make_pair(I, 0));
+
+  // Helper function to handle assigning a kernel to a partition. This takes
+  // care of updating the balancing queue.
+  const auto AssignToPartition = [&](PartitionID PID,
+                                     const KernelWithDependencies &KWD) {
+    auto &FnsInPart = Partitions[PID];
+    FnsInPart.insert(KWD.Fn);
+    FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());
+
+    SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> ";
+    if (!KWD.Dependencies.empty()) {
+      SML << KWD.Dependencies.size() << " dependencies added\n";
+    };
+
+    // Update the balancing queue. we scan backwards because in the common case
+    // the partition is at the end.
+ for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) { + if (QueuePID == PID) { + CostType NewCost = 0; + for (auto *Fn : Partitions[PID]) + NewCost += FnCosts.at(Fn); + + SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost; + if (Cost) { + SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100) + << "% increase)"; + } + SML << '\n'; + + Cost = NewCost; + } + } + + sort(BalancingQueue, ComparePartitions); + }; + + for (auto &CurKernel : WorkList) { + // When a kernel has indirect calls, it must stay in the first partition + // alongside every reachable non-entry function. This is a nightmare case + // for splitting as it severely limits what we can do. + if (CurKernel.HasIndirectCall) { + SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn) + << " defaulting to P0\n"; + AssignToPartition(0, CurKernel); + continue; + } + + // When a kernel has non duplicatable dependencies, we have to keep it in + // the first partition as well. This is a conservative approach, a + // finer-grained approach could keep track of which dependencies are + // non-duplicatable exactly and just make sure they're grouped together. + if (CurKernel.HasNonDuplicatableDependecy) { + SML << "Kernel with externally visible dependency " + << getName(*CurKernel.Fn) << " defaulting to P0\n"; + AssignToPartition(0, CurKernel); + continue; + } + + // Be smart with large kernels to avoid duplicating their dependencies. 
+    if (CurKernel.isLarge(LargeKernelThreshold)) {
+      assert(LargeKernelOverlapForMerge >= 0.0f &&
+             LargeKernelOverlapForMerge <= 1.0f);
+      SML << "Large Kernel: " << getName(*CurKernel.Fn)
+          << " - looking for partition with at least "
+          << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";
+
+      bool Assigned = false;
+      for (const auto &[PID, Fns] : enumerate(Partitions)) {
+        float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
+        SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P"
+            << PID << '\n';
+        if (Overlap > LargeKernelOverlapForMerge) {
+          SML << " selecting P" << PID << '\n';
+          AssignToPartition(PID, CurKernel);
+          Assigned = true;
+        }
+      }
+
+      if (Assigned)
+        continue;
+    }
+
+    // Normal "load-balancing", assign to partition with least pressure.
+    auto [PID, CurCost] = BalancingQueue.back();
+    AssignToPartition(PID, CurKernel);
+  }
+
+  // Work is mostly done now, verify the partioning and add all functions we may
+  // have missed (= unreachable, or we don't understand how they're reached) to
+  // P0.
+  DenseSet<const Function *> AllFunctions;
+  for (const auto &[Idx, Part] : enumerate(Partitions)) {
+    CostType Cost = 0;
+    for (auto *Fn : Part) {
+      // external linkage functions should exclusively be in the first partition
+      // at this stage. In theory, we should only ever see external linkage
+      // functions here if they're kernels, or if they've been added due to a
+      // kernel using indirect calls somewhere in its CallGraph.
+      assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
+      Cost += FnCosts.at(Fn);
+    }
+    SML << "P" << Idx << " has a total cost of " << Cost << " ("
+        << format("%0.2f", (float(Cost) / ModuleCost) * 100)
+        << "% of source module)\n";
+    AllFunctions.insert(Part.begin(), Part.end());
+  }
+
+  // Add missed functions to P0. This will take care of adding things like
+  // external functions with no callers in the module to P0.
This should be
+  // fairly rare as AMDGPU internalizes everything in most cases, so unused
+  // internal functions would get removed.
+  for (auto &Fn : M) {
+    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
+      SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
+      Partitions[0].insert(&Fn);
+    }
+  }
+
+  SML << "--Partitioning Done--\n\n";
+
+  return Partitions;
+}
+
+static void externalize(GlobalValue &GV) {
+  if (GV.hasLocalLinkage()) {
+    GV.setLinkage(GlobalValue::ExternalLinkage);
+    GV.setVisibility(GlobalValue::HiddenVisibility);
+  }
+
+  // Unnamed entities must be named consistently between modules. setName will
+  // give a distinct name to each such entity.
+  if (!GV.hasName())
+    GV.setName("__llvmsplit_unnamed");
+}
+} // end anonymous namespace
+
+void llvm::splitAMDGPUModule(
+    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+
+  SplitModuleLogger SML(M);
+
+  CallGraph CG(M);
+
+  // Externalize functions whose address are taken.
+  //
+  // This is needed because partitioning is purely based on calls, but sometimes
+  // a kernel/function may just look at the address of another local function
+  // and not do anything (no calls). After partitioning, that local function may
+  // end up in a different module (so it's just a declaration in the module
+  // where its address is taken), which emits a "undefined hidden symbol" linker
+  // error.
+  //
+  // Additionally, it guides partitioning to not duplicate this function if it's
+  // called directly at some point.
+  for (auto &Fn : M) {
+    if (Fn.hasAddressTaken()) {
+      if (Fn.hasLocalLinkage()) {
+        SML << "[externalize] " << Fn.getName()
+            << " because its address is taken\n";
+      }
+      externalize(Fn);
+    }
+  }
+
+  // Externalize local GVs, which avoids duplicating their initializers, which
+  // in turns helps keep code size in check.
+  if (!NoExternalizeGlobals) {
+    for (auto &GV : M.globals()) {
+      if (GV.hasLocalLinkage())
+        SML << "[externalize] GV " << GV.getName() << '\n';
+      externalize(GV);
+    }
+  }
+
+  // Start by calculating the cost of every function in the module, as well as
+  // the module's overall cost.
+  DenseMap<const Function *, CostType> FnCosts;
+  const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);
+
+  // Gather every kernel into a WorkList, then sort it by descending total cost
+  // of the kernel so the biggest kernels are seen first.
+  SmallVector<KernelWithDependencies> WorkList;
+  for (auto &Fn : M) {
+    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
+      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+  }
+  sort(WorkList, [&](auto &A, auto &B) {
+    // Sort by total cost, and if the total cost is identical, sort
+    // alphabetically.
+    if (A.TotalCost == B.TotalCost)
+      return A.Fn->getName() < B.Fn->getName();
+    return A.TotalCost > B.TotalCost;
+  });
+
+  if (SML) {
+    SML << "Worklist\n";
+    for (const auto &KWD : WorkList) {
+      SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
+          << " indirect:" << KWD.HasIndirectCall
+          << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
+          << ")\n";
+      for (const auto *Dep : KWD.Dependencies)
+        SML << " [Dep] " << getName(*Dep) << '\n';
+    }
+  }
+
+  // This performs all of the partitioning work.
+  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
+  assert(Partitions.size() == N);
+
+  // If we didn't externalize GVs, then local GVs need to be conservatively
+  // imported into every module (including their initializers), and then cleaned
+  // up afterwards.
+  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
+    // We conservatively import private/internal GVs into every module and clean
+    // them up afterwards.
+    const auto *Var = dyn_cast<GlobalVariable>(GV);
+    return Var && Var->hasLocalLinkage();
+  };
+
+  SML << "Creating " << N << " modules...\n";
+  unsigned TotalFnImpls = 0;
+  for (unsigned I = 0; I < N; ++I) {
+    const auto &FnsInPart = Partitions[I];
+
+    ValueToValueMapTy VMap;
+    std::unique_ptr<Module> MPart(
+        CloneModule(M, VMap, [&](const GlobalValue *GV) {
+          // Functions go in their assigned partition.
+          if (const auto *Fn = dyn_cast<Function>(GV)) {
+// Check we don't import an external linkage function in any
+// partition other than P0.
+#ifndef NDEBUG
+            if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
+              assert((I == 0) == FnsInPart.contains(Fn));
+            }
+#endif
+            return FnsInPart.contains(Fn);
+          }
+
+          if (NeedsConservativeImport(GV))
+            return true;
+
+          // Everything else goes in the first partition.
+          return I == 0;
+        }));
+
+    // Clean-up conservatively imported GVs without any users.
+    for (auto &GV : make_early_inc_range(MPart->globals())) {
+      if (NeedsConservativeImport(&GV) && GV.use_empty())
+        GV.eraseFromParent();
+    }
+
+    unsigned NumAllFns = 0, NumKernels = 0;
+    for (auto &Cur : *MPart) {
+      if (!Cur.isDeclaration()) {
+        ++NumAllFns;
+        if (isEntryPoint(&Cur))
+          ++NumKernels;
+      }
+    }
+    TotalFnImpls += NumAllFns;
+    SML << " - Module " << I << " with " << NumAllFns << " functions ("
+        << NumKernels << " kernels)\n";
+    ModuleCallback(std::move(MPart));
+  }
+
+  SML << TotalFnImpls << " function definitions across all modules ("
+      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
+      << "% of original module)\n";
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
new file mode 100644
index 0000000000000..6171643bd4adc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
@@ -0,0 +1,30 @@
+//===- AMDGPUSplitModule.h -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AMDGPUSPLITMODULE_H
+#define LLVM_TARGET_AMDGPUSPLITMODULE_H
+
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include <memory>
+
+namespace llvm {
+
+class Module;
+class AMDGPUTargetMachine;
+
+/// Splits the module M into N linkable partitions. The function ModuleCallback
+/// is called N times passing each individual partition as the MPart argument.
+void splitAMDGPUModule(
+    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_AMDGPUSPLITMODULE_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 305a6c8c3b926..72d77f0a2f706 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -20,6 +20,7 @@
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUMacroFusion.h"
 #include "AMDGPURegBankSelect.h"
+#include "AMDGPUSplitModule.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUUnifyDivergentExitNodes.h"
@@ -806,6 +807,13 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
   return AMDGPUAS::FLAT_ADDRESS;
 }
 
+bool AMDGPUTargetMachine::splitModule(
+    Module &M, unsigned NumParts,
+    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
+  splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // GCN Target Machine (SI+)
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 30ab388c7d52e..5f6233d4378c8 100644
---
a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -67,6 +67,10 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
       getPredicatedAddrSpace(const Value *V) const override;
 
   unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
+
+  bool splitModule(Module &M, unsigned NumParts,
+                   function_ref<void(std::unique_ptr<Module> MPart)>
+                       ModuleCallback) const override;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 48325a0928f93..d991cc5b1b79d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -97,6 +97,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPURewriteOutArguments.cpp
   AMDGPURewriteUndefForPHI.cpp
   AMDGPUSetWavePriority.cpp
+  AMDGPUSplitModule.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
new file mode 100644
index 0000000000000..8b76237efa325
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
@@ -0,0 +1,46 @@
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+
+; 3 kernels:
+;   - A does a direct call to HelperA
+;   - B is storing @HelperA
+;   - C does a direct call to HelperA
+;
+; The helper functions will get externalized, which will force A and C into P0 as
+; external functions cannot be duplicated.
+ +; CHECK0: define hidden void @HelperA() +; CHECK0: define amdgpu_kernel void @A() +; CHECK0: declare amdgpu_kernel void @B(ptr) +; CHECK0: define amdgpu_kernel void @C() + +; CHECK1: declare hidden void @HelperA() +; CHECK1: declare amdgpu_kernel void @A() +; CHECK1: declare amdgpu_kernel void @B(ptr) +; CHECK1: declare amdgpu_kernel void @C() + +; CHECK2: declare hidden void @HelperA() +; CHECK2: declare amdgpu_kernel void @A() +; CHECK2: define amdgpu_kernel void @B(ptr %dst) +; CHECK2: declare amdgpu_kernel void @C() + +define internal void @HelperA() { + ret void +} + +define amdgpu_kernel void @A() { + call void @HelperA() + ret void +} + +define amdgpu_kernel void @B(ptr %dst) { + store ptr @HelperA, ptr %dst + ret void +} + +define amdgpu_kernel void @C() { + call void @HelperA() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll new file mode 100644 index 0000000000000..46d7d9783aeae --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll @@ -0,0 +1,37 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s + +; 2 kernels: +; - A is isolated +; - B is storing @HelperA/B's address +; +; The helper functions should get externalized (become hidden w/ external linkage) + +; CHECK0: define hidden void @HelperA() +; CHECK0: define hidden void @HelperB() +; CHECK0: define amdgpu_kernel void @A() +; CHECK0: declare amdgpu_kernel void @B(i1, ptr) + +; CHECK1: declare hidden void @HelperA() +; CHECK1: declare hidden void @HelperB() +; CHECK1: declare amdgpu_kernel void @A() +; CHECK1: define amdgpu_kernel void @B(i1 %cond, ptr %dst) + +define internal void @HelperA() { + ret void +} + +define internal void @HelperB() { + ret void +} + +define amdgpu_kernel 
void @A() { + ret void +} + +define amdgpu_kernel void @B(i1 %cond, ptr %dst) { + %addr = select i1 %cond, ptr @HelperA, ptr @HelperB + store ptr %addr, ptr %dst + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll new file mode 100644 index 0000000000000..6a07ed51ba1be --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll @@ -0,0 +1,20 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel +; REQUIRES: asserts + +; SHA256 of the kernel names. + +; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c +; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59 +; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55 + +define amdgpu_kernel void @MyCustomKernel0() { + ret void +} + +define amdgpu_kernel void @MyCustomKernel1() { + ret void +} + +define amdgpu_kernel void @MyCustomKernel2() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll new file mode 100644 index 0000000000000..c2746d1398924 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll @@ -0,0 +1,45 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s + +; 3 kernels: +; - A calls nothing +; - B calls @PerryThePlatypus +; - C calls @Perry, an alias of @PerryThePlatypus +; +; We should see through the alias and put B/C in the same +; partition. +; +; Additionally, @PerryThePlatypus gets externalized as +; the alias counts as taking its address. 
+ +; CHECK0-NOT: define +; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus +; CHECK0: define hidden void @PerryThePlatypus() +; CHECK0: define amdgpu_kernel void @B +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define + +@Perry = internal alias ptr(), ptr @PerryThePlatypus + +define internal void @PerryThePlatypus() { + ret void +} + +define amdgpu_kernel void @A() { + ret void +} + +define amdgpu_kernel void @B() { + call void @PerryThePlatypus() + ret void +} + +define amdgpu_kernel void @C() { + call void @Perry() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll new file mode 100644 index 0000000000000..4635264aefb39 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll @@ -0,0 +1,54 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels with each their own dependencies should go into 3 +; distinct partitions. The most expensive kernel should be +; seen first and go into the last partition. 
+ +; CHECK0-NOT: define +; CHECK0: define amdgpu_kernel void @C +; CHECK0: define internal void @HelperC +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @A +; CHECK1: define internal void @HelperA +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @B +; CHECK2: define internal void @HelperB +; CHECK2-NOT: define + + +define amdgpu_kernel void @A() { + call void @HelperA() + ret void +} + +define internal void @HelperA() { + ret void +} + +define amdgpu_kernel void @B(ptr %x) { + store i64 42, ptr %x + store i64 43, ptr %x + store i64 44, ptr %x + call void @HelperB() + ret void +} + +define internal void @HelperB() { + ret void +} + +define amdgpu_kernel void @C() { + call void @HelperC() + ret void +} + +define internal void @HelperC() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll new file mode 100644 index 0000000000000..bea527f15bbaa --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll @@ -0,0 +1,50 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels with each their own dependencies should go into 3 +; distinct partitions. 
+ +; CHECK0-NOT: define +; CHECK0: define amdgpu_kernel void @C +; CHECK0: define internal void @HelperC +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @B +; CHECK1: define internal void @HelperB +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @A +; CHECK2: define internal void @HelperA +; CHECK2-NOT: define + + +define amdgpu_kernel void @A() { + call void @HelperA() + ret void +} + +define internal void @HelperA() { + ret void +} + +define amdgpu_kernel void @B() { + call void @HelperB() + ret void +} + +define internal void @HelperB() { + ret void +} + +define amdgpu_kernel void @C() { + call void @HelperC() + ret void +} + +define internal void @HelperC() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll new file mode 100644 index 0000000000000..64839f8d8456a --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll @@ -0,0 +1,41 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels share a common helper, that helper should be +; cloned in all partitions. 
+

; CHECK0-NOT: define
; CHECK0: define internal void @Helper
; CHECK0: define amdgpu_kernel void @C
; CHECK0-NOT: define

; CHECK1-NOT: define
; CHECK1: define internal void @Helper
; CHECK1: define amdgpu_kernel void @B
; CHECK1-NOT: define

; CHECK2-NOT: define
; CHECK2: define internal void @Helper
; CHECK2: define amdgpu_kernel void @A
; CHECK2-NOT: define

define internal void @Helper() {
  ret void
}

define amdgpu_kernel void @A() {
  call void @Helper()
  ret void
}

define amdgpu_kernel void @B() {
  call void @Helper()
  ret void
}

define amdgpu_kernel void @C() {
  call void @Helper()
  ret void
} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll new file mode 100644 index 0000000000000..435e97a581340 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll @@ -0,0 +1,64 @@ +; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s +; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s + +; Both overridable helpers should go in P0. 
+

; CHECK0-NOT: define
; CHECK0: define available_externally void @OverridableHelper0()
; CHECK0: define internal void @OverridableHelper1()
; CHECK0: define amdgpu_kernel void @A
; CHECK0: define amdgpu_kernel void @B
; CHECK0-NOT: define

; CHECK1-NOT: define

; CHECK2-NOT: define
; CHECK2: define internal void @PrivateHelper1()
; CHECK2: define amdgpu_kernel void @D
; CHECK2-NOT: define

; CHECK3-NOT: define
; CHECK3: define internal void @PrivateHelper0()
; CHECK3: define amdgpu_kernel void @C
; CHECK3-NOT: define

define available_externally void @OverridableHelper0() {
  ret void
}

define internal void @OverridableHelper1() #0 {
  ret void
}

define internal void @PrivateHelper0() {
  ret void
}

define internal void @PrivateHelper1() {
  ret void
}

define amdgpu_kernel void @A() {
  call void @OverridableHelper0()
  ret void
}

define amdgpu_kernel void @B() {
  call void @OverridableHelper1()
  ret void
}

define amdgpu_kernel void @C() {
  call void @PrivateHelper0()
  ret void
}

define amdgpu_kernel void @D() {
  call void @PrivateHelper1()
  ret void
}

attributes #0 = { nobuiltin } diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll new file mode 100644 index 0000000000000..9701ac35ce54e --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll @@ -0,0 +1,76 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; We have 4 kernels: +; - Each kernel has an internal helper +; - @A and @B's helpers do an indirect call. +; +; We default to putting A/B in P0, alongside a copy +; of all helpers who have their address taken. +; The other kernels can still go into separate partitions. 
+ +; CHECK0-NOT: define +; CHECK0: define hidden void @HelperA +; CHECK0: define hidden void @HelperB +; CHECK0: define hidden void @CallCandidate +; CHECK0-NOT: define {{.*}} @HelperC +; CHECK0-NOT: define {{.*}} @HelperD +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define internal void @HelperD +; CHECK1: define amdgpu_kernel void @D +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define internal void @HelperC +; CHECK2: define amdgpu_kernel void @C +; CHECK2-NOT: define + +@addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate] + +define internal void @HelperA(ptr %call) { + call void %call() + ret void +} + +define internal void @HelperB(ptr %call) { + call void %call() + ret void +} + +define internal void @CallCandidate() { + ret void +} + +define internal void @HelperC() { + ret void +} + +define internal void @HelperD() { + ret void +} + +define amdgpu_kernel void @A(ptr %call) { + call void @HelperA(ptr %call) + ret void +} + +define amdgpu_kernel void @B(ptr %call) { + call void @HelperB(ptr %call) + ret void +} + +define amdgpu_kernel void @C() { + call void @HelperC() + ret void +} + +define amdgpu_kernel void @D() { + call void @HelperD() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll new file mode 100644 index 0000000000000..dc2c5c3c07bee --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll @@ -0,0 +1,40 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; CHECK0-NOT: define +; CHECK0: define void @ExternalHelper +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define 
amdgpu_kernel void @B +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @D +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @C +; CHECK2-NOT: define + +define void @ExternalHelper() { + ret void +} + +define amdgpu_kernel void @A() { + call void @ExternalHelper() + ret void +} + +define amdgpu_kernel void @B() { + call void @ExternalHelper() + ret void +} + +define amdgpu_kernel void @C() { + ret void +} + +define amdgpu_kernel void @D() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll new file mode 100644 index 0000000000000..0fc76934afc54 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll @@ -0,0 +1,42 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-globals +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels use private/internal global variables. +; The GVs should be copied in each partition as needed. 
+ +; CHECK0-NOT: define +; CHECK0: @bar = internal constant ptr +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: @foo = private constant ptr +; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: @foo = private constant ptr +; CHECK2: @bar = internal constant ptr +; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define + +@foo = private constant ptr poison +@bar = internal constant ptr poison + +define amdgpu_kernel void @A() { + store i32 42, ptr @foo + ret void +} + +define amdgpu_kernel void @B() { + store i32 42, ptr @foo + store i32 42, ptr @bar + ret void +} + +define amdgpu_kernel void @C() { + store i32 42, ptr @bar + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll new file mode 100644 index 0000000000000..7564662e7c7c0 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll @@ -0,0 +1,44 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels use private/internal global variables. +; The GVs should be copied in each partition as needed. 
+ +; CHECK0-NOT: define +; CHECK0: @foo = hidden constant ptr poison +; CHECK0: @bar = hidden constant ptr poison +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: @foo = external hidden constant ptr{{$}} +; CHECK1: @bar = external hidden constant ptr{{$}} +; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: @foo = external hidden constant ptr{{$}} +; CHECK2: @bar = external hidden constant ptr{{$}} +; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define + +@foo = private constant ptr poison +@bar = internal constant ptr poison + +define amdgpu_kernel void @A() { + store i32 42, ptr @foo + ret void +} + +define amdgpu_kernel void @B() { + store i32 42, ptr @foo + store i32 42, ptr @bar + ret void +} + +define amdgpu_kernel void @C() { + store i32 42, ptr @bar + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll new file mode 100644 index 0000000000000..5dfb95c5fc660 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll @@ -0,0 +1,75 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; Test load balancing logic with 6 kernels. 
+
;
; Kernels go from most expensive (A == 6) to least expensive (F == 1)
;
; Load balancing should work like this (current partition cost is in parens)
;
; Initial -> [P0(0), P1(0), P2(0)]
;
; A(6) goes in 2 -> [P2(6), P0(0), P1(0)]
; B(5) goes in 1 -> [P2(6), P1(5), P0(0)]
; C(4) goes in 0 -> [P2(6), P1(5), P0(4)]

; D(3) goes in 0 -> [P0(7), P2(6), P1(5)]
; E(2) goes in 1 -> [P0(7), P1(7), P2(6)]
; F(1) goes in 2 -> [P0(7), P1(7), P2(7)]

; CHECK0-NOT: define
; CHECK0: define amdgpu_kernel void @C
; CHECK0: define amdgpu_kernel void @D
; CHECK0-NOT: define

; CHECK1-NOT: define
; CHECK1: define amdgpu_kernel void @B
; CHECK1: define amdgpu_kernel void @E
; CHECK1-NOT: define

; CHECK2-NOT: define
; CHECK2: define amdgpu_kernel void @A
; CHECK2: define amdgpu_kernel void @F
; CHECK2-NOT: define


define amdgpu_kernel void @A(ptr %x) {
  store i64 42, ptr %x
  store i64 43, ptr %x
  store i64 44, ptr %x
  store i64 45, ptr %x
  store i64 46, ptr %x
  ret void
}

define amdgpu_kernel void @B(ptr %x) {
  store i64 42, ptr %x
  store i64 43, ptr %x
  store i64 44, ptr %x
  store i64 45, ptr %x
  ret void
}

define amdgpu_kernel void @C(ptr %x) {
  store i64 42, ptr %x
  store i64 43, ptr %x
  store i64 44, ptr %x
  ret void
}

define amdgpu_kernel void @D(ptr %x) {
  store i64 42, ptr %x
  store i64 43, ptr %x
  ret void
}

define amdgpu_kernel void @E(ptr %x) {
  store i64 42, ptr %x
  ret void
}

define amdgpu_kernel void @F() {
  ret void
} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll new file mode 100644 index 0000000000000..8959acfcae542 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll @@ -0,0 +1,39 @@ +; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: 
llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s +; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s + +; Check that 4 independent kernels get put into 4 different partitions. + +; CHECK0-NOT: define +; CHECK0: define amdgpu_kernel void @D +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define + +; CHECK3-NOT: define +; CHECK3: define amdgpu_kernel void @A +; CHECK3-NOT: define + +define amdgpu_kernel void @A() { + ret void +} + +define amdgpu_kernel void @B() { + ret void +} + +define amdgpu_kernel void @C() { + ret void +} + +define amdgpu_kernel void @D() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll new file mode 100644 index 0000000000000..4fdbac7d17897 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll @@ -0,0 +1,98 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=1.2 -amdgpu-module-splitting-large-kernel-merge-overlap=0.5 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0 +; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s +; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s +; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s + +; 2 kernels (A/B) are large and share all their dependencies. +; They should go in the same partition, the remaining kernel should +; go somewhere else, and one partition should be empty. 
+; +; Also check w/o large kernels processing to verify they are indeed handled +; differently. + +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define internal void @HelperC() +; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define internal void @large2() +; CHECK2: define internal void @large1() +; CHECK2: define internal void @large0() +; CHECK2: define internal void @HelperA() +; CHECK2: define internal void @HelperB() +; CHECK2: define amdgpu_kernel void @A +; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define + +; NOLARGEKERNELS-CHECK0-NOT: define +; NOLARGEKERNELS-CHECK0: define internal void @HelperC() +; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C +; NOLARGEKERNELS-CHECK0-NOT: define + +; NOLARGEKERNELS-CHECK1: define internal void @large2() +; NOLARGEKERNELS-CHECK1: define internal void @large1() +; NOLARGEKERNELS-CHECK1: define internal void @large0() +; NOLARGEKERNELS-CHECK1: define internal void @HelperB() +; NOLARGEKERNELS-CHECK1: define amdgpu_kernel void @B + +; NOLARGEKERNELS-CHECK2: define internal void @large2() +; NOLARGEKERNELS-CHECK2: define internal void @large1() +; NOLARGEKERNELS-CHECK2: define internal void @large0() +; NOLARGEKERNELS-CHECK2: define internal void @HelperA() +; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A + +define internal void @large2() { + store volatile i32 42, ptr null + call void @large2() + ret void +} + +define internal void @large1() { + call void @large1() + call void @large2() + ret void +} + +define internal void @large0() { + call void @large0() + call void @large1() + call void @large2() + ret void +} + +define internal void @HelperA() { + call void @large0() + ret void +} + +define internal void @HelperB() { + call void @large0() + ret void +} + +define amdgpu_kernel void @A() { + call void @HelperA() + ret void +} + +define amdgpu_kernel void @B() { + call void @HelperB() + ret void +} + +define internal void @HelperC() { + ret 
void +} + +define amdgpu_kernel void @C() { + call void @HelperC() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg b/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg new file mode 100644 index 0000000000000..6154a6c1c9061 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AMDGPU" in config.root.targets: + config.unsupported = True