diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index 8f0a17cf99967..94913f534fb77 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -44,10 +44,12 @@ #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/WindowScheduler.h" #include "llvm/InitializePasses.h" #include @@ -107,6 +109,9 @@ class MachinePipeliner : public MachineFunctionPass { bool scheduleLoop(MachineLoop &L); bool swingModuloScheduler(MachineLoop &L); void setPragmaPipelineOptions(MachineLoop &L); + bool runWindowScheduler(MachineLoop &L); + bool useSwingModuloScheduler(); + bool useWindowScheduler(bool Changed); }; /// This class builds the dependence graph for the instructions in a loop, @@ -449,7 +454,7 @@ class ResourceManager { const MCSchedModel &SM; const TargetSubtargetInfo *ST; const TargetInstrInfo *TII; - SwingSchedulerDAG *DAG; + ScheduleDAGInstrs *DAG; const bool UseDFA; /// DFA resources for each slot llvm::SmallVector> DFAResources; @@ -493,7 +498,7 @@ class ResourceManager { #endif public: - ResourceManager(const TargetSubtargetInfo *ST, SwingSchedulerDAG *DAG) + ResourceManager(const TargetSubtargetInfo *ST, ScheduleDAGInstrs *DAG) : STI(ST), SM(ST->getSchedModel()), ST(ST), TII(ST->getInstrInfo()), DAG(DAG), UseDFA(ST->useDFAforSMS()), ProcResourceMasks(SM.getNumProcResourceKinds(), 0), diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index 7f8ed5c501989..557249f9dbde1 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ 
b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -199,6 +199,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// True if the subtarget should run MachinePipeliner virtual bool enableMachinePipeliner() const { return true; }; + /// True if the subtarget should run WindowScheduler. + virtual bool enableWindowScheduler() const { return true; } + /// True if the subtarget should enable joining global copies. /// /// By default this is enabled if the machine scheduler is enabled, but diff --git a/llvm/include/llvm/CodeGen/WindowScheduler.h b/llvm/include/llvm/CodeGen/WindowScheduler.h new file mode 100644 index 0000000000000..476d5ada27876 --- /dev/null +++ b/llvm/include/llvm/CodeGen/WindowScheduler.h @@ -0,0 +1,171 @@ +//======----------- WindowScheduler.h - window scheduler ---------------======// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// An implementation of the Window Scheduling software pipelining algorithm. +// +// The concept of the window algorithm was first unveiled in Steven Muchnick's +// book, "Advanced Compiler Design And Implementation", and later elaborated +// upon in Venkatraman Govindaraju's report, "Implementation of Software +// Pipelining Using Window Scheduling". +// +// The window algorithm can be perceived as a modulo scheduling algorithm with a +// stage count of 2. It boasts a higher scheduling success rate in targets with +// severe resource conflicts when compared to the classic Swing Modulo +// Scheduling (SMS) algorithm. To align with the LLVM scheduling framework, we +// have enhanced the original window algorithm. The primary steps are as +// follows: +// +// 1. 
Instead of duplicating the original MBB twice as mentioned in the +// literature, we copy it three times, generating TripleMBB and the +// corresponding TripleDAG. +// +// 2. We establish a scheduling window on TripleMBB and execute list scheduling +// within it. +// +// 3. After multiple list scheduling, we select the best outcome and expand it +// into the final scheduling result. +// +// To cater to the needs of various targets, we have developed the window +// scheduler in a form that is easily derivable. We recommend employing this +// algorithm in targets with severe resource conflicts, and it can be utilized +// either before or after the Register Allocator (RA). +// +// The default implementation provided here is before RA. If it is to be used +// after RA, certain critical algorithm functions will need to be derived. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_CODEGEN_WINDOWSCHEDULER_H +#define LLVM_CODEGEN_WINDOWSCHEDULER_H + +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" + +namespace llvm { + +enum WindowSchedulingFlag { + WS_Off, /// Turn off window algorithm. + WS_On, /// Use window algorithm after SMS algorithm fails. + WS_Force /// Use window algorithm instead of SMS algorithm. +}; + +/// The main class in the implementation of the target independent window +/// scheduler. 
+class WindowScheduler { +protected: + MachineSchedContext *Context = nullptr; + MachineFunction *MF = nullptr; + MachineBasicBlock *MBB = nullptr; + MachineLoop &Loop; + const TargetSubtargetInfo *Subtarget = nullptr; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; + + /// To innovatively identify the dependencies between MIs across two trips, we + /// construct a DAG for a new MBB, which is created by copying the original + /// MBB three times. We refer to this new MBB as 'TripleMBB' and the + /// corresponding DAG as 'TripleDAG'. + /// If the dependencies are more than two trips, we avoid applying window + /// algorithm by identifying successive phis in the old MBB. + std::unique_ptr TripleDAG; + /// OriMIs keeps the MIs removed from the original MBB. + SmallVector OriMIs; + /// TriMIs keeps the MIs of TripleMBB, which is used to restore TripleMBB. + SmallVector TriMIs; + /// TriToOri keeps the mappings between the MI clones in TripleMBB and their + /// original MI. + DenseMap TriToOri; + /// OriToCycle keeps the mappings between the original MI and its issue cycle. + DenseMap OriToCycle; + /// SchedResult keeps the result of each list scheduling, and the format of + /// the tuple is (MI pointer, Cycle, Stage, Order ID). + SmallVector, 256> SchedResult; + /// SchedPhiNum records the number of phi in the original MBB, and the + /// scheduling starts with MI after phis. + unsigned SchedPhiNum = 0; + /// SchedInstrNum records the MIs involved in scheduling in the original MBB, + /// excluding debug instructions. + unsigned SchedInstrNum = 0; + /// BestII and BestOffset record the characteristics of the best scheduling + /// result and are used together with SchedResult as the final window + /// scheduling result. + unsigned BestII = UINT_MAX; + unsigned BestOffset = 0; + /// BaseII is the II obtained when the window offset is SchedPhiNum. This + /// offset is the initial position of the sliding window. 
+ unsigned BaseII = 0; + +public: + WindowScheduler(MachineSchedContext *C, MachineLoop &ML); + virtual ~WindowScheduler() {} + + bool run(); + +protected: + /// Two types of ScheduleDAGs are needed, one for creating dependency graphs + /// only, and the other for list scheduling as determined by the target. + virtual ScheduleDAGInstrs * + createMachineScheduler(bool OnlyBuildGraph = false); + /// Initializes the algorithm and determines if it can be executed. + virtual bool initialize(); + /// Add some related processing before running window scheduling. + virtual void preProcess(); + /// Add some related processing after running window scheduling. + virtual void postProcess(); + /// Back up the MIs in the original MBB and remove them from MBB. + void backupMBB(); + /// Erase the MIs in current MBB and restore the original MIs. + void restoreMBB(); + /// Make three copies of the original MBB to generate a new TripleMBB. + virtual void generateTripleMBB(); + /// Restore the order of MIs in TripleMBB after each list scheduling. + virtual void restoreTripleMBB(); + /// Give the folding position in the window algorithm, where different + /// heuristics can be used. It determines the performance and compilation time + /// of the algorithm. + virtual SmallVector getSearchIndexes(unsigned SearchNum, + unsigned SearchRatio); + /// Calculate MIs execution cycle after list scheduling. + virtual int calculateMaxCycle(ScheduleDAGInstrs &DAG, unsigned Offset); + /// Calculate the stall cycle between two trips after list scheduling. + virtual int calculateStallCycle(unsigned Offset, int MaxCycle); + /// Analyzes the II value after each list scheduling. + virtual unsigned analyseII(ScheduleDAGInstrs &DAG, unsigned Offset); + /// Phis are scheduled separately after each list scheduling. + virtual void schedulePhi(int Offset, unsigned &II); + /// Get the final issue order of all scheduled MIs including phis. 
+ DenseMap getIssueOrder(unsigned Offset, unsigned II); + /// Update the scheduling result after each list scheduling. + virtual void updateScheduleResult(unsigned Offset, unsigned II); + /// Check whether the final result of window scheduling is valid. + virtual bool isScheduleValid() { return BestOffset != SchedPhiNum; } + /// Using the scheduling infrastructure to expand the results of window + /// scheduling. It is usually necessary to add prologue and epilogue MBBs. + virtual void expand(); + /// Update the live intervals for all registers used within MBB. + virtual void updateLiveIntervals(); + /// Estimate a II value at which all MIs will be scheduled successfully. + int getEstimatedII(ScheduleDAGInstrs &DAG); + /// Gets the iterator range of MIs in the scheduling window. + iterator_range getScheduleRange(unsigned Offset, + unsigned Num); + /// Get the issue cycle of the new MI based on the cycle of the original MI. + int getOriCycle(MachineInstr *NewMI); + /// Get the original MI from which the new MI is cloned. + MachineInstr *getOriMI(MachineInstr *NewMI); + /// Get the scheduling stage, where the stage of the new MI is identical to + /// the original MI. + unsigned getOriStage(MachineInstr *OriMI, unsigned Offset); + /// Gets the register in phi which is generated from the current MBB. 
+ Register getAntiRegister(MachineInstr *Phi); +}; +} // namespace llvm +#endif diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 2c24de60edd43..d8780718669d0 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -244,6 +244,7 @@ add_llvm_component_library(LLVMCodeGen VLIWMachineScheduler.cpp VirtRegMap.cpp WasmEHPrepare.cpp + WindowScheduler.cpp WinEHPrepare.cpp XRayInstrumentation.cpp ${GeneratedMLSources} diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index d8cb681688339..7e88f68fa3fa3 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -68,6 +68,7 @@ #include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" @@ -206,6 +207,17 @@ cl::opt SwpForceIssueWidth( cl::desc("Force pipeliner to use specified issue width."), cl::Hidden, cl::init(-1)); +/// A command line argument to set the window scheduling option. 
+cl::opt WindowSchedulingOption( + "window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), + cl::desc("Set how to use window scheduling algorithm."), + cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", + "Turn off window algorithm."), + clEnumValN(WindowSchedulingFlag::WS_On, "on", + "Use window algorithm after SMS algorithm fails."), + clEnumValN(WindowSchedulingFlag::WS_Force, "force", + "Use window algorithm instead of SMS algorithm."))); + } // end namespace llvm unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5; @@ -292,8 +304,11 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) { } ++NumTrytoPipeline; + if (useSwingModuloScheduler()) + Changed = swingModuloScheduler(L); - Changed = swingModuloScheduler(L); + if (useWindowScheduler(Changed)) + Changed = runWindowScheduler(L); LI.LoopPipelinerInfo.reset(); return Changed; @@ -484,9 +499,35 @@ void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } +bool MachinePipeliner::runWindowScheduler(MachineLoop &L) { + MachineSchedContext Context; + Context.MF = MF; + Context.MLI = MLI; + Context.MDT = MDT; + Context.PassConfig = &getAnalysis(); + Context.AA = &getAnalysis().getAAResults(); + Context.LIS = &getAnalysis(); + Context.RegClassInfo->runOnMachineFunction(*MF); + WindowScheduler WS(&Context, L); + return WS.run(); +} + +bool MachinePipeliner::useSwingModuloScheduler() { + // SwingModuloScheduler does not work when WindowScheduler is forced. + return WindowSchedulingOption != WindowSchedulingFlag::WS_Force; +} + +bool MachinePipeliner::useWindowScheduler(bool Changed) { + // WindowScheduler does not work when it is off or when SwingModuloScheduler + // is successfully scheduled. 
+ return WindowSchedulingOption == WindowSchedulingFlag::WS_Force || + (WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed); +} + void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) { if (SwpForceII > 0) MII = SwpForceII; diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp new file mode 100644 index 0000000000000..7a7e4ca979925 --- /dev/null +++ b/llvm/lib/CodeGen/WindowScheduler.cpp @@ -0,0 +1,688 @@ +//======----------- WindowScheduler.cpp - window scheduler -------------======// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// An implementation of the Window Scheduling software pipelining algorithm. +// +// The fundamental concept of the window scheduling algorithm involves folding +// the original MBB at a specific position, followed by list scheduling on the +// folded MIs. The optimal scheduling result is then chosen from various folding +// positions as the final scheduling outcome. +// +// The primary challenge in this algorithm lies in generating the folded MIs and +// establishing their dependencies. We have innovatively employed a new MBB, +// created by copying the original MBB three times, known as TripleMBB. This +// TripleMBB enables the convenient implementation of MI folding and dependency +// establishment. To facilitate the algorithm's implementation, we have also +// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle. +// +// Another challenge in the algorithm is the scheduling of phis. Semantically, +// it is difficult to place the phis in the window and perform list scheduling. +// Therefore, we schedule these phis separately after each list scheduling. 
+// +// The provided implementation is designed for use before the Register Allocator +// (RA). If the target requires implementation after RA, it is recommended to +// reimplement analyseII(), schedulePhi(), and expand(). Additionally, +// target-specific logic can be added in initialize(), preProcess(), and +// postProcess(). +// +// Lastly, it is worth mentioning that getSearchIndexes() is an important +// function. We have experimented with more complex heuristics on downstream +// target and achieved favorable results. +// +//===----------------------------------------------------------------------===// +#include "llvm/CodeGen/WindowScheduler.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePipeliner.h" +#include "llvm/CodeGen/ModuloSchedule.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TimeProfiler.h" + +using namespace llvm; + +#define DEBUG_TYPE "pipeliner" + +namespace { +STATISTIC(NumTryWindowSchedule, + "Number of loops that we attempt to use window scheduling"); +STATISTIC(NumTryWindowSearch, + "Number of times that we run list schedule in the window scheduling"); +STATISTIC(NumWindowSchedule, + "Number of loops that we successfully use window scheduling"); +STATISTIC(NumFailAnalyseII, + "Window scheduling abort due to the failure of the II analysis"); + +cl::opt + WindowSearchNum("window-search-num", + cl::desc("The number of searches per loop in the window " + "algorithm. 0 means no search number limit."), + cl::Hidden, cl::init(6)); + +cl::opt WindowSearchRatio( + "window-search-ratio", + cl::desc("The ratio of searches per loop in the window algorithm. 
100 " + "means search all positions in the loop, while 0 means not " + "performing any search."), + cl::Hidden, cl::init(40)); + +cl::opt WindowIICoeff( + "window-ii-coeff", + cl::desc( + "The coefficient used when initializing II in the window algorithm."), + cl::Hidden, cl::init(5)); + +cl::opt WindowRegionLimit( + "window-region-limit", + cl::desc( + "The lower limit of the scheduling region in the window algorithm."), + cl::Hidden, cl::init(3)); + +cl::opt WindowDiffLimit( + "window-diff-limit", + cl::desc("The lower limit of the difference between best II and base II in " + "the window algorithm. If the difference is smaller than " + "this lower limit, window scheduling will not be performed."), + cl::Hidden, cl::init(2)); +} // namespace + +// WindowIILimit serves as an indicator of abnormal scheduling results and could +// potentially be referenced by the derived target window scheduler. +cl::opt + WindowIILimit("window-ii-limit", + cl::desc("The upper limit of II in the window algorithm."), + cl::Hidden, cl::init(1000)); + +WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML) + : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML), + Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()), + TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) { + TripleDAG = std::unique_ptr( + createMachineScheduler(/*OnlyBuildGraph=*/true)); +} + +bool WindowScheduler::run() { + if (!initialize()) { + LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n"); + return false; + } + // The window algorithm is time-consuming, and its compilation time should be + // taken into consideration. + TimeTraceScope Scope("WindowSearch"); + ++NumTryWindowSchedule; + // Performing the relevant processing before window scheduling. + preProcess(); + // The main window scheduling begins. 
+ std::unique_ptr SchedDAG(createMachineScheduler()); + auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio); + for (unsigned Idx : SearchIndexes) { + OriToCycle.clear(); + ++NumTryWindowSearch; + // The scheduling starts with non-phi instruction, so SchedPhiNum needs to + // be added to Idx. + unsigned Offset = Idx + SchedPhiNum; + auto Range = getScheduleRange(Offset, SchedInstrNum); + SchedDAG->startBlock(MBB); + SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum); + SchedDAG->schedule(); + LLVM_DEBUG(SchedDAG->dump()); + unsigned II = analyseII(*SchedDAG, Offset); + if (II == WindowIILimit) { + restoreTripleMBB(); + LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n"); + ++NumFailAnalyseII; + continue; + } + schedulePhi(Offset, II); + updateScheduleResult(Offset, II); + restoreTripleMBB(); + LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is " + << II << ".\n"); + } + // Performing the relevant processing after window scheduling. + postProcess(); + // Check whether the scheduling result is valid. + if (!isScheduleValid()) { + LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n"); + return false; + } + LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset + << " and Best II is " << BestII << ".\n"); + // Expand the scheduling result to prologue, kernel, and epilogue. + expand(); + ++NumWindowSchedule; + return true; +} + +ScheduleDAGInstrs * +WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) { + return OnlyBuildGraph + ? new ScheduleDAGMI( + Context, std::make_unique(Context), + true) + : Context->PassConfig->createMachineScheduler(Context); +} + +bool WindowScheduler::initialize() { + if (!Subtarget->enableWindowScheduler()) { + LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n"); + return false; + } + // Initialize the member variables used by window algorithm. 
+ OriMIs.clear(); + TriMIs.clear(); + TriToOri.clear(); + OriToCycle.clear(); + SchedResult.clear(); + SchedPhiNum = 0; + SchedInstrNum = 0; + BestII = UINT_MAX; + BestOffset = 0; + BaseII = 0; + // List scheduling used in the window algorithm depends on LiveIntervals. + if (!Context->LIS) { + LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n"); + return false; + } + // Check each MI in MBB. + SmallVector PhiDefs; + auto PLI = TII->analyzeLoopForPipelining(MBB); + for (auto &MI : *MBB) { + if (MI.isMetaInstruction() || MI.isTerminator()) + continue; + if (MI.isPHI()) { + for (auto Def : PhiDefs) + if (MI.readsRegister(Def, TRI)) { + LLVM_DEBUG( + dbgs() + << "Consecutive phis are not allowed in window scheduling!\n"); + return false; + } + for (auto Def : MI.defs()) + if (Def.isReg()) + PhiDefs.push_back(Def.getReg()); + ++SchedPhiNum; + ++BestOffset; + } else + ++SchedInstrNum; + if (TII->isSchedulingBoundary(MI, MBB, *MF)) { + LLVM_DEBUG( + dbgs() << "Boundary MI is not allowed in window scheduling!\n"); + return false; + } + if (PLI->shouldIgnoreForPipelining(&MI)) { + LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in " + "window scheduling!\n"); + return false; + } + for (auto &Def : MI.all_defs()) + if (Def.isReg() && Def.getReg().isPhysical()) + return false; + } + if (SchedInstrNum <= WindowRegionLimit) { + LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n"); + return false; + } + return true; +} + +void WindowScheduler::preProcess() { + // Prior to window scheduling, it's necessary to backup the original MBB, + // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB. 
+ backupMBB(); + generateTripleMBB(); + TripleDAG->startBlock(MBB); + TripleDAG->enterRegion( + MBB, MBB->begin(), MBB->getFirstTerminator(), + std::distance(MBB->begin(), MBB->getFirstTerminator())); + TripleDAG->buildSchedGraph(Context->AA); +} + +void WindowScheduler::postProcess() { + // After window scheduling, it's necessary to clear the TripleDAG and restore + // to the original MBB. + TripleDAG->exitRegion(); + TripleDAG->finishBlock(); + restoreMBB(); +} + +void WindowScheduler::backupMBB() { + for (auto &MI : MBB->instrs()) + OriMIs.push_back(&MI); + // Remove MIs and the corresponding live intervals. + for (auto &MI : make_early_inc_range(*MBB)) { + Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true); + MBB->remove(&MI); + } +} + +void WindowScheduler::restoreMBB() { + // Erase MIs and the corresponding live intervals. + for (auto &MI : make_early_inc_range(*MBB)) { + Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true); + MI.eraseFromParent(); + } + // Restore MBB to the state before window scheduling. + for (auto *MI : OriMIs) + MBB->push_back(MI); + updateLiveIntervals(); +} + +void WindowScheduler::generateTripleMBB() { + const unsigned DuplicateNum = 3; + TriMIs.clear(); + TriToOri.clear(); + assert(OriMIs.size() > 0 && "The Original MIs were not backed up!"); + // Step 1: Performing the first copy of MBB instructions, excluding + // terminators. At the same time, we back up the anti-register of phis. + // DefPairs hold the old and new define register pairs. 
+ DenseMap DefPairs; + for (auto *MI : OriMIs) { + if (MI->isMetaInstruction() || MI->isTerminator()) + continue; + if (MI->isPHI()) + if (Register AntiReg = getAntiRegister(MI)) + DefPairs[MI->getOperand(0).getReg()] = AntiReg; + auto *NewMI = MF->CloneMachineInstr(MI); + MBB->push_back(NewMI); + TriMIs.push_back(NewMI); + TriToOri[NewMI] = MI; + } + // Step 2: Performing the remaining two copies of MBB instructions excluding + // phis, and the last one contains terminators. At the same time, registers + // are updated accordingly. + for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) { + for (auto *MI : OriMIs) { + if (MI->isPHI() || MI->isMetaInstruction() || + (MI->isTerminator() && Cnt < DuplicateNum - 1)) + continue; + auto *NewMI = MF->CloneMachineInstr(MI); + DenseMap NewDefs; + // New defines are updated. + for (auto MO : NewMI->all_defs()) + if (MO.isReg() && MO.getReg().isVirtual()) { + Register NewDef = + MRI->createVirtualRegister(MRI->getRegClass(MO.getReg())); + NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI); + NewDefs[MO.getReg()] = NewDef; + } + // New uses are updated. + for (auto DefRegPair : DefPairs) + if (NewMI->readsRegister(DefRegPair.first, TRI)) { + Register NewUse = DefRegPair.second; + // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3': + // + // BB.3: DefPairs + // ================================== + // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7) + // ... + // ================================== + // ... + // %4 = sub i32 %1, %3 + // ... + // %7 = add i32 %5, %6 + // ... + // ---------------------------------- + // ... + // %8 = sub i32 %7, %3 (%1,%7),(%4,%8) + // ... + // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9) + // ... + // ---------------------------------- + // ... + // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9) + // ... ^ + // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11) + // ... 
+ // ================================== + // < Terminators > + // ================================== + if (DefPairs.count(NewUse)) + NewUse = DefPairs[NewUse]; + NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI); + } + // DefPairs is updated at last. + for (auto &NewDef : NewDefs) + DefPairs[NewDef.first] = NewDef.second; + MBB->push_back(NewMI); + TriMIs.push_back(NewMI); + TriToOri[NewMI] = MI; + } + } + // Step 3: The registers used by phis are updated, and they are generated in + // the third copy of MBB. + // In the previous example, the old phi is: + // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] + // The new phi is: + // %1 = phi i32 [%2, %BB.1], [%11, %BB.3] + for (auto &Phi : MBB->phis()) { + for (auto DefRegPair : DefPairs) + if (Phi.readsRegister(DefRegPair.first, TRI)) + Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI); + } + updateLiveIntervals(); +} + +void WindowScheduler::restoreTripleMBB() { + // After list scheduling, the MBB is restored in one traversal. + for (size_t I = 0; I < TriMIs.size(); ++I) { + auto *MI = TriMIs[I]; + auto OldPos = MBB->begin(); + std::advance(OldPos, I); + auto CurPos = MI->getIterator(); + if (CurPos != OldPos) { + MBB->splice(OldPos, MBB, CurPos); + Context->LIS->handleMove(*MI, /*UpdateFlags=*/false); + } + } +} + +SmallVector WindowScheduler::getSearchIndexes(unsigned SearchNum, + unsigned SearchRatio) { + // We use SearchRatio to get the index range, and then evenly get the indexes + // according to the SearchNum. This is a simple heuristic. Depending on the + // characteristics of the target, more complex algorithms can be used for both + // performance and compilation time. + assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!"); + unsigned MaxIdx = SchedInstrNum * SearchRatio / 100; + unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? 
MaxIdx / SearchNum : 1; + SmallVector SearchIndexes; + for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step) + SearchIndexes.push_back(Idx); + return SearchIndexes; +} + +int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) { + // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1. + unsigned MaxDepth = 1; + for (auto &SU : DAG.SUnits) + MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth); + return MaxDepth * WindowIICoeff; +} + +int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG, + unsigned Offset) { + int InitII = getEstimatedII(DAG); + ResourceManager RM(Subtarget, &DAG); + RM.init(InitII); + // ResourceManager and DAG are used to calculate the maximum cycle for the + // scheduled MIs. Since MIs in the Region have already been scheduled, the + // emit cycles can be estimated in order here. + int CurCycle = 0; + auto Range = getScheduleRange(Offset, SchedInstrNum); + for (auto &MI : Range) { + auto *SU = DAG.getSUnit(&MI); + int ExpectCycle = CurCycle; + // The predecessors of current MI determine its earliest issue cycle. + for (auto &Pred : SU->Preds) { + auto *PredMI = Pred.getSUnit()->getInstr(); + int PredCycle = getOriCycle(PredMI); + ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency()); + } + // ResourceManager can be used to detect resource conflicts between the + // current MI and the previously inserted MIs. + while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) { + ++CurCycle; + if (CurCycle == (int)WindowIILimit) + return CurCycle; + } + RM.reserveResources(*SU, CurCycle); + OriToCycle[getOriMI(&MI)] = CurCycle; + LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S." + << getOriStage(getOriMI(&MI), Offset) << "]: " << MI); + } + LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n"); + return CurCycle; +} + +// By utilizing TripleDAG, we can easily establish dependencies between A and B. 
+// Based on the MaxCycle and the issue cycle of A and B, we can determine +// whether it is necessary to add a stall cycle. This is because, without +// inserting the stall cycle, the latency constraint between A and B cannot be +// satisfied. The details are as follows: +// +// New MBB: +// ======================================== +// < Phis > +// ======================================== (sliding direction) +// MBB copy 1 | +// V +// +// ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window----- +// | | +// ===================V==================== | +// MBB copy 2 < MI B > | +// | +// < MI A > V +// ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------ +// : +// ===================V==================== +// MBB copy 3 < MI B'> +// +// +// +// +// ======================================== +// < Terminators > +// ======================================== +int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) { + int MaxStallCycle = 0; + auto Range = getScheduleRange(Offset, SchedInstrNum); + for (auto &MI : Range) { + auto *SU = TripleDAG->getSUnit(&MI); + int DefCycle = getOriCycle(&MI); + for (auto &Succ : SU->Succs) { + if (Succ.getSUnit() == &TripleDAG->ExitSU) + continue; + // If the expected cycle does not exceed MaxCycle, no check is needed. + if (DefCycle + (int)Succ.getLatency() <= MaxCycle) + continue; + // If the cycle of the scheduled MI A is less than that of the scheduled + // MI B, the scheduling will fail because the lifetime of the + // corresponding register exceeds II. + auto *SuccMI = Succ.getSUnit()->getInstr(); + int UseCycle = getOriCycle(SuccMI); + if (DefCycle < UseCycle) + return WindowIILimit; + // Get the stall cycle introduced by the register between two trips. 
+ int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle; + MaxStallCycle = std::max(MaxStallCycle, StallCycle); + } + } + LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n"); + return MaxStallCycle; +} + +unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) { + LLVM_DEBUG(dbgs() << "Start analyzing II:\n"); + int MaxCycle = calculateMaxCycle(DAG, Offset); + if (MaxCycle == (int)WindowIILimit) + return MaxCycle; + int StallCycle = calculateStallCycle(Offset, MaxCycle); + if (StallCycle == (int)WindowIILimit) + return StallCycle; + // The value of II is equal to the maximum execution cycle plus 1. + return MaxCycle + StallCycle + 1; +} + +void WindowScheduler::schedulePhi(int Offset, unsigned &II) { + LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n"); + for (auto &Phi : MBB->phis()) { + int LateCycle = INT_MAX; + auto *SU = TripleDAG->getSUnit(&Phi); + for (auto &Succ : SU->Succs) { + // Phi doesn't have any Anti successors. + if (Succ.getKind() != SDep::Data) + continue; + // Phi is scheduled before the successor of stage 0. The issue cycle of + // phi is the latest cycle in this interval. + auto *SuccMI = Succ.getSUnit()->getInstr(); + int Cycle = getOriCycle(SuccMI); + if (getOriStage(getOriMI(SuccMI), Offset) == 0) + LateCycle = std::min(LateCycle, Cycle); + } + // The anti-dependency of phi must be handled separately in the same way. + if (Register AntiReg = getAntiRegister(&Phi)) { + auto *AntiMI = MRI->getVRegDef(AntiReg); + auto AntiCycle = getOriCycle(AntiMI); + if (getOriStage(getOriMI(AntiMI), Offset) == 0) + LateCycle = std::min(LateCycle, AntiCycle); + } + // If there is no limit to the late cycle, a default value is given. + if (LateCycle == INT_MAX) + LateCycle = (int)(II - 1); + LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi); + // The issue cycle of phi is set to the latest cycle in the interval. 
+ auto *OriPhi = getOriMI(&Phi); + OriToCycle[OriPhi] = LateCycle; + } +} + +DenseMap WindowScheduler::getIssueOrder(unsigned Offset, + unsigned II) { + // At each issue cycle, phi is placed before MIs in stage 0. So the simplest + // way is to put phi at the beginning of the current cycle. + DenseMap> CycleToMIs; + auto Range = getScheduleRange(Offset, SchedInstrNum); + for (auto &Phi : MBB->phis()) + CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi)); + for (auto &MI : Range) + CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI)); + // Each MI is assigned a separate ordered Id, which is used as a sort marker + // in the following expand process. + DenseMap IssueOrder; + int Id = 0; + for (int Cycle = 0; Cycle < (int)II; ++Cycle) { + if (!CycleToMIs.count(Cycle)) + continue; + for (auto *MI : CycleToMIs[Cycle]) + IssueOrder[MI] = Id++; + } + return IssueOrder; +} + +void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) { + // At the first update, Offset is equal to SchedPhiNum. At this time, only + // BestII, BestOffset, and BaseII need to be updated. + if (Offset == SchedPhiNum) { + BestII = II; + BestOffset = SchedPhiNum; + BaseII = II; + return; + } + // The update will only continue if the II is smaller than BestII and the II + // is sufficiently small. + if ((II >= BestII) || (II + WindowDiffLimit > BaseII)) + return; + BestII = II; + BestOffset = Offset; + // Record the result of the current list scheduling, noting that each MI is + // stored unordered in SchedResult. + SchedResult.clear(); + auto IssueOrder = getIssueOrder(Offset, II); + for (auto &Pair : OriToCycle) { + assert(IssueOrder.count(Pair.first) && "Cannot find original MI!"); + SchedResult.push_back(std::make_tuple(Pair.first, Pair.second, + getOriStage(Pair.first, Offset), + IssueOrder[Pair.first])); + } +} + +void WindowScheduler::expand() { + // The MIs in the SchedResult are sorted by the issue order ID. 
+ llvm::stable_sort(SchedResult, + [](const std::tuple &A, + const std::tuple &B) { + return std::get<3>(A) < std::get<3>(B); + }); + // Use the scheduling infrastructure for expansion, noting that InstrChanges + // is not supported here. + DenseMap Cycles, Stages; + std::vector OrderedInsts; + for (auto &Info : SchedResult) { + auto *MI = std::get<0>(Info); + OrderedInsts.push_back(MI); + Cycles[MI] = std::get<1>(Info); + Stages[MI] = std::get<2>(Info); + LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI] + << "]: " << *MI); + } + ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles), + std::move(Stages)); + ModuloScheduleExpander MSE(*MF, MS, *Context->LIS, + ModuloScheduleExpander::InstrChangesTy()); + MSE.expand(); + MSE.cleanup(); +} + +void WindowScheduler::updateLiveIntervals() { + SmallVector UsedRegs; + for (MachineInstr &MI : *MBB) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.getReg() == 0) + continue; + Register Reg = MO.getReg(); + if (!is_contained(UsedRegs, Reg)) + UsedRegs.push_back(Reg); + } + Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs); +} + +iterator_range +WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) { + auto RegionBegin = MBB->begin(); + std::advance(RegionBegin, Offset); + auto RegionEnd = RegionBegin; + std::advance(RegionEnd, Num); + return make_range(RegionBegin, RegionEnd); +} + +int WindowScheduler::getOriCycle(MachineInstr *NewMI) { + assert(TriToOri.count(NewMI) && "Cannot find original MI!"); + auto *OriMI = TriToOri[NewMI]; + assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!"); + return OriToCycle[OriMI]; +} + +MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) { + assert(TriToOri.count(NewMI) && "Cannot find original MI!"); + return TriToOri[NewMI]; +} + +unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) { + assert(llvm::find(OriMIs, OriMI) != OriMIs.end() && + 
"Cannot find OriMI in OriMIs!"); + // If there is no instruction fold, all MI stages are 0. + if (Offset == SchedPhiNum) + return 0; + // For those MIs with an ID less than the Offset, their stages are set to 0, + // while the rest are set to 1. + unsigned Id = 0; + for (auto *MI : OriMIs) { + if (MI->isMetaInstruction()) + continue; + if (MI == OriMI) + break; + ++Id; + } + return Id >= (size_t)Offset ? 1 : 0; +} + +Register WindowScheduler::getAntiRegister(MachineInstr *Phi) { + assert(Phi->isPHI() && "Expecting PHI!"); + Register AntiReg; + for (auto MO : Phi->uses()) { + if (MO.isReg()) + AntiReg = MO.getReg(); + else if (MO.isMBB() && MO.getMBB()->getNumber() == MBB->getNumber()) + return AntiReg; + } + return 0; +} diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-dead-def.mir b/llvm/test/CodeGen/Hexagon/swp-ws-dead-def.mir new file mode 100644 index 0000000000000..b1549e39c910f --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-dead-def.mir @@ -0,0 +1,131 @@ +# REQUIRES: asserts +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -o - 2>&1 | FileCheck %s + +# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}. 
+# CHECK-LABEL: name: exp_approx_top_six +# CHECK: bb.5.loop_body: +# CHECK: dead %{{[0-9]*}}:hvxvr = V6_vaddw %{{[0-9]*}}, %{{[0-9]*}} +# CHECK: ENDLOOP0 +# CHECK: bb.6: + +--- | + define void @exp_approx_top_six(i32 %N, ptr noalias %x, ptr noalias %y) #0 { + entry: + %is_zero = icmp eq i32 %N, 0 + br i1 %is_zero, label %exit, label %loop_header + + loop_header: + %vec_one = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216) + %vec_half = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) + %vec_sixth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1042983595) + %vec_24th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1026206379) + %vec_120th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1007192201) + %vec_720th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 985008993) + br label %loop_body + + exit: + ret void + + loop_body: + %lsr.iv1 = phi ptr [ %cgep3, %loop_body ], [ %x, %loop_header ] + %lsr.iv = phi ptr [ %cgep, %loop_body ], [ %y, %loop_header ] + %index = phi i32 [ 0, %loop_header ], [ %index_next, %loop_body ] + %vec_input = load <32 x i32>, ptr %lsr.iv1, align 128 + %vec_input_pow_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input, <32 x i32> %vec_input) + %vec_input_pow_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_2, <32 x i32> %vec_input) + %vec_input_pow_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_3, <32 x i32> %vec_input) + %vec_input_pow_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_4, <32 x i32> %vec_input) + %vec_input_pow_6 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_5, <32 x i32> %vec_input) + %vec_exp_approx_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_half, <32 x i32> %vec_input_pow_2) + %vec_exp_approx_2 = tail call <32 x i32> 
@llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sixth, <32 x i32> %vec_input_pow_3) + %vec_exp_approx_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_24th, <32 x i32> %vec_input_pow_4) + %vec_exp_approx_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_120th, <32 x i32> %vec_input_pow_5) + %vec_exp_approx_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_720th, <32 x i32> %vec_input_pow_6) + %vec_exp_sum_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_one, <32 x i32> %vec_input) + %vec_exp_sum_2 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_1, <32 x i32> %vec_exp_approx_1) + %vec_exp_sum_3 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_2, <32 x i32> %vec_exp_approx_2) + %vec_exp_sum_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_3, <32 x i32> %vec_exp_approx_3) + %vec_exp_sum_5 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_4, <32 x i32> %vec_exp_approx_4) + %vec_exp_result = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_5, <32 x i32> %vec_exp_approx_5) + store <32 x i32> %vec_exp_result, ptr %lsr.iv, align 128 + %index_next = add nuw i32 %index, 32 + %loop_cond = icmp ult i32 %index_next, %N + %cgep = getelementptr i8, ptr %lsr.iv, i32 128 + %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128 + br i1 %loop_cond, label %loop_body, label %exit + } + + declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>) + declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) + declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>) + + attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" } +... 
+--- +name: exp_approx_top_six +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2 + + %0:intregs = COPY $r2 + %1:intregs = COPY $r1 + %2:intregs = COPY $r0 + %3:predregs = C2_cmpeqi %2, 0 + J2_jumpt killed %3, %bb.2, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1.loop_header: + successors: %bb.3(0x80000000) + + %4:intregs = A2_tfrsi 1065353216 + %5:hvxvr = V6_lvsplatw killed %4 + %6:intregs = A2_tfrsi 1056964608 + %7:hvxvr = V6_lvsplatw killed %6 + %8:intregs = A2_tfrsi 1042983595 + %9:hvxvr = V6_lvsplatw killed %8 + %10:intregs = A2_tfrsi 1026206379 + %11:hvxvr = V6_lvsplatw killed %10 + %12:intregs = A2_tfrsi 1007192201 + %13:hvxvr = V6_lvsplatw killed %12 + %14:intregs = A2_tfrsi 985008993 + %15:hvxvr = V6_lvsplatw killed %14 + %16:intregs = A2_addi %2, 31 + %17:intregs = S2_lsr_i_r %16, 5 + %18:intregs = COPY %17 + J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.3, implicit-def dead $pc + + bb.2.exit: + PS_jmpret $r31, implicit-def dead $pc + + bb.3.loop_body (machine-block-address-taken): + successors: %bb.3(0x7c000000), %bb.2(0x04000000) + + %19:intregs = PHI %1, %bb.1, %20, %bb.3 + %21:intregs = PHI %0, %bb.1, %22, %bb.3 + %23:hvxvr, %20:intregs = V6_vL32b_pi %19, 128 :: (load (s1024) from %ir.lsr.iv1) + %24:hvxvr = V6_vmpyowh_rnd %23, %23 + %25:hvxvr = V6_vmpyowh_rnd %24, %23 + %26:hvxvr = V6_vmpyowh_rnd %25, %23 + %27:hvxvr = V6_vmpyowh_rnd %26, %23 + %28:hvxvr = V6_vmpyowh_rnd %27, %23 + %29:hvxvr = V6_vmpyowh_rnd %7, %24 + %30:hvxvr = V6_vmpyowh_rnd %9, %25 + %31:hvxvr = V6_vmpyowh_rnd %11, %26 + %32:hvxvr = V6_vmpyowh_rnd %13, %27 + %33:hvxvr = V6_vmpyowh_rnd %15, killed %28 + %34:hvxvr = V6_vaddw %5, %23 + %35:hvxvr = V6_vaddw killed %34, killed %29 + %36:hvxvr = V6_vaddw killed %35, killed %30 + %37:hvxvr = V6_vaddw killed %36, killed %31 + %38:hvxvr = V6_vaddw killed %37, killed %32 + %39:hvxvr = V6_vaddw %38, 
%33 + ; Checks that a dead virtual register def is kept within the loop kernel. + dead %40:hvxvr = V6_vaddw killed %38, killed %33 + %22:intregs = V6_vS32b_pi %21, 128, killed %39 :: (store (s1024) into %ir.lsr.iv) + ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-exp-dbg.mir b/llvm/test/CodeGen/Hexagon/swp-ws-exp-dbg.mir new file mode 100644 index 0000000000000..b62cbbbb8af4e --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-exp-dbg.mir @@ -0,0 +1,310 @@ +# REQUIRES: asserts +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -o - 2>&1 | FileCheck %s +# +# The Window Scheduling algorithm will discard the debug IR, just like the SMS +# algorithm does. Additionally, the MMO information in the IR is also preserved +# to ensure that no barrier dependencies are generated within the loop body. +# +# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}.
+# CHECK-LABEL: name: exp_approx +# CHECK: bb.5.for.body: +# CHECK-NOT: DBG_VALUE +# CHECK: ENDLOOP0 +# CHECK: bb.6: + + +--- | + define void @exp_approx(i32 %num_elements, ptr noalias %input_ptr, ptr noalias %output_ptr) #0 !dbg !20 { + entry: + tail call void @llvm.dbg.value(metadata i32 %num_elements, metadata !33, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata ptr %input_ptr, metadata !34, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata ptr %output_ptr, metadata !35, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata float 1.000000e+00, metadata !36, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata float 5.000000e-01, metadata !37, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata float 0x3FC5555560000000, metadata !38, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata float 0x3FA5555560000000, metadata !39, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata float 0x3F81111120000000, metadata !40, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata float 0x3F56C16C20000000, metadata !41, metadata !DIExpression()), !dbg !69 + tail call void @llvm.dbg.value(metadata i32 0, metadata !42, metadata !DIExpression()), !dbg !70 + %is_zero_elements = icmp eq i32 %num_elements, 0, !dbg !71 + br i1 %is_zero_elements, label %for.cond.cleanup, label %for.body.lr.ph, !dbg !72 + + for.body.lr.ph: + %const_1 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216) + %const_half = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) + %const_third = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1042983595) + %const_quarter = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1026206379) + %const_fifth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1007192201) + %const_sixth = tail call <32 x i32> 
@llvm.hexagon.V6.lvsplatw.128B(i32 985008993) + br label %for.body, !dbg !72 + + for.cond.cleanup: + ret void, !dbg !73 + + for.body: + %lsr.iv1 = phi ptr [ %cgep3, %for.body ], [ %input_ptr, %for.body.lr.ph ] + %lsr.iv = phi ptr [ %cgep, %for.body ], [ %output_ptr, %for.body.lr.ph ] + %index = phi i32 [ 0, %for.body.lr.ph ], [ %next_index, %for.body ] + tail call void @llvm.dbg.value(metadata i32 %index, metadata !42, metadata !DIExpression()), !dbg !70 + %input_values = load <32 x i32>, ptr %lsr.iv1, align 128, !dbg !74 + tail call void @llvm.dbg.value(metadata <32 x i32> %input_values, metadata !44, metadata !DIExpression()), !dbg !75 + %squared_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %input_values, <32 x i32> %input_values), !dbg !76 + tail call void @llvm.dbg.value(metadata <32 x i32> %squared_values, metadata !47, metadata !DIExpression()), !dbg !75 + %cubed_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %squared_values, <32 x i32> %input_values), !dbg !77 + tail call void @llvm.dbg.value(metadata <32 x i32> %cubed_values, metadata !48, metadata !DIExpression()), !dbg !75 + %quartic_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %cubed_values, <32 x i32> %input_values), !dbg !78 + tail call void @llvm.dbg.value(metadata <32 x i32> %quartic_values, metadata !49, metadata !DIExpression()), !dbg !75 + %quintic_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %quartic_values, <32 x i32> %input_values), !dbg !79 + tail call void @llvm.dbg.value(metadata <32 x i32> %quintic_values, metadata !50, metadata !DIExpression()), !dbg !75 + %sextic_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %quintic_values, <32 x i32> %input_values), !dbg !80 + tail call void @llvm.dbg.value(metadata <32 x i32> %sextic_values, metadata !51, metadata !DIExpression()), !dbg !75 + tail call void @llvm.dbg.value(metadata <32 x i32> %const_1, 
metadata !52, metadata !DIExpression()), !dbg !75 + tail call void @llvm.dbg.value(metadata <32 x i32> %const_half, metadata !53, metadata !DIExpression()), !dbg !75 + tail call void @llvm.dbg.value(metadata <32 x i32> %const_third, metadata !54, metadata !DIExpression()), !dbg !75 + tail call void @llvm.dbg.value(metadata <32 x i32> %const_quarter, metadata !55, metadata !DIExpression()), !dbg !75 + tail call void @llvm.dbg.value(metadata <32 x i32> %const_fifth, metadata !56, metadata !DIExpression()), !dbg !75 + tail call void @llvm.dbg.value(metadata <32 x i32> %const_sixth, metadata !57, metadata !DIExpression()), !dbg !75 + %product_half = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_half, <32 x i32> %squared_values), !dbg !81 + tail call void @llvm.dbg.value(metadata <32 x i32> %product_half, metadata !58, metadata !DIExpression()), !dbg !75 + %product_third = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_third, <32 x i32> %cubed_values), !dbg !82 + tail call void @llvm.dbg.value(metadata <32 x i32> %product_third, metadata !59, metadata !DIExpression()), !dbg !75 + %product_quarter = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_quarter, <32 x i32> %quartic_values), !dbg !83 + tail call void @llvm.dbg.value(metadata <32 x i32> %product_quarter, metadata !60, metadata !DIExpression()), !dbg !75 + %product_fifth = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_fifth, <32 x i32> %quintic_values), !dbg !84 + tail call void @llvm.dbg.value(metadata <32 x i32> %product_fifth, metadata !61, metadata !DIExpression()), !dbg !75 + %product_sixth = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_sixth, <32 x i32> %sextic_values), !dbg !85 + tail call void @llvm.dbg.value(metadata <32 x i32> %product_sixth, metadata !62, metadata !DIExpression()), !dbg !75 + %sum_1_input = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> 
%const_1, <32 x i32> %input_values), !dbg !86 + tail call void @llvm.dbg.value(metadata <32 x i32> %sum_1_input, metadata !63, metadata !DIExpression()), !dbg !75 + %sum_half = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_1_input, <32 x i32> %product_half), !dbg !87 + tail call void @llvm.dbg.value(metadata <32 x i32> %sum_half, metadata !64, metadata !DIExpression()), !dbg !75 + %sum_third = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_half, <32 x i32> %product_third), !dbg !88 + tail call void @llvm.dbg.value(metadata <32 x i32> %sum_third, metadata !65, metadata !DIExpression()), !dbg !75 + %sum_quarter = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_third, <32 x i32> %product_quarter), !dbg !89 + tail call void @llvm.dbg.value(metadata <32 x i32> %sum_quarter, metadata !66, metadata !DIExpression()), !dbg !75 + %sum_fifth = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_quarter, <32 x i32> %product_fifth), !dbg !90 + tail call void @llvm.dbg.value(metadata <32 x i32> %sum_fifth, metadata !67, metadata !DIExpression()), !dbg !75 + %final_result = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_fifth, <32 x i32> %product_sixth), !dbg !91 + tail call void @llvm.dbg.value(metadata <32 x i32> %final_result, metadata !68, metadata !DIExpression()), !dbg !75 + store <32 x i32> %final_result, ptr %lsr.iv, align 128, !dbg !92 + %next_index = add nuw i32 %index, 32, !dbg !93 + tail call void @llvm.dbg.value(metadata i32 %next_index, metadata !42, metadata !DIExpression()), !dbg !70 + %continue_loop = icmp ult i32 %next_index, %num_elements, !dbg !71 + %cgep = getelementptr i8, ptr %lsr.iv, i32 128 + %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128 + br i1 %continue_loop, label %for.body, label %for.cond.cleanup, !dbg !72, !llvm.loop !94 + } + + declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>) + declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) + 
declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>) + declare void @llvm.dbg.value(metadata, metadata, metadata) + + attributes #0 = { "target-features"="+hvx-length128b,+hvxv68,+v66,-long-calls" } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!13, !14, !15, !16, !17, !18, !19} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: None) + !1 = !DIFile(filename: "exp_approx.cpp", directory: "/tmp") + !2 = !{!3, !10} + !3 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32) + !4 = !DIDerivedType(tag: DW_TAG_typedef, name: "HVX_Vector", file: !1, line: 11, baseType: !5) + !5 = !DIDerivedType(tag: DW_TAG_typedef, name: "Vect1024", file: !1, line: 4, baseType: !6, align: 1024) + !6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 1024, flags: DIFlagVector, elements: !8) + !7 = !DIBasicType(name: "long", size: 32, encoding: DW_ATE_signed) + !8 = !{!9} + !9 = !DISubrange(count: 32) + !10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32) + !11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12) + !12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !13 = !{i32 7, !"Dwarf Version", i32 5} + !14 = !{i32 2, !"Debug Info Version", i32 3} + !15 = !{i32 1, !"wchar_size", i32 4} + !16 = !{i32 8, !"PIC Level", i32 2} + !17 = !{i32 7, !"PIE Level", i32 2} + !18 = !{i32 7, !"frame-pointer", i32 2} + !19 = !{i32 7, !"debug-info-assignment-tracking", i1 true} + !20 = distinct !DISubprogram(name: "exp_approx", linkageName: "exp_approx", scope: !1, file: !1, line: 15, type: !21, scopeLine: 15, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !32) + !21 = !DISubroutineType(types: !22) + !22 = !{null, !23, !26, !30} + !23 = !DIDerivedType(tag: 
DW_TAG_typedef, name: "size_t", file: !24, line: 13, baseType: !25) + !24 = !DIFile(filename: "__stddef_size_t.h", directory: "/tmp") + !25 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) + !26 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !27) + !27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 32) + !28 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !29) + !29 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) + !30 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !31) + !31 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !29, size: 32) + !32 = !{!33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !44, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68} + !33 = !DILocalVariable(name: "N", arg: 1, scope: !20, file: !1, line: 15, type: !23) + !34 = !DILocalVariable(name: "x", arg: 2, scope: !20, file: !1, line: 15, type: !26) + !35 = !DILocalVariable(name: "y", arg: 3, scope: !20, file: !1, line: 15, type: !30) + !36 = !DILocalVariable(name: "a", scope: !20, file: !1, line: 16, type: !28) + !37 = !DILocalVariable(name: "b", scope: !20, file: !1, line: 17, type: !28) + !38 = !DILocalVariable(name: "c", scope: !20, file: !1, line: 18, type: !28) + !39 = !DILocalVariable(name: "d", scope: !20, file: !1, line: 19, type: !28) + !40 = !DILocalVariable(name: "e", scope: !20, file: !1, line: 20, type: !28) + !41 = !DILocalVariable(name: "f", scope: !20, file: !1, line: 21, type: !28) + !42 = !DILocalVariable(name: "i", scope: !43, file: !1, line: 25, type: !23) + !43 = distinct !DILexicalBlock(scope: !20, file: !1, line: 25, column: 3) + !44 = !DILocalVariable(name: "v_x1", scope: !45, file: !1, line: 26, type: !4) + !45 = distinct !DILexicalBlock(scope: !46, file: !1, line: 25, column: 38) + !46 = distinct !DILexicalBlock(scope: !43, file: !1, line: 25, column: 3) + !47 = !DILocalVariable(name: "v_x2", scope: !45, file: !1, line: 28, type: 
!4) + !48 = !DILocalVariable(name: "v_x3", scope: !45, file: !1, line: 29, type: !4) + !49 = !DILocalVariable(name: "v_x4", scope: !45, file: !1, line: 30, type: !4) + !50 = !DILocalVariable(name: "v_x5", scope: !45, file: !1, line: 31, type: !4) + !51 = !DILocalVariable(name: "v_x6", scope: !45, file: !1, line: 32, type: !4) + !52 = !DILocalVariable(name: "v_a", scope: !45, file: !1, line: 33, type: !4) + !53 = !DILocalVariable(name: "v_b", scope: !45, file: !1, line: 34, type: !4) + !54 = !DILocalVariable(name: "v_c", scope: !45, file: !1, line: 35, type: !4) + !55 = !DILocalVariable(name: "v_d", scope: !45, file: !1, line: 36, type: !4) + !56 = !DILocalVariable(name: "v_e", scope: !45, file: !1, line: 37, type: !4) + !57 = !DILocalVariable(name: "v_f", scope: !45, file: !1, line: 38, type: !4) + !58 = !DILocalVariable(name: "v_1", scope: !45, file: !1, line: 39, type: !4) + !59 = !DILocalVariable(name: "v_2", scope: !45, file: !1, line: 40, type: !4) + !60 = !DILocalVariable(name: "v_3", scope: !45, file: !1, line: 41, type: !4) + !61 = !DILocalVariable(name: "v_4", scope: !45, file: !1, line: 42, type: !4) + !62 = !DILocalVariable(name: "v_5", scope: !45, file: !1, line: 43, type: !4) + !63 = !DILocalVariable(name: "vr_1", scope: !45, file: !1, line: 44, type: !4) + !64 = !DILocalVariable(name: "vr_2", scope: !45, file: !1, line: 45, type: !4) + !65 = !DILocalVariable(name: "vr_3", scope: !45, file: !1, line: 46, type: !4) + !66 = !DILocalVariable(name: "vr_4", scope: !45, file: !1, line: 47, type: !4) + !67 = !DILocalVariable(name: "vr_5", scope: !45, file: !1, line: 48, type: !4) + !68 = !DILocalVariable(name: "result", scope: !45, file: !1, line: 49, type: !4) + !69 = !DILocation(line: 0, scope: !20) + !70 = !DILocation(line: 0, scope: !43) + !71 = !DILocation(line: 25, column: 24, scope: !46) + !72 = !DILocation(line: 25, column: 3, scope: !43) + !73 = !DILocation(line: 53, column: 1, scope: !20) + !74 = !DILocation(line: 26, column: 22, scope: !45) + !75 = 
!DILocation(line: 0, scope: !45) + !76 = !DILocation(line: 28, column: 23, scope: !45) + !77 = !DILocation(line: 29, column: 23, scope: !45) + !78 = !DILocation(line: 30, column: 23, scope: !45) + !79 = !DILocation(line: 31, column: 23, scope: !45) + !80 = !DILocation(line: 32, column: 23, scope: !45) + !81 = !DILocation(line: 39, column: 22, scope: !45) + !82 = !DILocation(line: 40, column: 22, scope: !45) + !83 = !DILocation(line: 41, column: 22, scope: !45) + !84 = !DILocation(line: 42, column: 22, scope: !45) + !85 = !DILocation(line: 43, column: 22, scope: !45) + !86 = !DILocation(line: 44, column: 23, scope: !45) + !87 = !DILocation(line: 45, column: 23, scope: !45) + !88 = !DILocation(line: 46, column: 23, scope: !45) + !89 = !DILocation(line: 47, column: 23, scope: !45) + !90 = !DILocation(line: 48, column: 23, scope: !45) + !91 = !DILocation(line: 49, column: 25, scope: !45) + !92 = !DILocation(line: 51, column: 31, scope: !45) + !93 = !DILocation(line: 25, column: 31, scope: !46) + !94 = distinct !{!94, !72, !95} + !95 = !DILocation(line: 52, column: 3, scope: !43) + +... 
+--- +name: exp_approx +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2 + + DBG_VALUE $r0, $noreg, !33, !DIExpression(), debug-location !69 + DBG_VALUE $r1, $noreg, !34, !DIExpression(), debug-location !69 + DBG_VALUE $r2, $noreg, !35, !DIExpression(), debug-location !69 + %0:intregs = COPY $r2 + DBG_VALUE %0, $noreg, !35, !DIExpression(), debug-location !69 + %1:intregs = COPY $r1 + DBG_VALUE %1, $noreg, !34, !DIExpression(), debug-location !69 + %2:intregs = COPY $r0 + DBG_VALUE %2, $noreg, !33, !DIExpression(), debug-location !69 + DBG_VALUE float 1.000000e+00, $noreg, !36, !DIExpression(), debug-location !69 + DBG_VALUE float 5.000000e-01, $noreg, !37, !DIExpression(), debug-location !69 + DBG_VALUE float 0x3FC5555560000000, $noreg, !38, !DIExpression(), debug-location !69 + DBG_VALUE float 0x3FA5555560000000, $noreg, !39, !DIExpression(), debug-location !69 + DBG_VALUE float 0x3F81111120000000, $noreg, !40, !DIExpression(), debug-location !69 + DBG_VALUE float 0x3F56C16C20000000, $noreg, !41, !DIExpression(), debug-location !69 + DBG_VALUE 0, $noreg, !42, !DIExpression(), debug-location !70 + %3:predregs = C2_cmpeqi %2, 0, debug-location !71 + J2_jumpt killed %3, %bb.2, implicit-def dead $pc, debug-location !72 + J2_jump %bb.1, implicit-def dead $pc, debug-location !72 + + bb.1.for.body.lr.ph: + successors: %bb.3(0x80000000) + + %4:intregs = A2_tfrsi 1065353216 + %5:hvxvr = V6_lvsplatw killed %4 + %6:intregs = A2_tfrsi 1056964608 + %7:hvxvr = V6_lvsplatw killed %6 + %8:intregs = A2_tfrsi 1042983595 + %9:hvxvr = V6_lvsplatw killed %8 + %10:intregs = A2_tfrsi 1026206379 + %11:hvxvr = V6_lvsplatw killed %10 + %12:intregs = A2_tfrsi 1007192201 + %13:hvxvr = V6_lvsplatw killed %12 + %14:intregs = A2_tfrsi 985008993 + %15:hvxvr = V6_lvsplatw killed %14 + %16:intregs = A2_addi %2, 31, debug-location !72 + %17:intregs = S2_lsr_i_r %16, 5, debug-location !72 + %18:intregs = COPY %17, 
debug-location !72 + J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr, debug-location !72 + J2_jump %bb.3, implicit-def dead $pc, debug-location !72 + + bb.2.for.cond.cleanup: + PS_jmpret $r31, implicit-def dead $pc, debug-location !73 + + bb.3.for.body (machine-block-address-taken): + successors: %bb.3(0x7c000000), %bb.2(0x04000000) + + %19:intregs = PHI %1, %bb.1, %20, %bb.3 + %21:intregs = PHI %0, %bb.1, %22, %bb.3 + DBG_VALUE %23:intregs, $noreg, !42, !DIExpression(), debug-location !70 + %24:hvxvr, %20:intregs = V6_vL32b_pi %19, 128, debug-location !74 :: (load (s1024) from %ir.lsr.iv1) + DBG_VALUE %24, $noreg, !44, !DIExpression(), debug-location !75 + %25:hvxvr = V6_vmpyowh_rnd %24, %24, debug-location !76 + DBG_VALUE %25, $noreg, !47, !DIExpression(), debug-location !75 + %26:hvxvr = V6_vmpyowh_rnd %25, %24, debug-location !77 + DBG_VALUE %26, $noreg, !48, !DIExpression(), debug-location !75 + %27:hvxvr = V6_vmpyowh_rnd %26, %24, debug-location !78 + DBG_VALUE %27, $noreg, !49, !DIExpression(), debug-location !75 + %28:hvxvr = V6_vmpyowh_rnd %27, %24, debug-location !79 + DBG_VALUE %28, $noreg, !50, !DIExpression(), debug-location !75 + %29:hvxvr = V6_vmpyowh_rnd %28, %24, debug-location !80 + DBG_VALUE %29, $noreg, !51, !DIExpression(), debug-location !75 + DBG_VALUE %5, $noreg, !52, !DIExpression(), debug-location !75 + DBG_VALUE %7, $noreg, !53, !DIExpression(), debug-location !75 + DBG_VALUE %9, $noreg, !54, !DIExpression(), debug-location !75 + DBG_VALUE %11, $noreg, !55, !DIExpression(), debug-location !75 + DBG_VALUE %13, $noreg, !56, !DIExpression(), debug-location !75 + DBG_VALUE %15, $noreg, !57, !DIExpression(), debug-location !75 + %30:hvxvr = V6_vmpyowh_rnd %7, %25, debug-location !81 + DBG_VALUE %30, $noreg, !58, !DIExpression(), debug-location !75 + %31:hvxvr = V6_vmpyowh_rnd %9, %26, debug-location !82 + DBG_VALUE %31, $noreg, !59, !DIExpression(), debug-location !75 + %32:hvxvr = V6_vmpyowh_rnd %11, %27, 
debug-location !83 + DBG_VALUE %32, $noreg, !60, !DIExpression(), debug-location !75 + %33:hvxvr = V6_vmpyowh_rnd %13, %28, debug-location !84 + DBG_VALUE %33, $noreg, !61, !DIExpression(), debug-location !75 + %34:hvxvr = V6_vmpyowh_rnd %15, killed %29, debug-location !85 + DBG_VALUE %34, $noreg, !62, !DIExpression(), debug-location !75 + %35:hvxvr = V6_vaddw %5, %24, debug-location !86 + DBG_VALUE %35, $noreg, !63, !DIExpression(), debug-location !75 + %36:hvxvr = V6_vaddw killed %35, killed %30, debug-location !87 + DBG_VALUE %36, $noreg, !64, !DIExpression(), debug-location !75 + %37:hvxvr = V6_vaddw killed %36, killed %31, debug-location !88 + DBG_VALUE %37, $noreg, !65, !DIExpression(), debug-location !75 + %38:hvxvr = V6_vaddw killed %37, killed %32, debug-location !89 + DBG_VALUE %38, $noreg, !66, !DIExpression(), debug-location !75 + %39:hvxvr = V6_vaddw killed %38, killed %33, debug-location !90 + DBG_VALUE %39, $noreg, !67, !DIExpression(), debug-location !75 + %40:hvxvr = V6_vaddw killed %39, killed %34, debug-location !91 + DBG_VALUE %40, $noreg, !68, !DIExpression(), debug-location !75 + %22:intregs = V6_vS32b_pi %21, 128, killed %40, debug-location !92 :: (store (s1024) into %ir.lsr.iv) + DBG_VALUE $noreg, $noreg, !42, !DIExpression(), debug-location !70 + ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0, debug-location !72 + J2_jump %bb.2, implicit-def dead $pc, debug-location !72 + +... diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-exp.mir b/llvm/test/CodeGen/Hexagon/swp-ws-exp.mir new file mode 100644 index 0000000000000..534c25591f5bf --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-exp.mir @@ -0,0 +1,124 @@ +# REQUIRES: asserts +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -o - 2>&1 | FileCheck %s + +# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}. 
+ +--- | + define void @exp_approx_top_six(i32 %N, ptr noalias %x, ptr noalias %y) #0 { + entry: + %is_zero = icmp eq i32 %N, 0 + br i1 %is_zero, label %exit, label %loop_header + + loop_header: + %vec_one = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216) + %vec_half = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) + %vec_sixth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1042983595) + %vec_24th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1026206379) + %vec_120th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1007192201) + %vec_720th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 985008993) + br label %loop_body + + exit: + ret void + + loop_body: + %lsr.iv1 = phi ptr [ %cgep3, %loop_body ], [ %x, %loop_header ] + %lsr.iv = phi ptr [ %cgep, %loop_body ], [ %y, %loop_header ] + %index = phi i32 [ 0, %loop_header ], [ %index_next, %loop_body ] + %vec_input = load <32 x i32>, ptr %lsr.iv1, align 128 + %vec_input_pow_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input, <32 x i32> %vec_input) + %vec_input_pow_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_2, <32 x i32> %vec_input) + %vec_input_pow_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_3, <32 x i32> %vec_input) + %vec_input_pow_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_4, <32 x i32> %vec_input) + %vec_input_pow_6 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_5, <32 x i32> %vec_input) + %vec_exp_approx_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_half, <32 x i32> %vec_input_pow_2) + %vec_exp_approx_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sixth, <32 x i32> %vec_input_pow_3) + %vec_exp_approx_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_24th, <32 x 
i32> %vec_input_pow_4) + %vec_exp_approx_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_120th, <32 x i32> %vec_input_pow_5) + %vec_exp_approx_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_720th, <32 x i32> %vec_input_pow_6) + %vec_exp_sum_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_one, <32 x i32> %vec_input) + %vec_exp_sum_2 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_1, <32 x i32> %vec_exp_approx_1) + %vec_exp_sum_3 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_2, <32 x i32> %vec_exp_approx_2) + %vec_exp_sum_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_3, <32 x i32> %vec_exp_approx_3) + %vec_exp_sum_5 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_4, <32 x i32> %vec_exp_approx_4) + %vec_exp_result = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_5, <32 x i32> %vec_exp_approx_5) + store <32 x i32> %vec_exp_result, ptr %lsr.iv, align 128 + %index_next = add nuw i32 %index, 32 + %loop_cond = icmp ult i32 %index_next, %N + %cgep = getelementptr i8, ptr %lsr.iv, i32 128 + %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128 + br i1 %loop_cond, label %loop_body, label %exit + } + + declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>) + declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) + declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>) + + attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" } +... 
+--- +name: exp_approx_top_six +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2 + + %0:intregs = COPY $r2 + %1:intregs = COPY $r1 + %2:intregs = COPY $r0 + %3:predregs = C2_cmpeqi %2, 0 + J2_jumpt killed %3, %bb.2, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1.loop_header: + successors: %bb.3(0x80000000) + + %4:intregs = A2_tfrsi 1065353216 + %5:hvxvr = V6_lvsplatw killed %4 + %6:intregs = A2_tfrsi 1056964608 + %7:hvxvr = V6_lvsplatw killed %6 + %8:intregs = A2_tfrsi 1042983595 + %9:hvxvr = V6_lvsplatw killed %8 + %10:intregs = A2_tfrsi 1026206379 + %11:hvxvr = V6_lvsplatw killed %10 + %12:intregs = A2_tfrsi 1007192201 + %13:hvxvr = V6_lvsplatw killed %12 + %14:intregs = A2_tfrsi 985008993 + %15:hvxvr = V6_lvsplatw killed %14 + %16:intregs = A2_addi %2, 31 + %17:intregs = S2_lsr_i_r %16, 5 + %18:intregs = COPY %17 + J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.3, implicit-def dead $pc + + bb.2.exit: + PS_jmpret $r31, implicit-def dead $pc + + bb.3.loop_body (machine-block-address-taken): + successors: %bb.3(0x7c000000), %bb.2(0x04000000) + + %19:intregs = PHI %1, %bb.1, %20, %bb.3 + %21:intregs = PHI %0, %bb.1, %22, %bb.3 + %23:hvxvr, %20:intregs = V6_vL32b_pi %19, 128 :: (load (s1024) from %ir.lsr.iv1) + %24:hvxvr = V6_vmpyowh_rnd %23, %23 + %25:hvxvr = V6_vmpyowh_rnd %24, %23 + %26:hvxvr = V6_vmpyowh_rnd %25, %23 + %27:hvxvr = V6_vmpyowh_rnd %26, %23 + %28:hvxvr = V6_vmpyowh_rnd %27, %23 + %29:hvxvr = V6_vmpyowh_rnd %7, %24 + %30:hvxvr = V6_vmpyowh_rnd %9, %25 + %31:hvxvr = V6_vmpyowh_rnd %11, %26 + %32:hvxvr = V6_vmpyowh_rnd %13, %27 + %33:hvxvr = V6_vmpyowh_rnd %15, killed %28 + %34:hvxvr = V6_vaddw %5, %23 + %35:hvxvr = V6_vaddw killed %34, killed %29 + %36:hvxvr = V6_vaddw killed %35, killed %30 + %37:hvxvr = V6_vaddw killed %36, killed %31 + %38:hvxvr = V6_vaddw killed %37, killed %32 + %39:hvxvr = V6_vaddw killed 
%38, killed %33 + %22:intregs = V6_vS32b_pi %21, 128, killed %39 :: (store (s1024) into %ir.lsr.iv) + ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-0.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-0.mir new file mode 100644 index 0000000000000..1721419cea9db --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-0.mir @@ -0,0 +1,56 @@ +# REQUIRES: asserts +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -o - 2>&1 | FileCheck %s \ +# RUN: --check-prefix=CHECK-INITIALIZE +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -window-region-limit=1 -window-ii-limit=1 -o - \ +# RUN: 2>&1 | FileCheck %s --check-prefix=CHECK-ANALYSE-II +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -window-region-limit=1 -window-search-ratio=80 \ +# RUN: -o - 2>&1 | FileCheck %s --check-prefix=CHECK-SCHED-NOT-NEEDED + +# CHECK-INITIALIZE: There are too few MIs in the window region! +# CHECK-INITIALIZE: The WindowScheduler failed to initialize! +# CHECK-ANALYSE-II: Can't find a valid II. Keep searching... +# CHECK-ANALYSE-II: Window scheduling is not needed! +# CHECK-SCHED-NOT-NEEDED: Window scheduling is not needed! 
+ +--- +name: relu +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2 + + %0:intregs = COPY $r2 + %1:intregs = COPY $r1 + %2:intregs = COPY $r0 + %3:predregs = C2_cmpeqi %2, 0 + J2_jumpt killed %3, %bb.2, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1: + successors: %bb.3(0x80000000) + + %4:hvxvr = V6_vd0 + %5:intregs = A2_addi %2, 31 + %6:intregs = S2_lsr_i_r %5, 5 + %7:intregs = COPY %6 + J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.3, implicit-def dead $pc + + bb.2: + PS_jmpret $r31, implicit-def dead $pc + + bb.3 (machine-block-address-taken): + successors: %bb.3(0x7c000000), %bb.2(0x04000000) + + %8:intregs = PHI %1, %bb.1, %9, %bb.3 + %10:intregs = PHI %0, %bb.1, %11, %bb.3 + %12:hvxvr, %9:intregs = V6_vL32b_pi %8, 128 + %13:hvxvr = V6_vmaxw killed %12, %4 + %11:intregs = V6_vS32b_pi %10, 128, killed %13 + ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-1.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-1.mir new file mode 100644 index 0000000000000..d0521a92585b0 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-1.mir @@ -0,0 +1,47 @@ +# REQUIRES: asserts +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -o - 2>&1 | FileCheck %s \ +# RUN: --check-prefix=CHECK-SUCCESSIVE-PHI + +# CHECK-SUCCESSIVE-PHI: Consecutive phis are not allowed in window scheduling! +# CHECK-SUCCESSIVE-PHI: The WindowScheduler failed to initialize! 
+ +--- +name: relu +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2 + + %0:intregs = COPY $r2 + %1:intregs = COPY $r1 + %2:intregs = COPY $r0 + %3:predregs = C2_cmpeqi %2, 0 + J2_jumpt killed %3, %bb.2, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1: + successors: %bb.3(0x80000000) + + %4:hvxvr = V6_vd0 + %5:intregs = A2_addi %2, 31 + %6:intregs = S2_lsr_i_r %5, 5 + %7:intregs = COPY %6 + J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.3, implicit-def dead $pc + + bb.2: + PS_jmpret $r31, implicit-def dead $pc + + bb.3 (machine-block-address-taken): + successors: %bb.3(0x7c000000), %bb.2(0x04000000) + + %8:intregs = PHI %1, %bb.1, %9, %bb.3 + %10:intregs = PHI %0, %bb.1, %8, %bb.3 + %11:hvxvr, %9:intregs = V6_vL32b_pi %8, 128 + %12:hvxvr = V6_vmaxw killed %11, %4 + %13:intregs = V6_vS32b_pi %10, 128, killed %12 + ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir new file mode 100644 index 0000000000000..64229fd8d75cf --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir @@ -0,0 +1,73 @@ +# REQUIRES: asserts +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -o - 2>&1 | FileCheck %s + +# CHECK: The WindowScheduler failed to initialize! 
+ +--- +name: exp_approx_top_six +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2 + + %0:intregs = COPY $r2 + %1:intregs = COPY $r1 + %2:intregs = COPY $r0 + %3:predregs = C2_cmpeqi %2, 0 + J2_jumpt killed %3, %bb.2, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1: + successors: %bb.3(0x80000000) + + %4:intregs = A2_tfrsi 1065353216 + %5:hvxvr = V6_lvsplatw killed %4 + %6:intregs = A2_tfrsi 1056964608 + %7:hvxvr = V6_lvsplatw killed %6 + %8:intregs = A2_tfrsi 1042983595 + %9:hvxvr = V6_lvsplatw killed %8 + %10:intregs = A2_tfrsi 1026206379 + %11:hvxvr = V6_lvsplatw killed %10 + %12:intregs = A2_tfrsi 1007192201 + %13:hvxvr = V6_lvsplatw killed %12 + %14:intregs = A2_tfrsi 985008993 + %15:hvxvr = V6_lvsplatw killed %14 + %16:intregs = A2_addi %2, 31 + %17:intregs = S2_lsr_i_r %16, 5 + %18:intregs = COPY %17 + J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.3, implicit-def dead $pc + + bb.2: + PS_jmpret $r31, implicit-def dead $pc + + bb.3 (machine-block-address-taken): + successors: %bb.3(0x7c000000), %bb.2(0x04000000) + + %19:intregs = PHI %1, %bb.1, %20, %bb.3 + %21:intregs = PHI %0, %bb.1, %22, %bb.3 + %23:hvxvr, %20:intregs = V6_vL32b_pi %19, 128 + %24:hvxvr = V6_vmpyowh_rnd %23, %23 + %25:hvxvr = V6_vmpyowh_rnd %24, %23 + %26:hvxvr = V6_vmpyowh_rnd %25, %23 + %27:hvxvr = V6_vmpyowh_rnd %26, %23 + %28:hvxvr = V6_vmpyowh_rnd %27, %23 + %29:hvxvr = V6_vmpyowh_rnd %7, %24 + %30:hvxvr = V6_vmpyowh_rnd %9, %25 + %31:hvxvr = V6_vmpyowh_rnd %11, %26 + %32:hvxvr = V6_vmpyowh_rnd %13, %27 + %33:hvxvr = V6_vmpyowh_rnd %15, killed %28 + %34:hvxvr = V6_vaddw %5, %23 + %35:hvxvr = V6_vaddw killed %34, killed %29 + %36:hvxvr = V6_vaddw killed %35, killed %30 + %37:hvxvr = V6_vaddw killed %36, killed %31 + %38:hvxvr = V6_vaddw killed %37, killed %32 + %39:hvxvr = V6_vaddw killed %38, killed %33 + ; To check the implicitly defined physical
register within loop kernel. + dead %40:intregs = A2_abssat %18, implicit-def dead $usr_ovf + %22:intregs = V6_vS32b_pi %21, 128, killed %39 + ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-meta-instr.mir b/llvm/test/CodeGen/Hexagon/swp-ws-meta-instr.mir new file mode 100644 index 0000000000000..f12a72790741c --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-meta-instr.mir @@ -0,0 +1,127 @@ +# REQUIRES: asserts +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -o - 2>&1 | FileCheck %s + +# CHECK-NOT: PSEUDO_PROBE +# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}. + +--- | + define void @exp_approx_top_six(i32 %N, ptr noalias %x, ptr noalias %y) #0 { + entry: + %is_zero = icmp eq i32 %N, 0 + br i1 %is_zero, label %exit, label %loop_header + + loop_header: + %vec_one = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216) + %vec_half = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) + %vec_sixth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1042983595) + %vec_24th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1026206379) + %vec_120th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1007192201) + %vec_720th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 985008993) + br label %loop_body + + exit: + ret void + + loop_body: + %lsr.iv1 = phi ptr [ %cgep3, %loop_body ], [ %x, %loop_header ] + %lsr.iv = phi ptr [ %cgep, %loop_body ], [ %y, %loop_header ] + %index = phi i32 [ 0, %loop_header ], [ %index_next, %loop_body ] + %vec_input = load <32 x i32>, ptr %lsr.iv1, align 128 + %vec_input_pow_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input, <32 x i32> %vec_input) + %vec_input_pow_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_2, 
<32 x i32> %vec_input) + %vec_input_pow_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_3, <32 x i32> %vec_input) + %vec_input_pow_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_4, <32 x i32> %vec_input) + %vec_input_pow_6 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_5, <32 x i32> %vec_input) + %vec_exp_approx_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_half, <32 x i32> %vec_input_pow_2) + %vec_exp_approx_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sixth, <32 x i32> %vec_input_pow_3) + %vec_exp_approx_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_24th, <32 x i32> %vec_input_pow_4) + %vec_exp_approx_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_120th, <32 x i32> %vec_input_pow_5) + %vec_exp_approx_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_720th, <32 x i32> %vec_input_pow_6) + %vec_exp_sum_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_one, <32 x i32> %vec_input) + %vec_exp_sum_2 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_1, <32 x i32> %vec_exp_approx_1) + %vec_exp_sum_3 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_2, <32 x i32> %vec_exp_approx_2) + %vec_exp_sum_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_3, <32 x i32> %vec_exp_approx_3) + %vec_exp_sum_5 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_4, <32 x i32> %vec_exp_approx_4) + %vec_exp_result = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_5, <32 x i32> %vec_exp_approx_5) + store <32 x i32> %vec_exp_result, ptr %lsr.iv, align 128 + %index_next = add nuw i32 %index, 32 + %loop_cond = icmp ult i32 %index_next, %N + %cgep = getelementptr i8, ptr %lsr.iv, i32 128 + %cgep3 = 
getelementptr i8, ptr %lsr.iv1, i32 128 + br i1 %loop_cond, label %loop_body, label %exit + } + + declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>) + declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) + declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>) + + attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" } +... +--- +name: exp_approx_top_six +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2 + + %0:intregs = COPY $r2 + %1:intregs = COPY $r1 + %2:intregs = COPY $r0 + %3:predregs = C2_cmpeqi %2, 0 + J2_jumpt killed %3, %bb.2, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1.loop_header: + successors: %bb.3(0x80000000) + + %4:intregs = A2_tfrsi 1065353216 + %5:hvxvr = V6_lvsplatw killed %4 + %6:intregs = A2_tfrsi 1056964608 + %7:hvxvr = V6_lvsplatw killed %6 + %8:intregs = A2_tfrsi 1042983595 + %9:hvxvr = V6_lvsplatw killed %8 + %10:intregs = A2_tfrsi 1026206379 + %11:hvxvr = V6_lvsplatw killed %10 + %12:intregs = A2_tfrsi 1007192201 + %13:hvxvr = V6_lvsplatw killed %12 + %14:intregs = A2_tfrsi 985008993 + %15:hvxvr = V6_lvsplatw killed %14 + %16:intregs = A2_addi %2, 31 + %17:intregs = S2_lsr_i_r %16, 5 + %18:intregs = COPY %17 + J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.3, implicit-def dead $pc + + bb.2.exit: + PS_jmpret $r31, implicit-def dead $pc + + bb.3.loop_body (machine-block-address-taken): + successors: %bb.3(0x7c000000), %bb.2(0x04000000) + + %19:intregs = PHI %1, %bb.1, %20, %bb.3 + %21:intregs = PHI %0, %bb.1, %22, %bb.3 + %23:hvxvr, %20:intregs = V6_vL32b_pi %19, 128 :: (load (s1024) from %ir.lsr.iv1) + %24:hvxvr = V6_vmpyowh_rnd %23, %23 + %25:hvxvr = V6_vmpyowh_rnd %24, %23 + %26:hvxvr = V6_vmpyowh_rnd %25, %23 + %27:hvxvr = V6_vmpyowh_rnd %26, %23 + %28:hvxvr = V6_vmpyowh_rnd %27, %23 + %29:hvxvr = V6_vmpyowh_rnd %7, %24 + 
%30:hvxvr = V6_vmpyowh_rnd %9, %25 + %31:hvxvr = V6_vmpyowh_rnd %11, %26 + %32:hvxvr = V6_vmpyowh_rnd %13, %27 + ; To check the meta MI within loop kernel. + PSEUDO_PROBE 128, 1, 0, 0 + %33:hvxvr = V6_vmpyowh_rnd %15, killed %28 + %34:hvxvr = V6_vaddw %5, %23 + %35:hvxvr = V6_vaddw killed %34, killed %29 + %36:hvxvr = V6_vaddw killed %35, killed %30 + %37:hvxvr = V6_vaddw killed %36, killed %31 + %38:hvxvr = V6_vaddw killed %37, killed %32 + %39:hvxvr = V6_vaddw killed %38, killed %33 + %22:intregs = V6_vS32b_pi %21, 128, killed %39 :: (store (s1024) into %ir.lsr.iv) + ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-sqrt.mir b/llvm/test/CodeGen/Hexagon/swp-ws-sqrt.mir new file mode 100644 index 0000000000000..1e764d5fa48b4 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-ws-sqrt.mir @@ -0,0 +1,124 @@ +# REQUIRES: asserts +# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \ +# RUN: -window-sched=force -o - 2>&1 | FileCheck %s + +# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}. 
+ +--- | + define void @sqrt_approx(i32 noundef %N, ptr noalias %x, ptr noalias %y) #0 { + entry: + %isZeroLength = icmp eq i32 %N, 0 + br i1 %isZeroLength, label %loop.exit, label %loop.preheader + + loop.preheader: ; preds = %entry + %half_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) + %one_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216) + %two_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1073741824) + br label %loop.body + + loop.exit: ; preds = %loop.body, %entry + ret void + + loop.body: ; preds = %loop.body, %loop.preheader + %lsr.iv1 = phi ptr [ %cgep3, %loop.body ], [ %x, %loop.preheader ] + %lsr.iv = phi ptr [ %cgep, %loop.body ], [ %y, %loop.preheader ] + %index = phi i32 [ 0, %loop.preheader ], [ %index.next, %loop.body ] + %vec_x = load <32 x i32>, ptr %lsr.iv1, align 128 + %vec_sqrt_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %one_splat, <32 x i32> %vec_x) + %vec_sqrt_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_1, <32 x i32> %half_splat) + %vec_recip_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %half_splat) + %vec_recip_2 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_1) + %vec_y1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %vec_recip_2) + %vec_recip_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %vec_y1) + %vec_recop_4 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_3) + %vec_y2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_y1, <32 x i32> %vec_recop_4) + %vec_sqrt_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_x, <32 x i32> %vec_y2) + %vec_sqrt_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_y2, <32 x i32> 
%vec_sqrt_3) + %vec_sqrt_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_4, <32 x i32> %half_splat) + %vec_recip_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %half_splat) + %vec_recip_6 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_5) + %vec_y3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %vec_recip_6) + %vec_recip_7 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %vec_y3) + %vec_recop_8 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_7) + %vec_y4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_y3, <32 x i32> %vec_recop_8) + %vec_sqrt_7 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_x, <32 x i32> %vec_y4) + %vec_sqrt_8 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_y4, <32 x i32> %vec_sqrt_7) + %vec_sqrt_9 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_8, <32 x i32> %half_splat) + store <32 x i32> %vec_sqrt_9, ptr %lsr.iv, align 128 + %index.next = add nuw i32 %index, 32 + %continue = icmp ult i32 %index.next, %N + %cgep = getelementptr i8, ptr %lsr.iv, i32 128 + %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128 + br i1 %continue, label %loop.body, label %loop.exit + } + + declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) + declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>) + declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>) + declare <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32>, <32 x i32>) + + attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" } +... 
+--- +name: sqrt_approx +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2 + + %0:intregs = COPY $r2 + %1:intregs = COPY $r1 + %2:intregs = COPY $r0 + %3:predregs = C2_cmpeqi %2, 0 + J2_jumpt killed %3, %bb.2, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1.loop.preheader: + successors: %bb.3(0x80000000) + + %4:intregs = A2_tfrsi 1056964608 + %5:hvxvr = V6_lvsplatw killed %4 + %6:intregs = A2_tfrsi 1065353216 + %7:hvxvr = V6_lvsplatw killed %6 + %8:intregs = A2_tfrsi 1073741824 + %9:hvxvr = V6_lvsplatw killed %8 + %10:intregs = A2_addi %2, 31 + %11:intregs = S2_lsr_i_r %10, 5 + %12:intregs = COPY %11 + J2_loop0r %bb.3, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.3, implicit-def dead $pc + + bb.2.loop.exit: + PS_jmpret $r31, implicit-def dead $pc + + bb.3.loop.body (machine-block-address-taken): + successors: %bb.3(0x7c000000), %bb.2(0x04000000) + + %13:intregs = PHI %1, %bb.1, %14, %bb.3 + %15:intregs = PHI %0, %bb.1, %16, %bb.3 + %17:hvxvr, %14:intregs = V6_vL32b_pi %13, 128 :: (load (s1024) from %ir.lsr.iv1) + %18:hvxvr = V6_vaddw %7, %17 + %19:hvxvr = V6_vmpyowh_rnd killed %18, %5 + %20:hvxvr = V6_vmpyowh_rnd %19, %5 + %21:hvxvr = V6_vsubw %9, killed %20 + %22:hvxvr = V6_vmpyowh_rnd %19, killed %21 + %23:hvxvr = V6_vmpyowh_rnd %19, %22 + %24:hvxvr = V6_vsubw %9, killed %23 + %25:hvxvr = V6_vmpyowh_rnd %22, killed %24 + %26:hvxvr = V6_vmpyowh_rnd %17, %25 + %27:hvxvr = V6_vaddw %25, killed %26 + %28:hvxvr = V6_vmpyowh_rnd killed %27, %5 + %29:hvxvr = V6_vmpyowh_rnd %28, %5 + %30:hvxvr = V6_vsubw %9, killed %29 + %31:hvxvr = V6_vmpyowh_rnd %28, killed %30 + %32:hvxvr = V6_vmpyowh_rnd %28, %31 + %33:hvxvr = V6_vsubw %9, killed %32 + %34:hvxvr = V6_vmpyowh_rnd %31, killed %33 + %35:hvxvr = V6_vmpyowh_rnd %17, %34 + %36:hvxvr = V6_vaddw %34, killed %35 + %37:hvxvr = V6_vmpyowh_rnd killed %36, %5 + %16:intregs = V6_vS32b_pi %15, 128, killed %37 
:: (store (s1024) into %ir.lsr.iv) + ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc +...