diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 8f0a17cf99967..94913f534fb77 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -44,10 +44,12 @@
 #include "llvm/CodeGen/DFAPacketizer.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/WindowScheduler.h"
 #include "llvm/InitializePasses.h"
 
 #include <deque>
@@ -107,6 +109,9 @@ class MachinePipeliner : public MachineFunctionPass {
   bool scheduleLoop(MachineLoop &L);
   bool swingModuloScheduler(MachineLoop &L);
   void setPragmaPipelineOptions(MachineLoop &L);
+  bool runWindowScheduler(MachineLoop &L);
+  bool useSwingModuloScheduler();
+  bool useWindowScheduler(bool Changed);
 };
 
 /// This class builds the dependence graph for the instructions in a loop,
@@ -449,7 +454,7 @@ class ResourceManager {
   const MCSchedModel &SM;
   const TargetSubtargetInfo *ST;
   const TargetInstrInfo *TII;
-  SwingSchedulerDAG *DAG;
+  ScheduleDAGInstrs *DAG;
   const bool UseDFA;
   /// DFA resources for each slot
   llvm::SmallVector<std::unique_ptr<DFAPacketizer>> DFAResources;
@@ -493,7 +498,7 @@ class ResourceManager {
 #endif
 
 public:
-  ResourceManager(const TargetSubtargetInfo *ST, SwingSchedulerDAG *DAG)
+  ResourceManager(const TargetSubtargetInfo *ST, ScheduleDAGInstrs *DAG)
       : STI(ST), SM(ST->getSchedModel()), ST(ST), TII(ST->getInstrInfo()),
         DAG(DAG), UseDFA(ST->useDFAforSMS()),
         ProcResourceMasks(SM.getNumProcResourceKinds(), 0),
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 7f8ed5c501989..557249f9dbde1 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -199,6 +199,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
   /// True if the subtarget should run MachinePipeliner
   virtual bool enableMachinePipeliner() const { return true; };
 
+  /// True if the subtarget should run WindowScheduler.
+  virtual bool enableWindowScheduler() const { return true; }
+
   /// True if the subtarget should enable joining global copies.
   ///
   /// By default this is enabled if the machine scheduler is enabled, but
diff --git a/llvm/include/llvm/CodeGen/WindowScheduler.h b/llvm/include/llvm/CodeGen/WindowScheduler.h
new file mode 100644
index 0000000000000..476d5ada27876
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/WindowScheduler.h
@@ -0,0 +1,171 @@
+//======----------- WindowScheduler.h - window scheduler ---------------======//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An implementation of the Window Scheduling software pipelining algorithm.
+//
+// The concept of the window algorithm was first unveiled in Steven Muchnick's
+// book, "Advanced Compiler Design And Implementation", and later elaborated
+// upon in Venkatraman Govindaraju's report, "Implementation of Software
+// Pipelining Using Window Scheduling".
+//
+// The window algorithm can be perceived as a modulo scheduling algorithm with a
+// stage count of 2. It boasts a higher scheduling success rate in targets with
+// severe resource conflicts when compared to the classic Swing Modulo
+// Scheduling (SMS) algorithm. To align with the LLVM scheduling framework, we
+// have enhanced the original window algorithm. The primary steps are as
+// follows:
+//
+// 1. Instead of duplicating the original MBB twice as mentioned in the
+// literature, we copy it three times, generating TripleMBB and the
+// corresponding TripleDAG.
+//
+// 2. We establish a scheduling window on TripleMBB and execute list scheduling
+// within it.
+//
+// 3. After multiple list scheduling, we select the best outcome and expand it
+// into the final scheduling result.
+//
+// To cater to the needs of various targets, we have developed the window
+// scheduler in a form that is easily derivable. We recommend employing this
+// algorithm in targets with severe resource conflicts, and it can be utilized
+// either before or after the Register Allocator (RA).
+//
+// The default implementation provided here is before RA. If it is to be used
+// after RA, certain critical algorithm functions will need to be derived.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CODEGEN_WINDOWSCHEDULER_H
+#define LLVM_CODEGEN_WINDOWSCHEDULER_H
+
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+namespace llvm {
+
+enum WindowSchedulingFlag {
+  WS_Off,  ///< Turn off window algorithm.
+  WS_On,   ///< Use window algorithm after SMS algorithm fails.
+  WS_Force ///< Use window algorithm instead of SMS algorithm.
+};
+
+/// The main class in the implementation of the target independent window
+/// scheduler.
+class WindowScheduler {
+protected:
+  MachineSchedContext *Context = nullptr;
+  MachineFunction *MF = nullptr;
+  MachineBasicBlock *MBB = nullptr;
+  MachineLoop &Loop;
+  const TargetSubtargetInfo *Subtarget = nullptr;
+  const TargetInstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+
+  /// To identify the dependencies between MIs across two trips, we construct
+  /// a DAG for a new MBB, which is created by copying the original MBB three
+  /// times. We refer to this new MBB as 'TripleMBB' and the corresponding DAG
+  /// as 'TripleDAG'.
+  /// If the dependencies span more than two trips, we avoid applying window
+  /// algorithm by rejecting successive phis in the old MBB.
+  std::unique_ptr<ScheduleDAGInstrs> TripleDAG;
+  /// OriMIs keeps the MIs removed from the original MBB.
+  SmallVector<MachineInstr *> OriMIs;
+  /// TriMIs keeps the MIs of TripleMBB, which is used to restore TripleMBB.
+  SmallVector<MachineInstr *> TriMIs;
+  /// TriToOri keeps the mappings between the MI clones in TripleMBB and their
+  /// original MI.
+  DenseMap<MachineInstr *, MachineInstr *> TriToOri;
+  /// OriToCycle keeps the mappings between the original MI and its issue cycle.
+  DenseMap<MachineInstr *, int> OriToCycle;
+  /// SchedResult keeps the result of each list scheduling, and the format of
+  /// the tuple is <MI pointer, Cycle, Stage, Order ID>.
+  SmallVector<std::tuple<MachineInstr *, int, int, int>, 256> SchedResult;
+  /// SchedPhiNum records the number of phis in the original MBB, and the
+  /// scheduling starts with the first MI after the phis.
+  unsigned SchedPhiNum = 0;
+  /// SchedInstrNum records the number of MIs involved in scheduling in the
+  /// original MBB, excluding phis, terminators and meta instructions.
+  unsigned SchedInstrNum = 0;
+  /// BestII and BestOffset record the characteristics of the best scheduling
+  /// result and are used together with SchedResult as the final window
+  /// scheduling result.
+  unsigned BestII = UINT_MAX;
+  unsigned BestOffset = 0;
+  /// BaseII is the II obtained when the window offset is SchedPhiNum. This
+  /// offset is the initial position of the sliding window.
+  unsigned BaseII = 0;
+
+public:
+  WindowScheduler(MachineSchedContext *C, MachineLoop &ML);
+  virtual ~WindowScheduler() {}
+
+  bool run();
+
+protected:
+  /// Two types of ScheduleDAGs are needed, one for creating dependency graphs
+  /// only, and the other for list scheduling as determined by the target.
+  virtual ScheduleDAGInstrs *
+  createMachineScheduler(bool OnlyBuildGraph = false);
+  /// Initializes the algorithm and determines if it can be executed.
+  virtual bool initialize();
+  /// Add some related processing before running window scheduling.
+  virtual void preProcess();
+  /// Add some related processing after running window scheduling.
+  virtual void postProcess();
+  /// Back up the MIs in the original MBB and remove them from MBB.
+  void backupMBB();
+  /// Erase the MIs in current MBB and restore the original MIs.
+  void restoreMBB();
+  /// Make three copies of the original MBB to generate a new TripleMBB.
+  virtual void generateTripleMBB();
+  /// Restore the order of MIs in TripleMBB after each list scheduling.
+  virtual void restoreTripleMBB();
+  /// Give the folding position in the window algorithm, where different
+  /// heuristics can be used. It determines the performance and compilation time
+  /// of the algorithm.
+  virtual SmallVector<unsigned> getSearchIndexes(unsigned SearchNum,
+                                                 unsigned SearchRatio);
+  /// Calculate the maximum issue cycle of the MIs after list scheduling.
+  virtual int calculateMaxCycle(ScheduleDAGInstrs &DAG, unsigned Offset);
+  /// Calculate the stall cycle between two trips after list scheduling.
+  virtual int calculateStallCycle(unsigned Offset, int MaxCycle);
+  /// Analyzes the II value after each list scheduling.
+  virtual unsigned analyseII(ScheduleDAGInstrs &DAG, unsigned Offset);
+  /// Phis are scheduled separately after each list scheduling.
+  virtual void schedulePhi(int Offset, unsigned &II);
+  /// Get the final issue order of all scheduled MIs including phis.
+  DenseMap<MachineInstr *, int> getIssueOrder(unsigned Offset, unsigned II);
+  /// Update the scheduling result after each list scheduling.
+  virtual void updateScheduleResult(unsigned Offset, unsigned II);
+  /// The final result is valid only if the best offset is not the initial one.
+  virtual bool isScheduleValid() { return BestOffset != SchedPhiNum; }
+  /// Using the scheduling infrastructure to expand the results of window
+  /// scheduling. It is usually necessary to add prologue and epilogue MBBs.
+  virtual void expand();
+  /// Update the live intervals for all registers used within MBB.
+  virtual void updateLiveIntervals();
+  /// Estimate a II value at which all MIs will be scheduled successfully.
+  int getEstimatedII(ScheduleDAGInstrs &DAG);
+  /// Get the iterator range of MIs in the scheduling window.
+  iterator_range<MachineBasicBlock::iterator> getScheduleRange(unsigned Offset,
+                                                               unsigned Num);
+  /// Get the issue cycle of the new MI based on the cycle of the original MI.
+  int getOriCycle(MachineInstr *NewMI);
+  /// Get the original MI from which the new MI is cloned.
+  MachineInstr *getOriMI(MachineInstr *NewMI);
+  /// Get the scheduling stage, where the stage of the new MI is identical to
+  /// the original MI.
+  unsigned getOriStage(MachineInstr *OriMI, unsigned Offset);
+  /// Get the register in phi which is generated from the current MBB.
+  Register getAntiRegister(MachineInstr *Phi);
+};
+} // namespace llvm
+#endif
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 2c24de60edd43..d8780718669d0 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -244,6 +244,7 @@ add_llvm_component_library(LLVMCodeGen
   VLIWMachineScheduler.cpp
   VirtRegMap.cpp
   WasmEHPrepare.cpp
+  WindowScheduler.cpp
   WinEHPrepare.cpp
   XRayInstrumentation.cpp
   ${GeneratedMLSources}
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index d8cb681688339..7e88f68fa3fa3 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -68,6 +68,7 @@
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Config/llvm-config.h"
@@ -206,6 +207,17 @@ cl::opt<int> SwpForceIssueWidth(
     cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
     cl::init(-1));
 
+/// A command line argument to set the window scheduling option.
+cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
+    "window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On),
+    cl::desc("Set how to use window scheduling algorithm."),
+    cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off",
+                          "Turn off window algorithm."),
+               clEnumValN(WindowSchedulingFlag::WS_On, "on",
+                          "Use window algorithm after SMS algorithm fails."),
+               clEnumValN(WindowSchedulingFlag::WS_Force, "force",
+                          "Use window algorithm instead of SMS algorithm.")));
+
 } // end namespace llvm
 
 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
@@ -292,8 +304,11 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
   }
 
   ++NumTrytoPipeline;
+  if (useSwingModuloScheduler())
+    Changed = swingModuloScheduler(L);
 
-  Changed = swingModuloScheduler(L);
+  if (useWindowScheduler(Changed))
+    Changed = runWindowScheduler(L);
 
   LI.LoopPipelinerInfo.reset();
   return Changed;
@@ -484,9 +499,35 @@ void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineDominatorTree>();
   AU.addRequired<LiveIntervals>();
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+  AU.addRequired<TargetPassConfig>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
+bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
+  // Hand the analyses owned by this pass to the scheduler through a context.
+  MachineSchedContext SchedCtx;
+  SchedCtx.MF = MF;
+  SchedCtx.MLI = MLI;
+  SchedCtx.MDT = MDT;
+  SchedCtx.PassConfig = &getAnalysis<TargetPassConfig>();
+  SchedCtx.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  SchedCtx.LIS = &getAnalysis<LiveIntervals>();
+  SchedCtx.RegClassInfo->runOnMachineFunction(*MF);
+  return WindowScheduler(&SchedCtx, L).run();
+}
+
+bool MachinePipeliner::useSwingModuloScheduler() {
+  // SwingModuloScheduler is skipped only when WindowScheduler is forced.
+  return WindowSchedulingOption != WindowSchedulingFlag::WS_Force;
+}
+
+bool MachinePipeliner::useWindowScheduler(bool Changed) {
+  // "force" always runs it; "on" runs it only if SMS left the loop unchanged.
+  if (WindowSchedulingOption == WindowSchedulingFlag::WS_Force)
+    return true;
+  return WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed;
+}
+
 void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
   if (SwpForceII > 0)
     MII = SwpForceII;
diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
new file mode 100644
index 0000000000000..7a7e4ca979925
--- /dev/null
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -0,0 +1,688 @@
+//======----------- WindowScheduler.cpp - window scheduler -------------======//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An implementation of the Window Scheduling software pipelining algorithm.
+//
+// The fundamental concept of the window scheduling algorithm involves folding
+// the original MBB at a specific position, followed by list scheduling on the
+// folded MIs. The optimal scheduling result is then chosen from various folding
+// positions as the final scheduling outcome.
+//
+// The primary challenge in this algorithm lies in generating the folded MIs and
+// establishing their dependencies. We have innovatively employed a new MBB,
+// created by copying the original MBB three times, known as TripleMBB. This
+// TripleMBB enables the convenient implementation of MI folding and dependency
+// establishment. To facilitate the algorithm's implementation, we have also
+// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
+//
+// Another challenge in the algorithm is the scheduling of phis. Semantically,
+// it is difficult to place the phis in the window and perform list scheduling.
+// Therefore, we schedule these phis separately after each list scheduling.
+//
+// The provided implementation is designed for use before the Register Allocator
+// (RA). If the target requires implementation after RA, it is recommended to
+// reimplement analyseII(), schedulePhi(), and expand(). Additionally,
+// target-specific logic can be added in initialize(), preProcess(), and
+// postProcess().
+//
+// Lastly, it is worth mentioning that getSearchIndexes() is an important
+// function. We have experimented with more complex heuristics on downstream
+// target and achieved favorable results.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/WindowScheduler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePipeliner.h"
+#include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TimeProfiler.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pipeliner"
+
+namespace {
+STATISTIC(NumTryWindowSchedule,
+          "Number of loops that we attempt to use window scheduling");
+STATISTIC(NumTryWindowSearch,
+          "Number of times that we run list schedule in the window scheduling");
+STATISTIC(NumWindowSchedule,
+          "Number of loops that we successfully use window scheduling");
+STATISTIC(NumFailAnalyseII,
+          "Window scheduling abort due to the failure of the II analysis");
+
+cl::opt<unsigned>
+    WindowSearchNum("window-search-num",
+                    cl::desc("The number of searches per loop in the window "
+                             "algorithm. 0 means no search number limit."),
+                    cl::Hidden, cl::init(6));
+
+cl::opt<unsigned> WindowSearchRatio(
+    "window-search-ratio",
+    cl::desc("The ratio of searches per loop in the window algorithm. 100 "
+             "means search all positions in the loop, while 0 means not "
+             "performing any search."),
+    cl::Hidden, cl::init(40));
+
+cl::opt<unsigned> WindowIICoeff(
+    "window-ii-coeff",
+    cl::desc(
+        "The coefficient used when initializing II in the window algorithm."),
+    cl::Hidden, cl::init(5));
+
+cl::opt<unsigned> WindowRegionLimit(
+    "window-region-limit",
+    cl::desc(
+        "The lower limit of the scheduling region in the window algorithm."),
+    cl::Hidden, cl::init(3));
+
+cl::opt<unsigned> WindowDiffLimit(
+    "window-diff-limit",
+    cl::desc("The lower limit of the difference between best II and base II in "
+             "the window algorithm. If the difference is smaller than "
+             "this lower limit, window scheduling will not be performed."),
+    cl::Hidden, cl::init(2));
+} // namespace
+
+// WindowIILimit serves as an indicator of abnormal scheduling results and could
+// potentially be referenced by the derived target window scheduler.
+cl::opt<unsigned>
+    WindowIILimit("window-ii-limit",
+                  cl::desc("The upper limit of II in the window algorithm."),
+                  cl::Hidden, cl::init(1000));
+
+WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
+    : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
+      Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
+      TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
+  // Virtual dispatch is inactive in a constructor: this is the base version.
+  TripleDAG.reset(createMachineScheduler(/*OnlyBuildGraph=*/true));
+}
+
+bool WindowScheduler::run() {
+  if (!initialize()) {
+    LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
+    return false;
+  }
+  // The window algorithm is time-consuming, and its compilation time should be
+  // taken into consideration.
+  TimeTraceScope Scope("WindowSearch");
+  ++NumTryWindowSchedule;
+  // Pre-processing; the default implementation replaces MBB with TripleMBB.
+  preProcess();
+  // The main window scheduling begins.
+  std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
+  auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio);
+  for (unsigned Idx : SearchIndexes) {
+    OriToCycle.clear();
+    ++NumTryWindowSearch;
+    // The scheduling starts with non-phi instruction, so SchedPhiNum needs to
+    // be added to Idx.
+    unsigned Offset = Idx + SchedPhiNum;
+    auto Range = getScheduleRange(Offset, SchedInstrNum);
+    SchedDAG->startBlock(MBB);
+    SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum);
+    SchedDAG->schedule();
+    LLVM_DEBUG(SchedDAG->dump());
+    unsigned II = analyseII(*SchedDAG, Offset);
+    if (II == WindowIILimit) { // Sentinel: no valid II was found.
+      restoreTripleMBB();
+      LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
+      ++NumFailAnalyseII;
+      continue;
+    }
+    schedulePhi(Offset, II);
+    updateScheduleResult(Offset, II);
+    restoreTripleMBB();
+    LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
+                      << II << ".\n");
+  }
+  // Post-processing; the default implementation restores the original MBB.
+  postProcess();
+  // Check whether the scheduling result is valid.
+  if (!isScheduleValid()) {
+    LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
+    return false;
+  }
+  LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
+                    << " and Best II is " << BestII << ".\n");
+  // Expand the scheduling result to prologue, kernel, and epilogue.
+  expand();
+  ++NumWindowSchedule;
+  return true;
+}
+
+ScheduleDAGInstrs *
+WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
+  if (OnlyBuildGraph) // Graph-only DAG used to build dependencies.
+    return new ScheduleDAGMI(
+        Context, std::make_unique<PostGenericScheduler>(Context), true);
+  auto *DAG = Context->PassConfig->createMachineScheduler(Context);
+  return DAG ? DAG : createGenericSchedLive(Context); // Hook may return null.
+}
+
+bool WindowScheduler::initialize() {
+  if (!Subtarget->enableWindowScheduler()) {
+    LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
+    return false;
+  }
+  // Initialize the member variables used by the window algorithm.
+  OriMIs.clear();
+  TriMIs.clear();
+  TriToOri.clear();
+  OriToCycle.clear();
+  SchedResult.clear();
+  SchedPhiNum = 0;
+  SchedInstrNum = 0;
+  BestII = UINT_MAX;
+  BestOffset = 0;
+  BaseII = 0;
+  // List scheduling used in the window algorithm depends on LiveIntervals.
+  if (!Context->LIS) {
+    LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
+    return false;
+  }
+  // Check each MI in MBB; any unsupported MI rejects the whole loop.
+  SmallVector<Register, 8> PhiDefs;
+  auto PLI = TII->analyzeLoopForPipelining(MBB);
+  for (auto &MI : *MBB) {
+    if (MI.isMetaInstruction() || MI.isTerminator())
+      continue;
+    if (MI.isPHI()) {
+      for (Register Def : PhiDefs)
+        if (MI.readsRegister(Def, TRI)) {
+          LLVM_DEBUG(dbgs() << "Consecutive phis are not allowed in window "
+                               "scheduling!\n");
+          return false;
+        }
+      PhiDefs.push_back(MI.getOperand(0).getReg()); // A phi has a single def.
+      ++SchedPhiNum;
+      ++BestOffset;
+    } else
+      ++SchedInstrNum;
+    if (TII->isSchedulingBoundary(MI, MBB, *MF)) {
+      LLVM_DEBUG(
+          dbgs() << "Boundary MI is not allowed in window scheduling!\n");
+      return false;
+    }
+    if (PLI->shouldIgnoreForPipelining(&MI)) {
+      LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
+                           "window scheduling!\n");
+      return false;
+    }
+    for (auto &Def : MI.all_defs())
+      if (Def.isReg() && Def.getReg().isPhysical()) {
+        LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
+                             "window scheduling!\n");
+        return false;
+      }
+  }
+  if (SchedInstrNum <= WindowRegionLimit) {
+    LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
+    return false;
+  }
+  return true;
+}
+
+void WindowScheduler::preProcess() {
+  // Back up the original MBB, build TripleMBB in place, then its TripleDAG.
+  backupMBB();
+  generateTripleMBB();
+  TripleDAG->startBlock(MBB);
+  const auto RegionBegin = MBB->begin();
+  const auto RegionEnd = MBB->getFirstTerminator();
+  TripleDAG->enterRegion(MBB, RegionBegin, RegionEnd,
+                         std::distance(RegionBegin, RegionEnd));
+  TripleDAG->buildSchedGraph(Context->AA);
+}
+
+void WindowScheduler::postProcess() {
+  // After window scheduling, leave the TripleDAG region and block, then
+  // restore the original MBB.
+  TripleDAG->exitRegion();
+  TripleDAG->finishBlock();
+  restoreMBB();
+}
+
+void WindowScheduler::backupMBB() {
+  for (auto &MI : MBB->instrs())
+    OriMIs.push_back(&MI);
+  // Unlink the MIs (kept in OriMIs) and drop them from the slot index maps.
+  for (auto &MI : make_early_inc_range(*MBB)) {
+    Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
+    MBB->remove(&MI);
+  }
+}
+
+void WindowScheduler::restoreMBB() {
+  // Erase the current MIs and drop them from the slot index maps.
+  for (auto &MI : make_early_inc_range(*MBB)) {
+    Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
+    MI.eraseFromParent();
+  }
+  // Re-insert the backed-up MIs in their original order.
+  for (auto *MI : OriMIs)
+    MBB->push_back(MI);
+  updateLiveIntervals();
+}
+
+void WindowScheduler::generateTripleMBB() {
+  const unsigned DuplicateNum = 3;
+  TriMIs.clear();
+  TriToOri.clear();
+  assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
+  // Step 1: Performing the first copy of MBB instructions, excluding
+  // terminators. At the same time, we back up the anti-register of phis.
+  // DefPairs hold the old and new define register pairs.
+  DenseMap<Register, Register> DefPairs;
+  for (auto *MI : OriMIs) {
+    if (MI->isMetaInstruction() || MI->isTerminator())
+      continue;
+    if (MI->isPHI())
+      if (Register AntiReg = getAntiRegister(MI))
+        DefPairs[MI->getOperand(0).getReg()] = AntiReg;
+    auto *NewMI = MF->CloneMachineInstr(MI);
+    MBB->push_back(NewMI);
+    TriMIs.push_back(NewMI);
+    TriToOri[NewMI] = MI;
+  }
+  // Step 2: Performing the remaining two copies of MBB instructions excluding
+  // phis, and the last one contains terminators. At the same time, registers
+  // are updated accordingly.
+  for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
+    for (auto *MI : OriMIs) {
+      if (MI->isPHI() || MI->isMetaInstruction() ||
+          (MI->isTerminator() && Cnt < DuplicateNum - 1))
+        continue;
+      auto *NewMI = MF->CloneMachineInstr(MI);
+      DenseMap<Register, Register> NewDefs;
+      // New defines are updated.
+      for (auto MO : NewMI->all_defs()) // Copy: keeps the old register.
+        if (MO.isReg() && MO.getReg().isVirtual()) {
+          Register NewDef =
+              MRI->createVirtualRegister(MRI->getRegClass(MO.getReg()));
+          NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI);
+          NewDefs[MO.getReg()] = NewDef;
+        }
+      // New uses are updated.
+      for (auto DefRegPair : DefPairs)
+        if (NewMI->readsRegister(DefRegPair.first, TRI)) {
+          Register NewUse = DefRegPair.second;
+          // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
+          //
+          // BB.3:                                  DefPairs
+          // ==================================
+          // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]  (%1,%7)
+          // ...
+          // ==================================
+          // ...
+          // %4 = sub i32 %1, %3
+          // ...
+          // %7 = add i32 %5, %6
+          // ...
+          // ----------------------------------
+          // ...
+          // %8 = sub i32 %7, %3                    (%1,%7),(%4,%8)
+          // ...
+          // %9 = add i32 %5, %6                    (%1,%7),(%4,%8),(%7,%9)
+          // ...
+          // ----------------------------------
+          // ...
+          // %10 = sub i32 %9, %3                   (%1,%7),(%4,%10),(%7,%9)
+          // ...            ^
+          // %11 = add i32 %5, %6                   (%1,%7),(%4,%10),(%7,%11)
+          // ...
+          // ==================================
+          //          < Terminators >
+          // ==================================
+          if (DefPairs.count(NewUse))
+            NewUse = DefPairs[NewUse];
+          NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI);
+        }
+      // DefPairs is updated last, so the uses above saw the previous mapping.
+      for (auto &NewDef : NewDefs)
+        DefPairs[NewDef.first] = NewDef.second;
+      MBB->push_back(NewMI);
+      TriMIs.push_back(NewMI);
+      TriToOri[NewMI] = MI;
+    }
+  }
+  // Step 3: The registers used by phis are updated, and they are generated in
+  // the third copy of MBB.
+  // In the previous example, the old phi is:
+  // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
+  // The new phi is:
+  // %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
+  for (auto &Phi : MBB->phis()) {
+    for (auto DefRegPair : DefPairs)
+      if (Phi.readsRegister(DefRegPair.first, TRI))
+        Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI);
+  }
+  updateLiveIntervals();
+}
+
+void WindowScheduler::restoreTripleMBB() {
+  // Restore the TriMIs order in one linear pass. NextPos is the slot the next
+  // MI must occupy; splicing before it leaves it on the following slot.
+  auto NextPos = MBB->begin();
+  for (auto *MI : TriMIs) {
+    auto CurPos = MI->getIterator();
+    if (CurPos != NextPos) {
+      MBB->splice(NextPos, MBB, CurPos);
+      Context->LIS->handleMove(*MI, /*UpdateFlags=*/false);
+    } else
+      ++NextPos;
+  }
+}
+
+SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
+                                                        unsigned SearchRatio) {
+  // SearchRatio bounds the index range; SearchNum spreads the candidates evenly
+  // over it. A simple heuristic that targets may override with a smarter one.
+  assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!");
+  const unsigned MaxIdx = SchedInstrNum * SearchRatio / 100;
+  unsigned Stride = 1;
+  if (SearchNum > 0 && SearchNum <= MaxIdx)
+    Stride = MaxIdx / SearchNum;
+  SmallVector<unsigned> Result;
+  for (unsigned Idx = 0; Idx < MaxIdx; Idx += Stride)
+    Result.push_back(Idx);
+  return Result;
+}
+
+int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
+  // The deepest chain may have length 0, so clamp the depth to at least 1.
+  unsigned Depth = 1;
+  for (SUnit &Unit : DAG.SUnits)
+    Depth = std::max(Depth, Unit.getDepth() + Unit.Latency);
+  return Depth * WindowIICoeff;
+}
+
+int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
+                                       unsigned Offset) {
+  int InitII = getEstimatedII(DAG);
+  ResourceManager RM(Subtarget, &DAG);
+  RM.init(InitII);
+  // ResourceManager and DAG are used to calculate the maximum cycle for the
+  // scheduled MIs. Since MIs in the Region have already been scheduled, the
+  // emit cycles can be estimated in order here.
+  int CurCycle = 0;
+  auto Range = getScheduleRange(Offset, SchedInstrNum);
+  for (auto &MI : Range) {
+    auto *SU = DAG.getSUnit(&MI);
+    int ExpectCycle = CurCycle;
+    // The predecessors of current MI determine its earliest issue cycle.
+    for (auto &Pred : SU->Preds) {
+      auto *PredMI = Pred.getSUnit()->getInstr();
+      int PredCycle = getOriCycle(PredMI);
+      ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
+    }
+    // Delay the MI until its predecessors allow it and ResourceManager reports
+    // no conflict with the MIs already placed; give up at WindowIILimit.
+    while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
+      ++CurCycle;
+      if (CurCycle == (int)WindowIILimit)
+        return CurCycle;
+    }
+    RM.reserveResources(*SU, CurCycle);
+    OriToCycle[getOriMI(&MI)] = CurCycle;
+    LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
+                      << getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
+  }
+  LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
+  return CurCycle;
+}
+
+// By utilizing TripleDAG, we can easily establish dependencies between A and B.
+// Based on MaxCycle and the issue cycles of A and B, we can determine whether
+// stall cycles must be inserted: without them, the latency constraint between
+// A and the B of the next trip (B' below) cannot be satisfied. The details are
+// as follows:
+//
+// New MBB:
+// ========================================
+//                 < Phis >
+// ========================================     (sliding direction)
+// MBB copy 1                                            |
+//                                                       V
+//
+// ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~  ----schedule window-----
+//                    |                                  |
+// ===================V====================              |
+// MBB copy 2      < MI B >                              |
+//                                                       |
+//                 < MI A >                              V
+// ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~  ------------------------
+//                    :
+// ===================V====================
+// MBB copy 3      < MI B'>
+//
+//
+//
+//
+// ========================================
+//              < Terminators >
+// ========================================
+int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
+  int MaxStallCycle = 0; // Largest stall required by any dependence so far.
+  auto Range = getScheduleRange(Offset, SchedInstrNum);
+  for (auto &MI : Range) {
+    auto *SU = TripleDAG->getSUnit(&MI);
+    int DefCycle = getOriCycle(&MI); // Issue cycle of MI A.
+    for (auto &Succ : SU->Succs) {
+      if (Succ.getSUnit() == &TripleDAG->ExitSU)
+        continue;
+      // If the expected cycle does not exceed MaxCycle, no check is needed.
+      if (DefCycle + (int)Succ.getLatency() <= MaxCycle)
+        continue;
+      // If the cycle of the scheduled MI A is less than that of the scheduled
+      // MI B, the scheduling will fail because the lifetime of the
+      // corresponding register exceeds II.
+      auto *SuccMI = Succ.getSUnit()->getInstr();
+      int UseCycle = getOriCycle(SuccMI); // Issue cycle of MI B.
+      if (DefCycle < UseCycle)
+        return WindowIILimit; // Failure marker checked by analyseII().
+      // Get the stall cycle introduced by the register between two trips.
+      int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle;
+      MaxStallCycle = std::max(MaxStallCycle, StallCycle);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
+  return MaxStallCycle;
+}
+
+unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
+  LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
+  const int Limit = (int)WindowIILimit; // Sentinel meaning "analysis failed".
+  const int LastCycle = calculateMaxCycle(DAG, Offset);
+  if (LastCycle == Limit)
+    return Limit;
+  const int Stall = calculateStallCycle(Offset, LastCycle);
+  // A failed stall analysis propagates the limit. Otherwise, the value of II
+  // is the maximum execution cycle plus the stall cycles plus 1.
+  return Stall == Limit ? Limit : LastCycle + Stall + 1;
+}
+
+void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
+  LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
+  for (auto &Phi : MBB->phis()) {
+    int LateCycle = INT_MAX; // INT_MAX means "no constraint found yet".
+    auto *SU = TripleDAG->getSUnit(&Phi);
+    for (auto &Succ : SU->Succs) {
+      // Phi doesn't have any Anti successors.
+      if (Succ.getKind() != SDep::Data)
+        continue;
+      // Phi is scheduled before the successor of stage 0. The issue cycle of
+      // phi is the latest cycle in this interval.
+      auto *SuccMI = Succ.getSUnit()->getInstr();
+      int Cycle = getOriCycle(SuccMI);
+      if (getOriStage(getOriMI(SuccMI), Offset) == 0)
+        LateCycle = std::min(LateCycle, Cycle);
+    }
+    // The anti-dependency of phi is handled separately, but in the same way.
+    if (Register AntiReg = getAntiRegister(&Phi)) {
+      auto *AntiMI = MRI->getVRegDef(AntiReg);
+      auto AntiCycle = getOriCycle(AntiMI);
+      if (getOriStage(getOriMI(AntiMI), Offset) == 0)
+        LateCycle = std::min(LateCycle, AntiCycle);
+    }
+    // If there is no limit to the late cycle, a default value is given.
+    if (LateCycle == INT_MAX)
+      LateCycle = (int)(II - 1); // Last cycle of the kernel; II is only read.
+    LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
+    // The issue cycle of phi is set to the latest cycle in the interval.
+    auto *OriPhi = getOriMI(&Phi);
+    OriToCycle[OriPhi] = LateCycle;
+  }
+}
+
+DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset,
+                                                             unsigned II) {
+  // At each issue cycle, phi is placed before MIs in stage 0. So the simplest
+  // way is to put phi at the beginning of the current cycle.
+  DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
+  auto Range = getScheduleRange(Offset, SchedInstrNum);
+  for (auto &Phi : MBB->phis()) // Phis go first within their cycle's bucket.
+    CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi));
+  for (auto &MI : Range) // Then the window MIs, in their current order.
+    CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI));
+  // Number the original MIs consecutively, cycle by cycle over [0, II). The Id
+  // is used as a sort marker in the following expand process.
+  DenseMap<MachineInstr *, int> IssueOrder;
+  int Id = 0;
+  for (int Cycle = 0; Cycle < (int)II; ++Cycle) {
+    if (!CycleToMIs.count(Cycle)) // Nothing was issued at this cycle.
+      continue;
+    for (auto *MI : CycleToMIs[Cycle])
+      IssueOrder[MI] = Id++;
+  }
+  return IssueOrder;
+}
+
+void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
+  // At the first update, Offset is equal to SchedPhiNum. At this time, only
+  // BestII, BestOffset, and BaseII need to be updated.
+  if (Offset == SchedPhiNum) {
+    BestII = II;
+    BestOffset = SchedPhiNum;
+    BaseII = II; // Baseline that later candidates are compared against.
+    return;
+  }
+  // Accept the candidate only if its II beats BestII and is at least
+  // WindowDiffLimit smaller than BaseII.
+  if ((II >= BestII) || (II + WindowDiffLimit > BaseII))
+    return;
+  BestII = II;
+  BestOffset = Offset;
+  // Record (MI, cycle, stage, issue order) for every MI in OriToCycle. Note
+  // that the entries are stored unordered in SchedResult.
+  SchedResult.clear();
+  auto IssueOrder = getIssueOrder(Offset, II);
+  for (auto &Pair : OriToCycle) {
+    assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
+    SchedResult.push_back(std::make_tuple(Pair.first, Pair.second,
+                                          getOriStage(Pair.first, Offset),
+                                          IssueOrder[Pair.first]));
+  }
+}
+
+void WindowScheduler::expand() {
+  // Order SchedResult by its issue order ID, which is tuple element 3.
+  llvm::stable_sort(SchedResult,
+                    [](const std::tuple<MachineInstr *, int, int, int> &A,
+                       const std::tuple<MachineInstr *, int, int, int> &B) {
+                      return std::get<3>(A) < std::get<3>(B);
+                    });
+  // Use the scheduling infrastructure for expansion, noting that InstrChanges
+  // is not supported here (an empty InstrChangesTy is passed below).
+  DenseMap<MachineInstr *, int> Cycles, Stages;
+  std::vector<MachineInstr *> OrderedInsts;
+  for (auto &Info : SchedResult) {
+    auto *MI = std::get<0>(Info);
+    OrderedInsts.push_back(MI);
+    Cycles[MI] = std::get<1>(Info); // Tuple element 1: issue cycle.
+    Stages[MI] = std::get<2>(Info); // Tuple element 2: stage.
+    LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
+                      << "]: " << *MI);
+  }
+  ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
+                    std::move(Stages));
+  ModuloScheduleExpander MSE(*MF, MS, *Context->LIS,
+                             ModuloScheduleExpander::InstrChangesTy());
+  MSE.expand();
+  MSE.cleanup();
+}
+
+void WindowScheduler::updateLiveIntervals() {
+  SmallVector<Register, 128> UsedRegs; // Distinct registers referenced in MBB.
+  for (MachineInstr &MI : *MBB)
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg() || MO.getReg() == 0) // Skip non-reg and null operands.
+        continue;
+      Register Reg = MO.getReg();
+      if (!is_contained(UsedRegs, Reg)) // Linear scan keeps UsedRegs unique.
+        UsedRegs.push_back(Reg);
+    }
+  Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs);
+}
+
+iterator_range<MachineBasicBlock::iterator>
+WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
+  // The range covers Num instructions, starting Offset instructions into
+  // MBB.
+  MachineBasicBlock::iterator First = std::next(MBB->begin(), Offset);
+  MachineBasicBlock::iterator Last = std::next(First, Num);
+  return make_range(First, Last);
+}
+
+int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
+  // getOriMI() performs the "Cannot find original MI!" check and the lookup.
+  MachineInstr *Ori = getOriMI(NewMI);
+  assert(OriToCycle.count(Ori) && "Cannot find schedule cycle!");
+  return OriToCycle[Ori];
+}
+
+MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) {
+  assert(TriToOri.count(NewMI) && "Cannot find original MI!");
+  return TriToOri[NewMI]; // The original MI that TriToOri records for NewMI.
+}
+
+unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) {
+  assert(llvm::find(OriMIs, OriMI) != OriMIs.end() &&
+         "Cannot find OriMI in OriMIs!");
+  // If there is no instruction fold, all MI stages are 0.
+  if (Offset == SchedPhiNum)
+    return 0;
+  // Id is the position of OriMI within OriMIs, not counting meta instructions.
+  // MIs whose Id is less than Offset are in stage 0; the rest are in stage 1.
+  unsigned Id = 0;
+  for (auto *MI : OriMIs) {
+    if (MI->isMetaInstruction())
+      continue;
+    if (MI == OriMI)
+      break;
+    ++Id;
+  }
+  return Id >= (size_t)Offset ? 1 : 0;
+}
+
+Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
+  assert(Phi->isPHI() && "Expecting PHI!");
+  Register AntiReg; // Register of the most recently visited register operand.
+  for (const MachineOperand &MO : Phi->uses()) { // Bind by ref: no copies.
+    if (MO.isReg())
+      AntiReg = MO.getReg();
+    else if (MO.isMBB() && MO.getMBB()->getNumber() == MBB->getNumber())
+      return AntiReg; // The register paired with the incoming block MBB.
+  }
+  return Register(); // No operand names MBB: return the null register.
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-dead-def.mir b/llvm/test/CodeGen/Hexagon/swp-ws-dead-def.mir
new file mode 100644
index 0000000000000..b1549e39c910f
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-dead-def.mir
@@ -0,0 +1,131 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s
+
+# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}.
+# CHECK-LABEL: name: exp_approx_top_six
+# CHECK: bb.5.loop_body:
+# CHECK: dead %{{[0-9]*}}:hvxvr = V6_vaddw %{{[0-9]*}}, %{{[0-9]*}}
+# CHECK: ENDLOOP0
+# CHECK: bb.6:
+
+--- |
+  define void @exp_approx_top_six(i32 %N, ptr noalias %x, ptr noalias %y) #0 {
+  entry:
+    %is_zero = icmp eq i32 %N, 0
+    br i1 %is_zero, label %exit, label %loop_header
+
+  loop_header:
+    %vec_one = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216)
+    %vec_half = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608)
+    %vec_sixth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1042983595)
+    %vec_24th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1026206379)
+    %vec_120th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1007192201)
+    %vec_720th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 985008993)
+    br label %loop_body
+
+  exit:
+    ret void
+
+  loop_body:
+    %lsr.iv1 = phi ptr [ %cgep3, %loop_body ], [ %x, %loop_header ]
+    %lsr.iv = phi ptr [ %cgep, %loop_body ], [ %y, %loop_header ]
+    %index = phi i32 [ 0, %loop_header ], [ %index_next, %loop_body ]
+    %vec_input = load <32 x i32>, ptr %lsr.iv1, align 128
+    %vec_input_pow_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input, <32 x i32> %vec_input)
+    %vec_input_pow_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_2, <32 x i32> %vec_input)
+    %vec_input_pow_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_3, <32 x i32> %vec_input)
+    %vec_input_pow_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_4, <32 x i32> %vec_input)
+    %vec_input_pow_6 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_5, <32 x i32> %vec_input)
+    %vec_exp_approx_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_half, <32 x i32> %vec_input_pow_2)
+    %vec_exp_approx_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sixth, <32 x i32> %vec_input_pow_3)
+    %vec_exp_approx_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_24th, <32 x i32> %vec_input_pow_4)
+    %vec_exp_approx_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_120th, <32 x i32> %vec_input_pow_5)
+    %vec_exp_approx_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_720th, <32 x i32> %vec_input_pow_6)
+    %vec_exp_sum_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_one, <32 x i32> %vec_input)
+    %vec_exp_sum_2 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_1, <32 x i32> %vec_exp_approx_1)
+    %vec_exp_sum_3 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_2, <32 x i32> %vec_exp_approx_2)
+    %vec_exp_sum_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_3, <32 x i32> %vec_exp_approx_3)
+    %vec_exp_sum_5 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_4, <32 x i32> %vec_exp_approx_4)
+    %vec_exp_result = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_5, <32 x i32> %vec_exp_approx_5)
+    store <32 x i32> %vec_exp_result, ptr %lsr.iv, align 128
+    %index_next = add nuw i32 %index, 32
+    %loop_cond = icmp ult i32 %index_next, %N
+    %cgep = getelementptr i8, ptr %lsr.iv, i32 128
+    %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128
+    br i1 %loop_cond, label %loop_body, label %exit
+  }
+
+  declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>)
+  declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
+  declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>)
+
+  attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }
+...
+---
+name:            exp_approx_top_six
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.loop_header:
+    successors: %bb.3(0x80000000)
+
+    %4:intregs = A2_tfrsi 1065353216
+    %5:hvxvr = V6_lvsplatw killed %4
+    %6:intregs = A2_tfrsi 1056964608
+    %7:hvxvr = V6_lvsplatw killed %6
+    %8:intregs = A2_tfrsi 1042983595
+    %9:hvxvr = V6_lvsplatw killed %8
+    %10:intregs = A2_tfrsi 1026206379
+    %11:hvxvr = V6_lvsplatw killed %10
+    %12:intregs = A2_tfrsi 1007192201
+    %13:hvxvr = V6_lvsplatw killed %12
+    %14:intregs = A2_tfrsi 985008993
+    %15:hvxvr = V6_lvsplatw killed %14
+    %16:intregs = A2_addi %2, 31
+    %17:intregs = S2_lsr_i_r %16, 5
+    %18:intregs = COPY %17
+    J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.loop_body (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+    %19:intregs = PHI %1, %bb.1, %20, %bb.3
+    %21:intregs = PHI %0, %bb.1, %22, %bb.3
+    %23:hvxvr, %20:intregs = V6_vL32b_pi %19, 128 :: (load (s1024) from %ir.lsr.iv1)
+    %24:hvxvr = V6_vmpyowh_rnd %23, %23
+    %25:hvxvr = V6_vmpyowh_rnd %24, %23
+    %26:hvxvr = V6_vmpyowh_rnd %25, %23
+    %27:hvxvr = V6_vmpyowh_rnd %26, %23
+    %28:hvxvr = V6_vmpyowh_rnd %27, %23
+    %29:hvxvr = V6_vmpyowh_rnd %7, %24
+    %30:hvxvr = V6_vmpyowh_rnd %9, %25
+    %31:hvxvr = V6_vmpyowh_rnd %11, %26
+    %32:hvxvr = V6_vmpyowh_rnd %13, %27
+    %33:hvxvr = V6_vmpyowh_rnd %15, killed %28
+    %34:hvxvr = V6_vaddw %5, %23
+    %35:hvxvr = V6_vaddw killed %34, killed %29
+    %36:hvxvr = V6_vaddw killed %35, killed %30
+    %37:hvxvr = V6_vaddw killed %36, killed %31
+    %38:hvxvr = V6_vaddw killed %37, killed %32
+    %39:hvxvr = V6_vaddw %38, %33
+    ; To check the dead virtual register within loop kernel.
+    dead %40:hvxvr = V6_vaddw killed %38, killed %33
+    %22:intregs = V6_vS32b_pi %21, 128, killed %39 :: (store (s1024) into %ir.lsr.iv)
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-exp-dbg.mir b/llvm/test/CodeGen/Hexagon/swp-ws-exp-dbg.mir
new file mode 100644
index 0000000000000..b62cbbbb8af4e
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-exp-dbg.mir
@@ -0,0 +1,310 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s
+#
+# The Window Scheduling algorithm will discard the debug IR, just like the SMS
+# algorithm does. Additionally, the MMO information in the IR is also preserved
+# to ensure that no barrier dependencies are generated within the loop body.
+#
+# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}.
+# CHECK-LABEL: name: exp_approx
+# CHECK: bb.5.for.body:
+# CHECK-NOT: DBG_VALUE
+# CHECK: ENDLOOP0
+# CHECK: bb.6:
+
+
+--- |
+  define void @exp_approx(i32 %num_elements, ptr noalias %input_ptr, ptr noalias %output_ptr) #0 !dbg !20 {
+  entry:
+    tail call void @llvm.dbg.value(metadata i32 %num_elements, metadata !33, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata ptr %input_ptr, metadata !34, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata ptr %output_ptr, metadata !35, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata float 1.000000e+00, metadata !36, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata float 5.000000e-01, metadata !37, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata float 0x3FC5555560000000, metadata !38, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata float 0x3FA5555560000000, metadata !39, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata float 0x3F81111120000000, metadata !40, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata float 0x3F56C16C20000000, metadata !41, metadata !DIExpression()), !dbg !69
+    tail call void @llvm.dbg.value(metadata i32 0, metadata !42, metadata !DIExpression()), !dbg !70
+    %is_zero_elements = icmp eq i32 %num_elements, 0, !dbg !71
+    br i1 %is_zero_elements, label %for.cond.cleanup, label %for.body.lr.ph, !dbg !72
+  
+  for.body.lr.ph:
+    %const_1 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216)
+    %const_half = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608)
+    %const_third = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1042983595)
+    %const_quarter = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1026206379)
+    %const_fifth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1007192201)
+    %const_sixth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 985008993)
+    br label %for.body, !dbg !72
+  
+  for.cond.cleanup:
+    ret void, !dbg !73
+  
+  for.body:
+    %lsr.iv1 = phi ptr [ %cgep3, %for.body ], [ %input_ptr, %for.body.lr.ph ]
+    %lsr.iv = phi ptr [ %cgep, %for.body ], [ %output_ptr, %for.body.lr.ph ]
+    %index = phi i32 [ 0, %for.body.lr.ph ], [ %next_index, %for.body ]
+    tail call void @llvm.dbg.value(metadata i32 %index, metadata !42, metadata !DIExpression()), !dbg !70
+    %input_values = load <32 x i32>, ptr %lsr.iv1, align 128, !dbg !74
+    tail call void @llvm.dbg.value(metadata <32 x i32> %input_values, metadata !44, metadata !DIExpression()), !dbg !75
+    %squared_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %input_values, <32 x i32> %input_values), !dbg !76
+    tail call void @llvm.dbg.value(metadata <32 x i32> %squared_values, metadata !47, metadata !DIExpression()), !dbg !75
+    %cubed_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %squared_values, <32 x i32> %input_values), !dbg !77
+    tail call void @llvm.dbg.value(metadata <32 x i32> %cubed_values, metadata !48, metadata !DIExpression()), !dbg !75
+    %quartic_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %cubed_values, <32 x i32> %input_values), !dbg !78
+    tail call void @llvm.dbg.value(metadata <32 x i32> %quartic_values, metadata !49, metadata !DIExpression()), !dbg !75
+    %quintic_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %quartic_values, <32 x i32> %input_values), !dbg !79
+    tail call void @llvm.dbg.value(metadata <32 x i32> %quintic_values, metadata !50, metadata !DIExpression()), !dbg !75
+    %sextic_values = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %quintic_values, <32 x i32> %input_values), !dbg !80
+    tail call void @llvm.dbg.value(metadata <32 x i32> %sextic_values, metadata !51, metadata !DIExpression()), !dbg !75
+    tail call void @llvm.dbg.value(metadata <32 x i32> %const_1, metadata !52, metadata !DIExpression()), !dbg !75
+    tail call void @llvm.dbg.value(metadata <32 x i32> %const_half, metadata !53, metadata !DIExpression()), !dbg !75
+    tail call void @llvm.dbg.value(metadata <32 x i32> %const_third, metadata !54, metadata !DIExpression()), !dbg !75
+    tail call void @llvm.dbg.value(metadata <32 x i32> %const_quarter, metadata !55, metadata !DIExpression()), !dbg !75
+    tail call void @llvm.dbg.value(metadata <32 x i32> %const_fifth, metadata !56, metadata !DIExpression()), !dbg !75
+    tail call void @llvm.dbg.value(metadata <32 x i32> %const_sixth, metadata !57, metadata !DIExpression()), !dbg !75
+    %product_half = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_half, <32 x i32> %squared_values), !dbg !81
+    tail call void @llvm.dbg.value(metadata <32 x i32> %product_half, metadata !58, metadata !DIExpression()), !dbg !75
+    %product_third = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_third, <32 x i32> %cubed_values), !dbg !82
+    tail call void @llvm.dbg.value(metadata <32 x i32> %product_third, metadata !59, metadata !DIExpression()), !dbg !75
+    %product_quarter = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_quarter, <32 x i32> %quartic_values), !dbg !83
+    tail call void @llvm.dbg.value(metadata <32 x i32> %product_quarter, metadata !60, metadata !DIExpression()), !dbg !75
+    %product_fifth = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_fifth, <32 x i32> %quintic_values), !dbg !84
+    tail call void @llvm.dbg.value(metadata <32 x i32> %product_fifth, metadata !61, metadata !DIExpression()), !dbg !75
+    %product_sixth = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %const_sixth, <32 x i32> %sextic_values), !dbg !85
+    tail call void @llvm.dbg.value(metadata <32 x i32> %product_sixth, metadata !62, metadata !DIExpression()), !dbg !75
+    %sum_1_input = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %const_1, <32 x i32> %input_values), !dbg !86
+    tail call void @llvm.dbg.value(metadata <32 x i32> %sum_1_input, metadata !63, metadata !DIExpression()), !dbg !75
+    %sum_half = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_1_input, <32 x i32> %product_half), !dbg !87
+    tail call void @llvm.dbg.value(metadata <32 x i32> %sum_half, metadata !64, metadata !DIExpression()), !dbg !75
+    %sum_third = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_half, <32 x i32> %product_third), !dbg !88
+    tail call void @llvm.dbg.value(metadata <32 x i32> %sum_third, metadata !65, metadata !DIExpression()), !dbg !75
+    %sum_quarter = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_third, <32 x i32> %product_quarter), !dbg !89
+    tail call void @llvm.dbg.value(metadata <32 x i32> %sum_quarter, metadata !66, metadata !DIExpression()), !dbg !75
+    %sum_fifth = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_quarter, <32 x i32> %product_fifth), !dbg !90
+    tail call void @llvm.dbg.value(metadata <32 x i32> %sum_fifth, metadata !67, metadata !DIExpression()), !dbg !75
+    %final_result = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %sum_fifth, <32 x i32> %product_sixth), !dbg !91
+    tail call void @llvm.dbg.value(metadata <32 x i32> %final_result, metadata !68, metadata !DIExpression()), !dbg !75
+    store <32 x i32> %final_result, ptr %lsr.iv, align 128, !dbg !92
+    %next_index = add nuw i32 %index, 32, !dbg !93
+    tail call void @llvm.dbg.value(metadata i32 %next_index, metadata !42, metadata !DIExpression()), !dbg !70
+    %continue_loop = icmp ult i32 %next_index, %num_elements, !dbg !71
+    %cgep = getelementptr i8, ptr %lsr.iv, i32 128
+    %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128
+    br i1 %continue_loop, label %for.body, label %for.cond.cleanup, !dbg !72, !llvm.loop !94
+  }
+  
+  declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>)
+  declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
+  declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>)
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+  
+  attributes #0 = { "target-features"="+hvx-length128b,+hvxv68,+v66,-long-calls" }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!13, !14, !15, !16, !17, !18, !19}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "exp_approx.cpp", directory: "/tmp")
+  !2 = !{!3, !10}
+  !3 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32)
+  !4 = !DIDerivedType(tag: DW_TAG_typedef, name: "HVX_Vector", file: !1, line: 11, baseType: !5)
+  !5 = !DIDerivedType(tag: DW_TAG_typedef, name: "Vect1024", file: !1, line: 4, baseType: !6, align: 1024)
+  !6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 1024, flags: DIFlagVector, elements: !8)
+  !7 = !DIBasicType(name: "long", size: 32, encoding: DW_ATE_signed)
+  !8 = !{!9}
+  !9 = !DISubrange(count: 32)
+  !10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32)
+  !11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12)
+  !12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !13 = !{i32 7, !"Dwarf Version", i32 5}
+  !14 = !{i32 2, !"Debug Info Version", i32 3}
+  !15 = !{i32 1, !"wchar_size", i32 4}
+  !16 = !{i32 8, !"PIC Level", i32 2}
+  !17 = !{i32 7, !"PIE Level", i32 2}
+  !18 = !{i32 7, !"frame-pointer", i32 2}
+  !19 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+  !20 = distinct !DISubprogram(name: "exp_approx", linkageName: "exp_approx", scope: !1, file: !1, line: 15, type: !21, scopeLine: 15, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !32)
+  !21 = !DISubroutineType(types: !22)
+  !22 = !{null, !23, !26, !30}
+  !23 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !24, line: 13, baseType: !25)
+  !24 = !DIFile(filename: "__stddef_size_t.h", directory: "/tmp")
+  !25 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+  !26 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !27)
+  !27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 32)
+  !28 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !29)
+  !29 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+  !30 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !31)
+  !31 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !29, size: 32)
+  !32 = !{!33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !44, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68}
+  !33 = !DILocalVariable(name: "N", arg: 1, scope: !20, file: !1, line: 15, type: !23)
+  !34 = !DILocalVariable(name: "x", arg: 2, scope: !20, file: !1, line: 15, type: !26)
+  !35 = !DILocalVariable(name: "y", arg: 3, scope: !20, file: !1, line: 15, type: !30)
+  !36 = !DILocalVariable(name: "a", scope: !20, file: !1, line: 16, type: !28)
+  !37 = !DILocalVariable(name: "b", scope: !20, file: !1, line: 17, type: !28)
+  !38 = !DILocalVariable(name: "c", scope: !20, file: !1, line: 18, type: !28)
+  !39 = !DILocalVariable(name: "d", scope: !20, file: !1, line: 19, type: !28)
+  !40 = !DILocalVariable(name: "e", scope: !20, file: !1, line: 20, type: !28)
+  !41 = !DILocalVariable(name: "f", scope: !20, file: !1, line: 21, type: !28)
+  !42 = !DILocalVariable(name: "i", scope: !43, file: !1, line: 25, type: !23)
+  !43 = distinct !DILexicalBlock(scope: !20, file: !1, line: 25, column: 3)
+  !44 = !DILocalVariable(name: "v_x1", scope: !45, file: !1, line: 26, type: !4)
+  !45 = distinct !DILexicalBlock(scope: !46, file: !1, line: 25, column: 38)
+  !46 = distinct !DILexicalBlock(scope: !43, file: !1, line: 25, column: 3)
+  !47 = !DILocalVariable(name: "v_x2", scope: !45, file: !1, line: 28, type: !4)
+  !48 = !DILocalVariable(name: "v_x3", scope: !45, file: !1, line: 29, type: !4)
+  !49 = !DILocalVariable(name: "v_x4", scope: !45, file: !1, line: 30, type: !4)
+  !50 = !DILocalVariable(name: "v_x5", scope: !45, file: !1, line: 31, type: !4)
+  !51 = !DILocalVariable(name: "v_x6", scope: !45, file: !1, line: 32, type: !4)
+  !52 = !DILocalVariable(name: "v_a", scope: !45, file: !1, line: 33, type: !4)
+  !53 = !DILocalVariable(name: "v_b", scope: !45, file: !1, line: 34, type: !4)
+  !54 = !DILocalVariable(name: "v_c", scope: !45, file: !1, line: 35, type: !4)
+  !55 = !DILocalVariable(name: "v_d", scope: !45, file: !1, line: 36, type: !4)
+  !56 = !DILocalVariable(name: "v_e", scope: !45, file: !1, line: 37, type: !4)
+  !57 = !DILocalVariable(name: "v_f", scope: !45, file: !1, line: 38, type: !4)
+  !58 = !DILocalVariable(name: "v_1", scope: !45, file: !1, line: 39, type: !4)
+  !59 = !DILocalVariable(name: "v_2", scope: !45, file: !1, line: 40, type: !4)
+  !60 = !DILocalVariable(name: "v_3", scope: !45, file: !1, line: 41, type: !4)
+  !61 = !DILocalVariable(name: "v_4", scope: !45, file: !1, line: 42, type: !4)
+  !62 = !DILocalVariable(name: "v_5", scope: !45, file: !1, line: 43, type: !4)
+  !63 = !DILocalVariable(name: "vr_1", scope: !45, file: !1, line: 44, type: !4)
+  !64 = !DILocalVariable(name: "vr_2", scope: !45, file: !1, line: 45, type: !4)
+  !65 = !DILocalVariable(name: "vr_3", scope: !45, file: !1, line: 46, type: !4)
+  !66 = !DILocalVariable(name: "vr_4", scope: !45, file: !1, line: 47, type: !4)
+  !67 = !DILocalVariable(name: "vr_5", scope: !45, file: !1, line: 48, type: !4)
+  !68 = !DILocalVariable(name: "result", scope: !45, file: !1, line: 49, type: !4)
+  !69 = !DILocation(line: 0, scope: !20)
+  !70 = !DILocation(line: 0, scope: !43)
+  !71 = !DILocation(line: 25, column: 24, scope: !46)
+  !72 = !DILocation(line: 25, column: 3, scope: !43)
+  !73 = !DILocation(line: 53, column: 1, scope: !20)
+  !74 = !DILocation(line: 26, column: 22, scope: !45)
+  !75 = !DILocation(line: 0, scope: !45)
+  !76 = !DILocation(line: 28, column: 23, scope: !45)
+  !77 = !DILocation(line: 29, column: 23, scope: !45)
+  !78 = !DILocation(line: 30, column: 23, scope: !45)
+  !79 = !DILocation(line: 31, column: 23, scope: !45)
+  !80 = !DILocation(line: 32, column: 23, scope: !45)
+  !81 = !DILocation(line: 39, column: 22, scope: !45)
+  !82 = !DILocation(line: 40, column: 22, scope: !45)
+  !83 = !DILocation(line: 41, column: 22, scope: !45)
+  !84 = !DILocation(line: 42, column: 22, scope: !45)
+  !85 = !DILocation(line: 43, column: 22, scope: !45)
+  !86 = !DILocation(line: 44, column: 23, scope: !45)
+  !87 = !DILocation(line: 45, column: 23, scope: !45)
+  !88 = !DILocation(line: 46, column: 23, scope: !45)
+  !89 = !DILocation(line: 47, column: 23, scope: !45)
+  !90 = !DILocation(line: 48, column: 23, scope: !45)
+  !91 = !DILocation(line: 49, column: 25, scope: !45)
+  !92 = !DILocation(line: 51, column: 31, scope: !45)
+  !93 = !DILocation(line: 25, column: 31, scope: !46)
+  !94 = distinct !{!94, !72, !95}
+  !95 = !DILocation(line: 52, column: 3, scope: !43)
+
+...
+---
+name:            exp_approx
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+  
+    DBG_VALUE $r0, $noreg, !33, !DIExpression(), debug-location !69
+    DBG_VALUE $r1, $noreg, !34, !DIExpression(), debug-location !69
+    DBG_VALUE $r2, $noreg, !35, !DIExpression(), debug-location !69
+    %0:intregs = COPY $r2
+    DBG_VALUE %0, $noreg, !35, !DIExpression(), debug-location !69
+    %1:intregs = COPY $r1
+    DBG_VALUE %1, $noreg, !34, !DIExpression(), debug-location !69
+    %2:intregs = COPY $r0
+    DBG_VALUE %2, $noreg, !33, !DIExpression(), debug-location !69
+    DBG_VALUE float 1.000000e+00, $noreg, !36, !DIExpression(), debug-location !69
+    DBG_VALUE float 5.000000e-01, $noreg, !37, !DIExpression(), debug-location !69
+    DBG_VALUE float 0x3FC5555560000000, $noreg, !38, !DIExpression(), debug-location !69
+    DBG_VALUE float 0x3FA5555560000000, $noreg, !39, !DIExpression(), debug-location !69
+    DBG_VALUE float 0x3F81111120000000, $noreg, !40, !DIExpression(), debug-location !69
+    DBG_VALUE float 0x3F56C16C20000000, $noreg, !41, !DIExpression(), debug-location !69
+    DBG_VALUE 0, $noreg, !42, !DIExpression(), debug-location !70
+    %3:predregs = C2_cmpeqi %2, 0, debug-location !71
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc, debug-location !72
+    J2_jump %bb.1, implicit-def dead $pc, debug-location !72
+  
+  bb.1.for.body.lr.ph:
+    successors: %bb.3(0x80000000)
+  
+    %4:intregs = A2_tfrsi 1065353216
+    %5:hvxvr = V6_lvsplatw killed %4
+    %6:intregs = A2_tfrsi 1056964608
+    %7:hvxvr = V6_lvsplatw killed %6
+    %8:intregs = A2_tfrsi 1042983595
+    %9:hvxvr = V6_lvsplatw killed %8
+    %10:intregs = A2_tfrsi 1026206379
+    %11:hvxvr = V6_lvsplatw killed %10
+    %12:intregs = A2_tfrsi 1007192201
+    %13:hvxvr = V6_lvsplatw killed %12
+    %14:intregs = A2_tfrsi 985008993
+    %15:hvxvr = V6_lvsplatw killed %14
+    %16:intregs = A2_addi %2, 31, debug-location !72
+    %17:intregs = S2_lsr_i_r %16, 5, debug-location !72
+    %18:intregs = COPY %17, debug-location !72
+    J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr, debug-location !72
+    J2_jump %bb.3, implicit-def dead $pc, debug-location !72
+  
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc, debug-location !73
+  
+  bb.3.for.body (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+  
+    %19:intregs = PHI %1, %bb.1, %20, %bb.3
+    %21:intregs = PHI %0, %bb.1, %22, %bb.3
+    DBG_VALUE %23:intregs, $noreg, !42, !DIExpression(), debug-location !70
+    %24:hvxvr, %20:intregs = V6_vL32b_pi %19, 128, debug-location !74 :: (load (s1024) from %ir.lsr.iv1)
+    DBG_VALUE %24, $noreg, !44, !DIExpression(), debug-location !75
+    %25:hvxvr = V6_vmpyowh_rnd %24, %24, debug-location !76
+    DBG_VALUE %25, $noreg, !47, !DIExpression(), debug-location !75
+    %26:hvxvr = V6_vmpyowh_rnd %25, %24, debug-location !77
+    DBG_VALUE %26, $noreg, !48, !DIExpression(), debug-location !75
+    %27:hvxvr = V6_vmpyowh_rnd %26, %24, debug-location !78
+    DBG_VALUE %27, $noreg, !49, !DIExpression(), debug-location !75
+    %28:hvxvr = V6_vmpyowh_rnd %27, %24, debug-location !79
+    DBG_VALUE %28, $noreg, !50, !DIExpression(), debug-location !75
+    %29:hvxvr = V6_vmpyowh_rnd %28, %24, debug-location !80
+    DBG_VALUE %29, $noreg, !51, !DIExpression(), debug-location !75
+    DBG_VALUE %5, $noreg, !52, !DIExpression(), debug-location !75
+    DBG_VALUE %7, $noreg, !53, !DIExpression(), debug-location !75
+    DBG_VALUE %9, $noreg, !54, !DIExpression(), debug-location !75
+    DBG_VALUE %11, $noreg, !55, !DIExpression(), debug-location !75
+    DBG_VALUE %13, $noreg, !56, !DIExpression(), debug-location !75
+    DBG_VALUE %15, $noreg, !57, !DIExpression(), debug-location !75
+    %30:hvxvr = V6_vmpyowh_rnd %7, %25, debug-location !81
+    DBG_VALUE %30, $noreg, !58, !DIExpression(), debug-location !75
+    %31:hvxvr = V6_vmpyowh_rnd %9, %26, debug-location !82
+    DBG_VALUE %31, $noreg, !59, !DIExpression(), debug-location !75
+    %32:hvxvr = V6_vmpyowh_rnd %11, %27, debug-location !83
+    DBG_VALUE %32, $noreg, !60, !DIExpression(), debug-location !75
+    %33:hvxvr = V6_vmpyowh_rnd %13, %28, debug-location !84
+    DBG_VALUE %33, $noreg, !61, !DIExpression(), debug-location !75
+    %34:hvxvr = V6_vmpyowh_rnd %15, killed %29, debug-location !85
+    DBG_VALUE %34, $noreg, !62, !DIExpression(), debug-location !75
+    %35:hvxvr = V6_vaddw %5, %24, debug-location !86
+    DBG_VALUE %35, $noreg, !63, !DIExpression(), debug-location !75
+    %36:hvxvr = V6_vaddw killed %35, killed %30, debug-location !87
+    DBG_VALUE %36, $noreg, !64, !DIExpression(), debug-location !75
+    %37:hvxvr = V6_vaddw killed %36, killed %31, debug-location !88
+    DBG_VALUE %37, $noreg, !65, !DIExpression(), debug-location !75
+    %38:hvxvr = V6_vaddw killed %37, killed %32, debug-location !89
+    DBG_VALUE %38, $noreg, !66, !DIExpression(), debug-location !75
+    %39:hvxvr = V6_vaddw killed %38, killed %33, debug-location !90
+    DBG_VALUE %39, $noreg, !67, !DIExpression(), debug-location !75
+    %40:hvxvr = V6_vaddw killed %39, killed %34, debug-location !91
+    DBG_VALUE %40, $noreg, !68, !DIExpression(), debug-location !75
+    %22:intregs = V6_vS32b_pi %21, 128, killed %40, debug-location !92 :: (store (s1024) into %ir.lsr.iv)
+    DBG_VALUE $noreg, $noreg, !42, !DIExpression(), debug-location !70
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0, debug-location !72
+    J2_jump %bb.2, implicit-def dead $pc, debug-location !72
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-exp.mir b/llvm/test/CodeGen/Hexagon/swp-ws-exp.mir
new file mode 100644
index 0000000000000..534c25591f5bf
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-exp.mir
@@ -0,0 +1,124 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s
+
+# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}.
+
+--- |
+  define void @exp_approx_top_six(i32 %N, ptr noalias %x, ptr noalias %y) #0 {
+  entry:
+    %is_zero = icmp eq i32 %N, 0
+    br i1 %is_zero, label %exit, label %loop_header
+
+  loop_header:
+    %vec_one = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216)
+    %vec_half = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608)
+    %vec_sixth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1042983595)
+    %vec_24th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1026206379)
+    %vec_120th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1007192201)
+    %vec_720th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 985008993)
+    br label %loop_body
+
+  exit:
+    ret void
+
+  loop_body:
+    %lsr.iv1 = phi ptr [ %cgep3, %loop_body ], [ %x, %loop_header ]
+    %lsr.iv = phi ptr [ %cgep, %loop_body ], [ %y, %loop_header ]
+    %index = phi i32 [ 0, %loop_header ], [ %index_next, %loop_body ]
+    %vec_input = load <32 x i32>, ptr %lsr.iv1, align 128
+    %vec_input_pow_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input, <32 x i32> %vec_input)
+    %vec_input_pow_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_2, <32 x i32> %vec_input)
+    %vec_input_pow_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_3, <32 x i32> %vec_input)
+    %vec_input_pow_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_4, <32 x i32> %vec_input)
+    %vec_input_pow_6 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_5, <32 x i32> %vec_input)
+    %vec_exp_approx_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_half, <32 x i32> %vec_input_pow_2)
+    %vec_exp_approx_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sixth, <32 x i32> %vec_input_pow_3)
+    %vec_exp_approx_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_24th, <32 x i32> %vec_input_pow_4)
+    %vec_exp_approx_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_120th, <32 x i32> %vec_input_pow_5)
+    %vec_exp_approx_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_720th, <32 x i32> %vec_input_pow_6)
+    %vec_exp_sum_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_one, <32 x i32> %vec_input)
+    %vec_exp_sum_2 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_1, <32 x i32> %vec_exp_approx_1)
+    %vec_exp_sum_3 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_2, <32 x i32> %vec_exp_approx_2)
+    %vec_exp_sum_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_3, <32 x i32> %vec_exp_approx_3)
+    %vec_exp_sum_5 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_4, <32 x i32> %vec_exp_approx_4)
+    %vec_exp_result = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_5, <32 x i32> %vec_exp_approx_5)
+    store <32 x i32> %vec_exp_result, ptr %lsr.iv, align 128
+    %index_next = add nuw i32 %index, 32
+    %loop_cond = icmp ult i32 %index_next, %N
+    %cgep = getelementptr i8, ptr %lsr.iv, i32 128
+    %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128
+    br i1 %loop_cond, label %loop_body, label %exit
+  }
+
+  declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>)
+  declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
+  declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>)
+
+  attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }
+...
+---
+name:            exp_approx_top_six
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.loop_header:
+    successors: %bb.3(0x80000000)
+
+    %4:intregs = A2_tfrsi 1065353216
+    %5:hvxvr = V6_lvsplatw killed %4
+    %6:intregs = A2_tfrsi 1056964608
+    %7:hvxvr = V6_lvsplatw killed %6
+    %8:intregs = A2_tfrsi 1042983595
+    %9:hvxvr = V6_lvsplatw killed %8
+    %10:intregs = A2_tfrsi 1026206379
+    %11:hvxvr = V6_lvsplatw killed %10
+    %12:intregs = A2_tfrsi 1007192201
+    %13:hvxvr = V6_lvsplatw killed %12
+    %14:intregs = A2_tfrsi 985008993
+    %15:hvxvr = V6_lvsplatw killed %14
+    %16:intregs = A2_addi %2, 31
+    %17:intregs = S2_lsr_i_r %16, 5
+    %18:intregs = COPY %17
+    J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.loop_body (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+    %19:intregs = PHI %1, %bb.1, %20, %bb.3
+    %21:intregs = PHI %0, %bb.1, %22, %bb.3
+    %23:hvxvr, %20:intregs = V6_vL32b_pi %19, 128 :: (load (s1024) from %ir.lsr.iv1)
+    %24:hvxvr = V6_vmpyowh_rnd %23, %23
+    %25:hvxvr = V6_vmpyowh_rnd %24, %23
+    %26:hvxvr = V6_vmpyowh_rnd %25, %23
+    %27:hvxvr = V6_vmpyowh_rnd %26, %23
+    %28:hvxvr = V6_vmpyowh_rnd %27, %23
+    %29:hvxvr = V6_vmpyowh_rnd %7, %24
+    %30:hvxvr = V6_vmpyowh_rnd %9, %25
+    %31:hvxvr = V6_vmpyowh_rnd %11, %26
+    %32:hvxvr = V6_vmpyowh_rnd %13, %27
+    %33:hvxvr = V6_vmpyowh_rnd %15, killed %28
+    %34:hvxvr = V6_vaddw %5, %23
+    %35:hvxvr = V6_vaddw killed %34, killed %29
+    %36:hvxvr = V6_vaddw killed %35, killed %30
+    %37:hvxvr = V6_vaddw killed %36, killed %31
+    %38:hvxvr = V6_vaddw killed %37, killed %32
+    %39:hvxvr = V6_vaddw killed %38, killed %33
+    %22:intregs = V6_vS32b_pi %21, 128, killed %39 :: (store (s1024) into %ir.lsr.iv)
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-0.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-0.mir
new file mode 100644
index 0000000000000..1721419cea9db
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-0.mir
@@ -0,0 +1,56 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s \
+# RUN: --check-prefix=CHECK-INITIALIZE
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -window-region-limit=1 -window-ii-limit=1 -o - \
+# RUN: 2>&1 | FileCheck %s --check-prefix=CHECK-ANALYSE-II
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -window-region-limit=1 -window-search-ratio=80 \
+# RUN: -o - 2>&1 | FileCheck %s --check-prefix=CHECK-SCHED-NOT-NEEDED
+
+# CHECK-INITIALIZE: There are too few MIs in the window region!
+# CHECK-INITIALIZE: The WindowScheduler failed to initialize!
+# CHECK-ANALYSE-II: Can't find a valid II. Keep searching...
+# CHECK-ANALYSE-II: Window scheduling is not needed!
+# CHECK-SCHED-NOT-NEEDED: Window scheduling is not needed!
+
+---
+name:            relu
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %4:hvxvr = V6_vd0
+    %5:intregs = A2_addi %2, 31
+    %6:intregs = S2_lsr_i_r %5, 5
+    %7:intregs = COPY %6
+    J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3 (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+    %8:intregs = PHI %1, %bb.1, %9, %bb.3
+    %10:intregs = PHI %0, %bb.1, %11, %bb.3
+    %12:hvxvr, %9:intregs = V6_vL32b_pi %8, 128
+    %13:hvxvr = V6_vmaxw killed %12, %4
+    %11:intregs = V6_vS32b_pi %10, 128, killed %13
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-1.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-1.mir
new file mode 100644
index 0000000000000..d0521a92585b0
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-1.mir
@@ -0,0 +1,47 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s \
+# RUN: --check-prefix=CHECK-SUCCESSIVE-PHI
+
+# CHECK-SUCCESSIVE-PHI: Consecutive phis are not allowed in window scheduling!
+# CHECK-SUCCESSIVE-PHI: The WindowScheduler failed to initialize!
+
+---
+name:            relu
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %4:hvxvr = V6_vd0
+    %5:intregs = A2_addi %2, 31
+    %6:intregs = S2_lsr_i_r %5, 5
+    %7:intregs = COPY %6
+    J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3 (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+    %8:intregs = PHI %1, %bb.1, %9, %bb.3
+    %10:intregs = PHI %0, %bb.1, %8, %bb.3
+    %11:hvxvr, %9:intregs = V6_vL32b_pi %8, 128
+    %12:hvxvr = V6_vmaxw killed %11, %4
+    %13:intregs = V6_vS32b_pi %10, 128, killed %12
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
new file mode 100644
index 0000000000000..64229fd8d75cf
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
@@ -0,0 +1,73 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s
+
+# CHECK: The WindowScheduler failed to initialize!
+
+---
+name:            exp_approx_top_six
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %4:intregs = A2_tfrsi 1065353216
+    %5:hvxvr = V6_lvsplatw killed %4
+    %6:intregs = A2_tfrsi 1056964608
+    %7:hvxvr = V6_lvsplatw killed %6
+    %8:intregs = A2_tfrsi 1042983595
+    %9:hvxvr = V6_lvsplatw killed %8
+    %10:intregs = A2_tfrsi 1026206379
+    %11:hvxvr = V6_lvsplatw killed %10
+    %12:intregs = A2_tfrsi 1007192201
+    %13:hvxvr = V6_lvsplatw killed %12
+    %14:intregs = A2_tfrsi 985008993
+    %15:hvxvr = V6_lvsplatw killed %14
+    %16:intregs = A2_addi %2, 31
+    %17:intregs = S2_lsr_i_r %16, 5
+    %18:intregs = COPY %17
+    J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3 (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+    %19:intregs = PHI %1, %bb.1, %20, %bb.3
+    %21:intregs = PHI %0, %bb.1, %22, %bb.3
+    %23:hvxvr, %20:intregs = V6_vL32b_pi %19, 128
+    %24:hvxvr = V6_vmpyowh_rnd %23, %23
+    %25:hvxvr = V6_vmpyowh_rnd %24, %23
+    %26:hvxvr = V6_vmpyowh_rnd %25, %23
+    %27:hvxvr = V6_vmpyowh_rnd %26, %23
+    %28:hvxvr = V6_vmpyowh_rnd %27, %23
+    %29:hvxvr = V6_vmpyowh_rnd %7, %24
+    %30:hvxvr = V6_vmpyowh_rnd %9, %25
+    %31:hvxvr = V6_vmpyowh_rnd %11, %26
+    %32:hvxvr = V6_vmpyowh_rnd %13, %27
+    %33:hvxvr = V6_vmpyowh_rnd %15, killed %28
+    %34:hvxvr = V6_vaddw %5, %23
+    %35:hvxvr = V6_vaddw killed %34, killed %29
+    %36:hvxvr = V6_vaddw killed %35, killed %30
+    %37:hvxvr = V6_vaddw killed %36, killed %31
+    %38:hvxvr = V6_vaddw killed %37, killed %32
+    %39:hvxvr = V6_vaddw killed %38, killed %33
+    ; To check the implicitly defined physical register within the loop kernel.
+    dead %40:intregs = A2_abssat %18, implicit-def dead $usr_ovf
+    %22:intregs = V6_vS32b_pi %21, 128, killed %39
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-meta-instr.mir b/llvm/test/CodeGen/Hexagon/swp-ws-meta-instr.mir
new file mode 100644
index 0000000000000..f12a72790741c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-meta-instr.mir
@@ -0,0 +1,127 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s
+
+# CHECK-NOT: PSEUDO_PROBE
+# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}.
+
+--- |
+  define void @exp_approx_top_six(i32 %N, ptr noalias %x, ptr noalias %y) #0 {
+  entry:
+    %is_zero = icmp eq i32 %N, 0
+    br i1 %is_zero, label %exit, label %loop_header
+
+  loop_header:
+    %vec_one = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216)
+    %vec_half = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608)
+    %vec_sixth = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1042983595)
+    %vec_24th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1026206379)
+    %vec_120th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1007192201)
+    %vec_720th = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 985008993)
+    br label %loop_body
+
+  exit:
+    ret void
+
+  loop_body:
+    %lsr.iv1 = phi ptr [ %cgep3, %loop_body ], [ %x, %loop_header ]
+    %lsr.iv = phi ptr [ %cgep, %loop_body ], [ %y, %loop_header ]
+    %index = phi i32 [ 0, %loop_header ], [ %index_next, %loop_body ]
+    %vec_input = load <32 x i32>, ptr %lsr.iv1, align 128
+    %vec_input_pow_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input, <32 x i32> %vec_input)
+    %vec_input_pow_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_2, <32 x i32> %vec_input)
+    %vec_input_pow_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_3, <32 x i32> %vec_input)
+    %vec_input_pow_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_4, <32 x i32> %vec_input)
+    %vec_input_pow_6 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_input_pow_5, <32 x i32> %vec_input)
+    %vec_exp_approx_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_half, <32 x i32> %vec_input_pow_2)
+    %vec_exp_approx_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sixth, <32 x i32> %vec_input_pow_3)
+    %vec_exp_approx_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_24th, <32 x i32> %vec_input_pow_4)
+    %vec_exp_approx_4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_120th, <32 x i32> %vec_input_pow_5)
+    %vec_exp_approx_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_720th, <32 x i32> %vec_input_pow_6)
+    %vec_exp_sum_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_one, <32 x i32> %vec_input)
+    %vec_exp_sum_2 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_1, <32 x i32> %vec_exp_approx_1)
+    %vec_exp_sum_3 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_2, <32 x i32> %vec_exp_approx_2)
+    %vec_exp_sum_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_3, <32 x i32> %vec_exp_approx_3)
+    %vec_exp_sum_5 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_4, <32 x i32> %vec_exp_approx_4)
+    %vec_exp_result = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_exp_sum_5, <32 x i32> %vec_exp_approx_5)
+    store <32 x i32> %vec_exp_result, ptr %lsr.iv, align 128
+    %index_next = add nuw i32 %index, 32
+    %loop_cond = icmp ult i32 %index_next, %N
+    %cgep = getelementptr i8, ptr %lsr.iv, i32 128
+    %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128
+    br i1 %loop_cond, label %loop_body, label %exit
+  }
+
+  declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>)
+  declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
+  declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>)
+
+  attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }
+...
+---
+name:            exp_approx_top_six
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %0:intregs = COPY $r2
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r0
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.loop_header:
+    successors: %bb.3(0x80000000)
+
+    %4:intregs = A2_tfrsi 1065353216
+    %5:hvxvr = V6_lvsplatw killed %4
+    %6:intregs = A2_tfrsi 1056964608
+    %7:hvxvr = V6_lvsplatw killed %6
+    %8:intregs = A2_tfrsi 1042983595
+    %9:hvxvr = V6_lvsplatw killed %8
+    %10:intregs = A2_tfrsi 1026206379
+    %11:hvxvr = V6_lvsplatw killed %10
+    %12:intregs = A2_tfrsi 1007192201
+    %13:hvxvr = V6_lvsplatw killed %12
+    %14:intregs = A2_tfrsi 985008993
+    %15:hvxvr = V6_lvsplatw killed %14
+    %16:intregs = A2_addi %2, 31
+    %17:intregs = S2_lsr_i_r %16, 5
+    %18:intregs = COPY %17
+    J2_loop0r %bb.3, %18, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.loop_body (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+    %19:intregs = PHI %1, %bb.1, %20, %bb.3
+    %21:intregs = PHI %0, %bb.1, %22, %bb.3
+    %23:hvxvr, %20:intregs = V6_vL32b_pi %19, 128 :: (load (s1024) from %ir.lsr.iv1)
+    %24:hvxvr = V6_vmpyowh_rnd %23, %23
+    %25:hvxvr = V6_vmpyowh_rnd %24, %23
+    %26:hvxvr = V6_vmpyowh_rnd %25, %23
+    %27:hvxvr = V6_vmpyowh_rnd %26, %23
+    %28:hvxvr = V6_vmpyowh_rnd %27, %23
+    %29:hvxvr = V6_vmpyowh_rnd %7, %24
+    %30:hvxvr = V6_vmpyowh_rnd %9, %25
+    %31:hvxvr = V6_vmpyowh_rnd %11, %26
+    %32:hvxvr = V6_vmpyowh_rnd %13, %27
+    ; To check the meta MI within the loop kernel.
+    PSEUDO_PROBE 128, 1, 0, 0
+    %33:hvxvr = V6_vmpyowh_rnd %15, killed %28
+    %34:hvxvr = V6_vaddw %5, %23
+    %35:hvxvr = V6_vaddw killed %34, killed %29
+    %36:hvxvr = V6_vaddw killed %35, killed %30
+    %37:hvxvr = V6_vaddw killed %36, killed %31
+    %38:hvxvr = V6_vaddw killed %37, killed %32
+    %39:hvxvr = V6_vaddw killed %38, killed %33
+    %22:intregs = V6_vS32b_pi %21, 128, killed %39 :: (store (s1024) into %ir.lsr.iv)
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-sqrt.mir b/llvm/test/CodeGen/Hexagon/swp-ws-sqrt.mir
new file mode 100644
index 0000000000000..1e764d5fa48b4
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-sqrt.mir
@@ -0,0 +1,124 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -o - 2>&1 | FileCheck %s
+
+# CHECK: Best window offset is {{[0-9]+}} and Best II is {{[0-9]+}}.
+
+--- |
+  define void @sqrt_approx(i32 noundef %N, ptr noalias %x, ptr noalias %y) #0 { ; reads vectors through %x, writes through %y; %N is the loop bound
+  entry:
+    %isZeroLength = icmp eq i32 %N, 0
+    br i1 %isZeroLength, label %loop.exit, label %loop.preheader ; bypass the loop when %N == 0
+
+  loop.preheader:                                   ; preds = %entry
+    %half_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) ; 0x3F000000, the IEEE-754 single bit pattern of 0.5
+    %one_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216) ; 0x3F800000, the bit pattern of 1.0
+    %two_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1073741824) ; 0x40000000, the bit pattern of 2.0
+    br label %loop.body
+
+  loop.exit:                                        ; preds = %loop.body, %entry
+    ret void
+
+  loop.body:                                        ; preds = %loop.body, %loop.preheader
+    %lsr.iv1 = phi ptr [ %cgep3, %loop.body ], [ %x, %loop.preheader ] ; load cursor, starts at %x
+    %lsr.iv = phi ptr [ %cgep, %loop.body ], [ %y, %loop.preheader ] ; store cursor, starts at %y
+    %index = phi i32 [ 0, %loop.preheader ], [ %index.next, %loop.body ] ; counter compared against %N
+    %vec_x = load <32 x i32>, ptr %lsr.iv1, align 128 ; one <32 x i32> vector per iteration
+    %vec_sqrt_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %one_splat, <32 x i32> %vec_x)
+    %vec_sqrt_2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_1, <32 x i32> %half_splat)
+    %vec_recip_1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %half_splat)
+    %vec_recip_2 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_1)
+    %vec_y1 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %vec_recip_2)
+    %vec_recip_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_2, <32 x i32> %vec_y1)
+    %vec_recop_4 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_3)
+    %vec_y2 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_y1, <32 x i32> %vec_recop_4) ; y1 (*) (two - sqrt_2 (*) y1), with (*) = vmpyowh.rnd
+    %vec_sqrt_3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_x, <32 x i32> %vec_y2)
+    %vec_sqrt_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_y2, <32 x i32> %vec_sqrt_3)
+    %vec_sqrt_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_4, <32 x i32> %half_splat) ; from here the %vec_sqrt_2..%vec_sqrt_4 op sequence repeats, seeded by %vec_sqrt_4
+    %vec_recip_5 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %half_splat)
+    %vec_recip_6 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_5)
+    %vec_y3 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %vec_recip_6)
+    %vec_recip_7 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_5, <32 x i32> %vec_y3)
+    %vec_recop_8 = tail call <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32> %two_splat, <32 x i32> %vec_recip_7)
+    %vec_y4 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_y3, <32 x i32> %vec_recop_8)
+    %vec_sqrt_7 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_x, <32 x i32> %vec_y4)
+    %vec_sqrt_8 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_y4, <32 x i32> %vec_sqrt_7)
+    %vec_sqrt_9 = tail call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32> %vec_sqrt_8, <32 x i32> %half_splat)
+    store <32 x i32> %vec_sqrt_9, ptr %lsr.iv, align 128
+    %index.next = add nuw i32 %index, 32 ; 32 lanes handled per trip
+    %continue = icmp ult i32 %index.next, %N
+    %cgep = getelementptr i8, ptr %lsr.iv, i32 128 ; advance the store cursor by 128 bytes (one <32 x i32> vector)
+    %cgep3 = getelementptr i8, ptr %lsr.iv1, i32 128 ; advance the load cursor by 128 bytes
+    br i1 %continue, label %loop.body, label %loop.exit
+  }
+
+  declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
+  declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.128B(<32 x i32>, <32 x i32>)
+  declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>)
+  declare <32 x i32> @llvm.hexagon.V6.vsubw.128B(<32 x i32>, <32 x i32>)
+
+  attributes #0 = { "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }
+...
+---
+name:            sqrt_approx
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    %0:intregs = COPY $r2 ; initial store address (feeds PHI %15)
+    %1:intregs = COPY $r1 ; initial load address (feeds PHI %13)
+    %2:intregs = COPY $r0 ; value tested against 0 and used to derive the loop count
+    %3:predregs = C2_cmpeqi %2, 0
+    J2_jumpt killed %3, %bb.2, implicit-def dead $pc ; go straight to the exit block when %2 == 0
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.loop.preheader:
+    successors: %bb.3(0x80000000)
+
+    %4:intregs = A2_tfrsi 1056964608 ; 0x3F000000, splatted into %5 (%half_splat in the IR)
+    %5:hvxvr = V6_lvsplatw killed %4
+    %6:intregs = A2_tfrsi 1065353216 ; 0x3F800000, splatted into %7 (%one_splat in the IR)
+    %7:hvxvr = V6_lvsplatw killed %6
+    %8:intregs = A2_tfrsi 1073741824 ; 0x40000000, splatted into %9 (%two_splat in the IR)
+    %9:hvxvr = V6_lvsplatw killed %8
+    %10:intregs = A2_addi %2, 31
+    %11:intregs = S2_lsr_i_r %10, 5 ; going by the opcode names, (%2 + 31) >> 5; this becomes the J2_loop0r count operand
+    %12:intregs = COPY %11
+    J2_loop0r %bb.3, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr ; defines $lc0/$sa0 for the loop at %bb.3
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.2.loop.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+
+  bb.3.loop.body (machine-block-address-taken):
+    successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+    %13:intregs = PHI %1, %bb.1, %14, %bb.3 ; load address; %14 is produced by the post-increment load below
+    %15:intregs = PHI %0, %bb.1, %16, %bb.3 ; store address; %16 is produced by the post-increment store below
+    %17:hvxvr, %14:intregs = V6_vL32b_pi %13, 128 :: (load (s1024) from %ir.lsr.iv1)
+    %18:hvxvr = V6_vaddw %7, %17
+    %19:hvxvr = V6_vmpyowh_rnd killed %18, %5 ; first round of the multiply/subtract chain starts here
+    %20:hvxvr = V6_vmpyowh_rnd %19, %5
+    %21:hvxvr = V6_vsubw %9, killed %20
+    %22:hvxvr = V6_vmpyowh_rnd %19, killed %21
+    %23:hvxvr = V6_vmpyowh_rnd %19, %22
+    %24:hvxvr = V6_vsubw %9, killed %23
+    %25:hvxvr = V6_vmpyowh_rnd %22, killed %24
+    %26:hvxvr = V6_vmpyowh_rnd %17, %25
+    %27:hvxvr = V6_vaddw %25, killed %26
+    %28:hvxvr = V6_vmpyowh_rnd killed %27, %5 ; second round: same opcode sequence as %19..%27, seeded by %27
+    %29:hvxvr = V6_vmpyowh_rnd %28, %5
+    %30:hvxvr = V6_vsubw %9, killed %29
+    %31:hvxvr = V6_vmpyowh_rnd %28, killed %30
+    %32:hvxvr = V6_vmpyowh_rnd %28, %31
+    %33:hvxvr = V6_vsubw %9, killed %32
+    %34:hvxvr = V6_vmpyowh_rnd %31, killed %33
+    %35:hvxvr = V6_vmpyowh_rnd %17, %34
+    %36:hvxvr = V6_vaddw %34, killed %35
+    %37:hvxvr = V6_vmpyowh_rnd killed %36, %5 ; final scaling by %5 before the store
+    %16:intregs = V6_vS32b_pi %15, 128, killed %37 :: (store (s1024) into %ir.lsr.iv)
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 ; loop back-edge to %bb.3
+    J2_jump %bb.2, implicit-def dead $pc
+...