From a57ea71327c2cb4baaab9fbdb67b17922e837794 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Mon, 23 Sep 2024 13:02:43 -0700 Subject: [PATCH 1/3] [CodeLayout] Size-aware machine block placement --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 114 ++++++++++----- .../X86/code_placement_ext_tsp_size.ll | 131 ++++++++++++++++++ 2 files changed, 212 insertions(+), 33 deletions(-) create mode 100644 llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 7807875c06584..c468a662c007f 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -218,6 +218,11 @@ static cl::opt ExtTspBlockPlacementMaxBlocks( "block placement."), cl::init(UINT_MAX), cl::Hidden); +// Apply the ext-tsp algorithm minimizing the size of a binary. +static cl::opt + ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, + cl::desc("Use ext-tsp for size-aware block placement.")); + namespace llvm { extern cl::opt EnableExtTspBlockPlacement; extern cl::opt ApplyExtTspWithoutProfile; @@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass { void precomputeTriangleChains(); /// Apply a post-processing step optimizing block placement. - void applyExtTsp(); + void applyExtTsp(bool OptForSize); /// Modify the existing block placement in the function and adjust all jumps. void assignBlockOrder(const std::vector &NewOrder); @@ -3505,20 +3510,29 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { // Initialize tail duplication thresholds. initTailDupThreshold(); + const bool OptForSize = + MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); + // Use ext-tsp for size optimization is possible only when the function + // contains more than two basic blocks. 
+ const bool UseExtTspForSize = + OptForSize && ApplyExtTspForSize && MF.size() >= 3; + // Apply tail duplication. if (allowTailDupPlacement()) { MPDT = &getAnalysis().getPostDomTree(); - bool OptForSize = MF.getFunction().hasOptSize() || - llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); if (OptForSize) TailDupSize = 1; const bool PreRegAlloc = false; TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI, /* LayoutMode */ true, TailDupSize); - precomputeTriangleChains(); + if (!UseExtTspForSize) + precomputeTriangleChains(); } - buildCFGChains(); + // Run the main block placement. + if (!UseExtTspForSize) + buildCFGChains(); // Changing the layout can create new tail merging opportunities. // TailMerge can create jump into if branches that make CFG irreducible for @@ -3545,15 +3559,19 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { } } - // Apply a post-processing optimizing block placement. - if (MF.size() >= 3 && EnableExtTspBlockPlacement && - (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) && - MF.size() <= ExtTspBlockPlacementMaxBlocks) { - // Find a new placement and modify the layout of the blocks in the function. - applyExtTsp(); - - // Re-create CFG chain so that we can optimizeBranches and alignBlocks. - createCFGChainExtTsp(); + // Apply a post-processing optimizing block placement: + // - find a new placement and modify the layout of the blocks in the function; + // - re-create CFG chains so that we can optimizeBranches and alignBlocks. 
+ if (MF.size() >= 3) { + if (EnableExtTspBlockPlacement && + (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) && + MF.size() <= ExtTspBlockPlacementMaxBlocks) { + applyExtTsp(false); + createCFGChainExtTsp(); + } else if (UseExtTspForSize) { + applyExtTsp(true); + createCFGChainExtTsp(); + } } optimizeBranches(); @@ -3577,7 +3595,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { return true; } -void MachineBlockPlacement::applyExtTsp() { +void MachineBlockPlacement::applyExtTsp(bool OptForSize) { // Prepare data; blocks are indexed by their index in the current ordering. DenseMap BlockIndex; BlockIndex.reserve(F->size()); @@ -3589,13 +3607,15 @@ void MachineBlockPlacement::applyExtTsp() { CurrentBlockOrder.push_back(&MBB); } - auto BlockSizes = std::vector(F->size()); - auto BlockCounts = std::vector(F->size()); + std::vector BlockCounts(F->size()); + std::vector BlockSizes(F->size()); std::vector JumpCounts; + SmallVector Cond; // For analyzeBranch. + SmallVector Succs; for (MachineBasicBlock &MBB : *F) { // Getting the block frequency. BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); - BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency(); + BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency(); // Getting the block size: // - approximate the size of an instruction by 4 bytes, and // - ignore debug instructions. @@ -3604,23 +3624,48 @@ void MachineBlockPlacement::applyExtTsp() { // not see a perf improvement with the exact block sizes. auto NonDbgInsts = instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end()); - int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end()); + size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end()); BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts; // Getting jump frequencies. 
- for (MachineBasicBlock *Succ : MBB.successors()) { - auto EP = MBPI->getEdgeProbability(&MBB, Succ); - BlockFrequency JumpFreq = BlockFreq * EP; - JumpCounts.push_back( - {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()}); + + if (!OptForSize) { + for (MachineBasicBlock *Succ : MBB.successors()) { + auto EP = MBPI->getEdgeProbability(&MBB, Succ); + BlockFrequency JumpFreq = BlockFreq * EP; + JumpCounts.push_back( + {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()}); + } + } else { + Cond.clear(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. + if (TII->analyzeBranch(MBB, TBB, FBB, Cond)) + continue; + + const MachineBasicBlock *FTB = MBB.getFallThrough(); + + Succs.clear(); + if (TBB && TBB != FTB) + Succs.push_back(TBB); + if (FBB && FBB != FTB) + Succs.push_back(FBB); + if (FTB) + Succs.push_back(FTB); + // Absolute magnitude of non-zero counts does not matter for the + // optimization; prioritize slightly jumps with a single successor, since + // the corresponding jump instruction will be removed from the binary. + const uint64_t Freq = Succs.size() == 1 ? 110 : 100; + for (const MachineBasicBlock *Succ : Succs) { + JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq}); + } } } LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size() << " with profile = " << F->getFunction().hasProfileData() - << " (" << F->getName().str() << ")" - << "\n"); - LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", - calcExtTspScore(BlockSizes, JumpCounts))); + << " (" << F->getName().str() << ")" << "\n"); + + const double OrgScore = calcExtTspScore(BlockSizes, BlockCounts, JumpCounts); + LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore)); // Run the layout algorithm. 
auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); @@ -3629,12 +3674,15 @@ void MachineBlockPlacement::applyExtTsp() { for (uint64_t Node : NewOrder) { NewBlockOrder.push_back(CurrentBlockOrder[Node]); } - LLVM_DEBUG( - dbgs() << format(" optimized layout score: %0.2f\n", - calcExtTspScore(NewOrder, BlockSizes, JumpCounts))); + const double OptScore = + calcExtTspScore(NewOrder, BlockSizes, BlockCounts, JumpCounts); + LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore)); - // Assign new block order. - assignBlockOrder(NewBlockOrder); + // If the optimization is unsuccessful, fall back to the original block order. + if (OptForSize && OrgScore > OptScore) + assignBlockOrder(CurrentBlockOrder); + else + assignBlockOrder(NewBlockOrder); } void MachineBlockPlacement::assignBlockOrder( diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll new file mode 100644 index 0000000000000..a0f9a8af3cc6c --- /dev/null +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll @@ -0,0 +1,131 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true < %s | FileCheck %s +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK2 + +define void @func1() minsize { +; +; +-----+ +; | b0 | -+ +; +-----+ | +; | | +; | 10 | +; v | +; +-----+ | +; | b1 | | 10000 +; +-----+ | +; | | +; | 10 | +; v | +; +-----+ | +; | b2 | <+ +; +-----+ +; +; CHECK-LABEL: func1: +; CHECK: %b0 +; CHECK: %b1 +; CHECK: %b2 +; +; CHECK2-LABEL: func1: +; CHECK2: %b0 +; CHECK2: %b2 +; CHECK2: %b1 + +b0: + %call = call zeroext i1 @a() + br i1 %call, label %b1, label %b2, !prof !1 + +b1: + call void @d() + call void @d() + call void @d() + br label %b2 + +b2: + call void @e() + ret void +} + +define void @func_loop() minsize !prof !9 { +; Test that the algorithm can rotate loops in the presence of profile data. 
+; +; +--------+ +; | entry | +; +--------+ +; | +; | 1 +; v +; +--------+ 16 +--------+ +; | if.then| <---- | header | <+ +; +--------+ +--------+ | +; | | | +; | | 160 | +; | v | +; | +--------+ | +; | | if.else| | 175 +; | +--------+ | +; | | | +; | | 160 | +; | v | +; | 16 +--------+ | +; +------------> | if.end | -+ +; +--------+ +; | +; | 1 +; v +; +--------+ +; | end | +; +--------+ +; +; CHECK-LABEL: func_loop: +; CHECK: %entry +; CHECK: %header +; CHECK: %if.then +; CHECK: %if.else +; CHECK: %if.end +; CHECK: %end +; +; CHECK2-LABEL: func_loop: +; CHECK2: %entry +; CHECK2: %header +; CHECK2: %if.else +; CHECK2: %if.end +; CHECK2: %if.then +; CHECK2: %end + +entry: + br label %header + +header: + call void @e() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !10 + +if.then: + call void @f() + br label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header, label %end + +end: + ret void +} + + +declare zeroext i1 @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() +declare void @g() +declare void @f() +declare void @h() + +!1 = !{!"branch_weights", i32 10, i32 10000} +!9 = !{!"function_entry_count", i64 1} +!10 = !{!"branch_weights", i32 16, i32 160} From 8666e12fd5e9617214de3eb65ddfb3a66586bc83 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Sat, 28 Sep 2024 15:34:38 -0700 Subject: [PATCH 2/3] addressing comments --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 40 ++++++++-------- .../X86/code_placement_ext_tsp_size.ll | 48 +++++++++---------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index c468a662c007f..bc4232eb892a3 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3566,10 +3566,10 @@ bool 
MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (EnableExtTspBlockPlacement && (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) && MF.size() <= ExtTspBlockPlacementMaxBlocks) { - applyExtTsp(false); + applyExtTsp(/*OptForSize=*/false); createCFGChainExtTsp(); } else if (UseExtTspForSize) { - applyExtTsp(true); + applyExtTsp(/*OptForSize=*/true); createCFGChainExtTsp(); } } @@ -3607,9 +3607,9 @@ void MachineBlockPlacement::applyExtTsp(bool OptForSize) { CurrentBlockOrder.push_back(&MBB); } - std::vector BlockCounts(F->size()); - std::vector BlockSizes(F->size()); - std::vector JumpCounts; + SmallVector BlockCounts(F->size()); + SmallVector BlockSizes(F->size()); + SmallVector JumpCounts; SmallVector Cond; // For analyzeBranch. SmallVector Succs; for (MachineBasicBlock &MBB : *F) { @@ -3626,23 +3626,18 @@ void MachineBlockPlacement::applyExtTsp(bool OptForSize) { instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end()); size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end()); BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts; - // Getting jump frequencies. - if (!OptForSize) { - for (MachineBasicBlock *Succ : MBB.successors()) { - auto EP = MBPI->getEdgeProbability(&MBB, Succ); - BlockFrequency JumpFreq = BlockFreq * EP; - JumpCounts.push_back( - {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()}); - } - } else { + // Getting jump frequencies. + if (OptForSize) { Cond.clear(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. if (TII->analyzeBranch(MBB, TBB, FBB, Cond)) continue; const MachineBasicBlock *FTB = MBB.getFallThrough(); - + // Succs is a collection of distinct destinations of the block reachable + // from MBB via a jump instruction; initialize the list using the three + // (not necessarily distinct) blocks, FTB, TBB, and FBB.
Succs.clear(); if (TBB && TBB != FTB) Succs.push_back(TBB); @@ -3654,17 +3649,23 @@ void MachineBlockPlacement::applyExtTsp(bool OptForSize) { // optimization; prioritize slightly jumps with a single successor, since // the corresponding jump instruction will be removed from the binary. const uint64_t Freq = Succs.size() == 1 ? 110 : 100; - for (const MachineBasicBlock *Succ : Succs) { + for (const MachineBasicBlock *Succ : Succs) JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq}); + } else { + for (MachineBasicBlock *Succ : MBB.successors()) { + auto EP = MBPI->getEdgeProbability(&MBB, Succ); + BlockFrequency JumpFreq = BlockFreq * EP; + JumpCounts.push_back( + {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()}); } } } LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size() << " with profile = " << F->getFunction().hasProfileData() - << " (" << F->getName().str() << ")" << "\n"); + << " (" << F->getName() << ")" << "\n"); - const double OrgScore = calcExtTspScore(BlockSizes, BlockCounts, JumpCounts); + const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts); LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore)); // Run the layout algorithm. @@ -3674,8 +3675,7 @@ void MachineBlockPlacement::applyExtTsp(bool OptForSize) { for (uint64_t Node : NewOrder) { NewBlockOrder.push_back(CurrentBlockOrder[Node]); } - const double OptScore = - calcExtTspScore(NewOrder, BlockSizes, BlockCounts, JumpCounts); + const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts); LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore)); // If the optimization is unsuccessful, fall back to the original block order. 
diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll index a0f9a8af3cc6c..59eaf2586f173 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll @@ -1,5 +1,5 @@ -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true < %s | FileCheck %s -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK2 +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true < %s | FileCheck %s -check-prefix=CHECK-PERF +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK-SIZE define void @func1() minsize { ; @@ -19,15 +19,15 @@ define void @func1() minsize { ; | b2 | <+ ; +-----+ ; -; CHECK-LABEL: func1: -; CHECK: %b0 -; CHECK: %b1 -; CHECK: %b2 +; CHECK-PERF-LABEL: func1: +; CHECK-PERF: %b0 +; CHECK-PERF: %b1 +; CHECK-PERF: %b2 ; -; CHECK2-LABEL: func1: -; CHECK2: %b0 -; CHECK2: %b2 -; CHECK2: %b1 +; CHECK-SIZE-LABEL: func1: +; CHECK-SIZE: %b0 +; CHECK-SIZE: %b2 +; CHECK-SIZE: %b1 b0: %call = call zeroext i1 @a() @@ -75,21 +75,21 @@ define void @func_loop() minsize !prof !9 { ; | end | ; +--------+ ; -; CHECK-LABEL: func_loop: -; CHECK: %entry -; CHECK: %header -; CHECK: %if.then -; CHECK: %if.else -; CHECK: %if.end -; CHECK: %end +; CHECK-PERF-LABEL: func_loop: +; CHECK-PERF: %entry +; CHECK-PERF: %header +; CHECK-PERF: %if.then +; CHECK-PERF: %if.else +; CHECK-PERF: %if.end +; CHECK-PERF: %end ; -; CHECK2-LABEL: func_loop: -; CHECK2: %entry -; CHECK2: %header -; CHECK2: %if.else -; CHECK2: %if.end -; CHECK2: %if.then -; CHECK2: %end +; CHECK-SIZE-LABEL: func_loop: +; CHECK-SIZE: %entry +; CHECK-SIZE: %header +; CHECK-SIZE: %if.else +; CHECK-SIZE: %if.end +; CHECK-SIZE: %if.then +; CHECK-SIZE: %end entry: br label %header From 0b9b2210dc985dae24c3abde8d25a94acf3ccb78 Mon Sep 17 00:00:00 2001 
From: spupyrev Date: Tue, 1 Oct 2024 13:20:20 -0700 Subject: [PATCH 3/3] adjusting ext-tsp checks as suggested --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 31 ++++++++++++---------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index bc4232eb892a3..c42e63202c3b5 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3513,10 +3513,17 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { const bool OptForSize = MF.getFunction().hasOptSize() || llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); - // Use ext-tsp for size optimization is possible only when the function - // contains more than two basic blocks. - const bool UseExtTspForSize = - OptForSize && ApplyExtTspForSize && MF.size() >= 3; + // Determine whether to use ext-tsp for perf/size optimization. The method + // is beneficial only for instances with at least 3 basic blocks and it can be + // disabled for huge functions (exceeding a certain size). + bool UseExtTspForPerf = false; + bool UseExtTspForSize = false; + if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) { + UseExtTspForPerf = + EnableExtTspBlockPlacement && + (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()); + UseExtTspForSize = OptForSize && ApplyExtTspForSize; + } // Apply tail duplication. if (allowTailDupPlacement()) { @@ -3562,16 +3569,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { // Apply a post-processing optimizing block placement: // - find a new placement and modify the layout of the blocks in the function; // - re-create CFG chains so that we can optimizeBranches and alignBlocks. 
- if (MF.size() >= 3) { - if (EnableExtTspBlockPlacement && - (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) && - MF.size() <= ExtTspBlockPlacementMaxBlocks) { - applyExtTsp(/*OptForSize=*/false); - createCFGChainExtTsp(); - } else if (UseExtTspForSize) { - applyExtTsp(/*OptForSize=*/true); - createCFGChainExtTsp(); - } + if (UseExtTspForPerf || UseExtTspForSize) { + assert( + !(UseExtTspForPerf && UseExtTspForSize) && + "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously"); + applyExtTsp(/*OptForSize=*/UseExtTspForSize); + createCFGChainExtTsp(); } optimizeBranches();