diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index d1dced9ef28dc..bdad63f368dfe 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -2906,7 +2906,7 @@ void MachineBlockPlacement::buildCFGChains() { void MachineBlockPlacement::optimizeBranches() { BlockChain &FunctionChain = *BlockToChain[&F->front()]; - SmallVector<MachineOperand, 4> Cond; // For analyzeBranch. + SmallVector<MachineOperand, 4> Cond; // Now that all the basic blocks in the chain have the proper layout, // make a final call to analyzeBranch with AllowModify set. @@ -2916,24 +2916,30 @@ void MachineBlockPlacement::optimizeBranches() { // a fallthrough when it occurs after predicated terminators. for (MachineBasicBlock *ChainBB : FunctionChain) { Cond.clear(); - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. - if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) { - // If PrevBB has a two-way branch, try to re-order the branches - // such that we branch to the successor with higher probability first. - if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeProbability(ChainBB, FBB) > - MBPI->getEdgeProbability(ChainBB, TBB) && - !TII->reverseBranchCondition(Cond)) { - LLVM_DEBUG(dbgs() << "Reverse order of the two branches: " - << getBlockName(ChainBB) << "\n"); - LLVM_DEBUG(dbgs() << " Edge probability: " - << MBPI->getEdgeProbability(ChainBB, FBB) << " vs " - << MBPI->getEdgeProbability(ChainBB, TBB) << "\n"); - DebugLoc dl; // FIXME: this is nowhere - TII->removeBranch(*ChainBB); - TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl); - } - } + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) + continue; + if (!TBB || !FBB || Cond.empty()) + continue; + // If we are optimizing for size we do not consider the runtime performance.
+ // Instead, we retain the original branch condition so we have more uniform + // instructions which will benefit ICF. + if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get())) + continue; + // If ChainBB has a two-way branch, try to re-order the branches + // such that we branch to the successor with higher probability first. + if (MBPI->getEdgeProbability(ChainBB, TBB) >= + MBPI->getEdgeProbability(ChainBB, FBB)) + continue; + if (TII->reverseBranchCondition(Cond)) + continue; + LLVM_DEBUG(dbgs() << "Reverse order of the two branches: " + << getBlockName(ChainBB) << "\n"); + LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB) + << "\n"); + auto Dl = ChainBB->findBranchDebugLoc(); + TII->removeBranch(*ChainBB); + TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl); } } diff --git a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll new file mode 100644 index 0000000000000..3645718968f9e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 | FileCheck %s + +; When consuming profile data we sometimes flip a branch to improve runtime +; performance. If we are optimizing for size, we avoid changing the branch to +; improve outlining and ICF. 
+ +define i8 @foo_optsize(i32 %v4) optsize { +; CHECK-LABEL: foo_optsize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz wzr, .LBB0_2 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: // %b1 +; CHECK-NEXT: cbnz w0, .LBB0_4 +; CHECK-NEXT: .LBB0_3: // %b2 +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_4: // %b1 +; CHECK-NEXT: cmp w0, #1 +; CHECK-NEXT: b.ne .LBB0_1 +; CHECK-NEXT: // %bb.5: // %b3 +; CHECK-NEXT: cbz wzr, .LBB0_1 +; CHECK-NEXT: b .LBB0_3 +entry: + %v2 = icmp eq i32 0, 0 + br i1 %v2, label %b1, label %b4 + +b1: + switch i32 %v4, label %b4 [ + i32 1, label %b3 + i32 0, label %b2 + ], !prof !0 + +b2: + br label %b4 + +b3: + %v3 = icmp eq i32 0, 0 + br i1 %v3, label %b4, label %b2 + +b4: + %v16 = phi i8 [ 1, %b2 ], [ 0, %entry ], [ 0, %b3 ], [ 0, %b1 ] + ret i8 %v16 +} + +define i8 @foo_optspeed(i32 %v4) { +; CHECK-LABEL: foo_optspeed: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz wzr, .LBB1_2 +; CHECK-NEXT: .LBB1_1: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_2: // %b1 +; CHECK-NEXT: cbnz w0, .LBB1_4 +; CHECK-NEXT: .LBB1_3: // %b2 +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_4: // %b1 +; CHECK-NEXT: cmp w0, #1 +; CHECK-NEXT: b.ne .LBB1_1 +; CHECK-NEXT: // %bb.5: // %b3 +; CHECK-NEXT: cbnz wzr, .LBB1_3 +; CHECK-NEXT: b .LBB1_1 +entry: + %v2 = icmp eq i32 0, 0 + br i1 %v2, label %b1, label %b4 + +b1: + switch i32 %v4, label %b4 [ + i32 1, label %b3 + i32 0, label %b2 + ], !prof !0 + +b2: + br label %b4 + +b3: + %v3 = icmp eq i32 0, 0 + br i1 %v3, label %b4, label %b2 + +b4: + %v16 = phi i8 [ 1, %b2 ], [ 0, %entry ], [ 0, %b3 ], [ 0, %b1 ] + ret i8 %v16 +} + +!0 = !{!"branch_weights", i32 5, i32 5, i32 100} diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll index 88a132d3850d1..9e0a19f9a504f 100644 --- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ 
b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -303,10 +303,10 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize ; CHECK32-NEXT: .LBB3_8: # %if.else ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK32-NEXT: movl %esi, %ebx # encoding: [0x89,0xf3] -; CHECK32-NEXT: jb .LBB3_11 # encoding: [0x72,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 -; CHECK32-NEXT: jmp .LBB3_9 # encoding: [0xeb,A] +; CHECK32-NEXT: jae .LBB3_9 # encoding: [0x73,A] ; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1 +; CHECK32-NEXT: jmp .LBB3_11 # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 ; CHECK32-NEXT: .LBB3_12: # %sw.bb22 ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK32-NEXT: movzbl (%eax), %ebx # encoding: [0x0f,0xb6,0x18] @@ -483,10 +483,10 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize ; WIN64-NEXT: # %bb.6: # %sw.bb ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: cmpl $45, %r9d # encoding: [0x41,0x83,0xf9,0x2d] -; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A] -; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 -; WIN64-NEXT: jmp .LBB3_8 # encoding: [0xeb,A] +; WIN64-NEXT: jne .LBB3_8 # encoding: [0x75,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1 +; WIN64-NEXT: jmp .LBB3_10 # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; WIN64-NEXT: .LBB3_7: # %sw.bb14 ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]