Skip to content

[BOLT] Add pre-aggregated trace support #127125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions bolt/include/bolt/Profile/DataAggregator.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class DataAggregator : public DataReader {

/// Used for parsing specific pre-aggregated input files.
struct AggregatedLBREntry {
enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN };
enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN, TRACE };
Location From;
Location To;
uint64_t Count;
Expand Down Expand Up @@ -197,6 +197,10 @@ class DataAggregator : public DataReader {

BoltAddressTranslation *BAT{nullptr};

/// Whether the pre-aggregated profile needs its return branch profile
/// converted into call-to-continuation fall-through profile.
bool NeedsConvertRetProfileToCallCont{false};

/// Update function execution profile with a recorded trace.
/// A trace is a region of code executed between two LBR entries supplied in
/// execution order.
Expand Down Expand Up @@ -268,8 +272,7 @@ class DataAggregator : public DataReader {
uint64_t Mispreds);

/// Register a \p Branch.
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds,
bool IsPreagg);
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);

/// Register a trace between two LBR entries supplied in execution order.
bool doTrace(const LBREntry &First, const LBREntry &Second,
Expand Down Expand Up @@ -298,7 +301,7 @@ class DataAggregator : public DataReader {
ErrorOr<PerfMemSample> parseMemSample();

/// Parse pre-aggregated LBR samples created by an external tool
ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
std::error_code parseAggregatedLBREntry();

/// Parse either buildid:offset or just offset, representing a location in the
/// binary. Used exclusively for pre-aggregated LBR samples.
Expand Down Expand Up @@ -384,14 +387,15 @@ class DataAggregator : public DataReader {
/// memory.
///
/// File format syntax:
/// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
/// [<mispred_count>]
/// {B|F|f|T} [<start_id>:]<start_offset> [<end_id>:]<end_offset> [<ft_end>]
/// <count> [<mispred_count>]
///
/// B - indicates an aggregated branch
/// F - an aggregated fall-through
/// f - an aggregated fall-through with external origin - used to disambiguate
/// between a return hitting a basic block head and a regular internal
/// jump to the block
/// T - an aggregated trace: branch with a fall-through (from, to, ft_end)
///
/// <start_id> - build id of the object containing the start address. We can
/// skip it for the main binary and use "X" for an unknown object. This will
Expand All @@ -402,6 +406,8 @@ class DataAggregator : public DataReader {
///
/// <end_id>, <end_offset> - same for the end address.
///
/// <ft_end> - same for the fall-through end address (T entries only).
///
/// <count> - total aggregated count of the branch, fall-through, or trace.
///
/// <mispred_count> - the number of times the branch was mispredicted.
Expand Down
66 changes: 45 additions & 21 deletions bolt/lib/Profile/DataAggregator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -711,7 +711,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
}

bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
uint64_t Mispreds, bool IsPreagg) {
uint64_t Mispreds) {
// Returns whether \p Offset in \p Func contains a return instruction.
auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
Expand Down Expand Up @@ -772,7 +772,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
return false;

// Record call to continuation trace.
if (IsPreagg && FromFunc != ToFunc && (IsReturn || IsCallCont)) {
if (NeedsConvertRetProfileToCallCont && FromFunc != ToFunc &&
(IsReturn || IsCallCont)) {
LBREntry First{ToOrig - 1, ToOrig - 1, false};
LBREntry Second{ToOrig, ToOrig, false};
return doTrace(First, Second, Count);
Expand Down Expand Up @@ -1216,23 +1217,30 @@ ErrorOr<Location> DataAggregator::parseLocationOrOffset() {
return Location(true, BuildID.get(), Offset.get());
}

ErrorOr<DataAggregator::AggregatedLBREntry>
DataAggregator::parseAggregatedLBREntry() {
std::error_code DataAggregator::parseAggregatedLBREntry() {
while (checkAndConsumeFS()) {
}

ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator);
if (std::error_code EC = TypeOrErr.getError())
return EC;
// Pre-aggregated profile with branches and fallthroughs needs to convert
// return profile into call to continuation fall-through.
auto Type = AggregatedLBREntry::BRANCH;
if (TypeOrErr.get() == "B") {
NeedsConvertRetProfileToCallCont = true;
Type = AggregatedLBREntry::BRANCH;
} else if (TypeOrErr.get() == "F") {
NeedsConvertRetProfileToCallCont = true;
Type = AggregatedLBREntry::FT;
} else if (TypeOrErr.get() == "f") {
NeedsConvertRetProfileToCallCont = true;
Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
} else if (TypeOrErr.get() == "T") {
// Trace is expanded into B and [Ff]
Type = AggregatedLBREntry::TRACE;
} else {
reportError("expected B, F or f");
reportError("expected T, B, F or f");
return make_error_code(llvm::errc::io_error);
}

Expand All @@ -1248,6 +1256,15 @@ DataAggregator::parseAggregatedLBREntry() {
if (std::error_code EC = To.getError())
return EC;

ErrorOr<Location> TraceFtEnd = std::error_code();
if (Type == AggregatedLBREntry::TRACE) {
while (checkAndConsumeFS()) {
}
TraceFtEnd = parseLocationOrOffset();
if (std::error_code EC = TraceFtEnd.getError())
return EC;
}

while (checkAndConsumeFS()) {
}
ErrorOr<int64_t> Frequency =
Expand All @@ -1270,9 +1287,24 @@ DataAggregator::parseAggregatedLBREntry() {
return make_error_code(llvm::errc::io_error);
}

return AggregatedLBREntry{From.get(), To.get(),
static_cast<uint64_t>(Frequency.get()), Mispreds,
Type};
BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From->Offset);
BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To->Offset);

for (BinaryFunction *BF : {FromFunc, ToFunc})
if (BF)
BF->setHasProfileAvailable();

uint64_t Count = static_cast<uint64_t>(Frequency.get());
AggregatedLBREntry Entry{From.get(), To.get(), Count, Mispreds, Type};
AggregatedLBRs.emplace_back(Entry);
if (Type == AggregatedLBREntry::TRACE) {
auto FtType = (FromFunc == ToFunc) ? AggregatedLBREntry::FT
: AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
AggregatedLBREntry TraceFt{To.get(), TraceFtEnd.get(), Count, 0, FtType};
AggregatedLBRs.emplace_back(TraceFt);
}

return std::error_code();
}

bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const {
Expand Down Expand Up @@ -1585,8 +1617,7 @@ void DataAggregator::processBranchEvents() {
for (const auto &AggrLBR : BranchLBRs) {
const Trace &Loc = AggrLBR.first;
const TakenBranchInfo &Info = AggrLBR.second;
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount,
/*IsPreagg*/ false);
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
}
}

Expand Down Expand Up @@ -1722,18 +1753,10 @@ std::error_code DataAggregator::parsePreAggregatedLBRSamples() {
outs() << "PERF2BOLT: parsing pre-aggregated profile...\n";
NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events",
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
while (hasData()) {
ErrorOr<AggregatedLBREntry> AggrEntry = parseAggregatedLBREntry();
if (std::error_code EC = AggrEntry.getError())
while (hasData())
if (std::error_code EC = parseAggregatedLBREntry())
return EC;

for (const uint64_t Addr : {AggrEntry->From.Offset, AggrEntry->To.Offset})
if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr))
BF->setHasProfileAvailable();

AggregatedLBRs.emplace_back(std::move(AggrEntry.get()));
}

return std::error_code();
}

Expand All @@ -1746,8 +1769,9 @@ void DataAggregator::processPreAggregated() {
for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) {
switch (AggrEntry.EntryType) {
case AggregatedLBREntry::BRANCH:
case AggregatedLBREntry::TRACE:
doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count,
AggrEntry.Mispreds, /*IsPreagg*/ true);
AggrEntry.Mispreds);
break;
case AggregatedLBREntry::FT:
case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: {
Expand Down
48 changes: 31 additions & 17 deletions bolt/test/X86/callcont-fallthru.s
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,43 @@
# RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
## Link against a DSO to ensure PLT entries.
# RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
# RUN: link_fdata %s %t %t.pa1 PREAGG
# RUN: link_fdata %s %t %t.pa1 PREAGG1
# RUN: link_fdata %s %t %t.pa2 PREAGG2
# RUN: link_fdata %s %t %t.pa3 PREAGG3
# RUN: link_fdata %s %t %t.pa4 PREAGG4
# RUN: link_fdata %s %t %t.pat PREAGGT1
# RUN: link_fdata %s %t %t.pat2 PREAGGT2

## Check normal case: fallthrough is not LP or secondary entry.
# RUN: llvm-strip --strip-unneeded %t -o %t.exe
# RUN: llvm-bolt %t.exe --pa -p %t.pa1 -o %t.out \
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
# RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
# RUN: llvm-bolt %t.strip --pa -p %t.pa1 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s

## Check that getFallthroughsInTrace correctly handles a trace starting at plt
## call continuation
# RUN: llvm-bolt %t.exe --pa -p %t.pa2 -o %t.out2 \
# RUN: llvm-bolt %t.strip --pa -p %t.pa2 -o %t.out2 \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2

## Check that we don't treat secondary entry points as call continuation sites.
# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3

## Check fallthrough to a landing pad case.
# RUN: llvm-bolt %t.exe --pa -p %t.pa4 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK4
# RUN: llvm-bolt %t.strip --pa -p %t.pa3 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3

## Check pre-aggregated traces attach call continuation fallthrough count
# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s

## Check pre-aggregated traces don't attach call continuation fallthrough count
## to secondary entry point (unstripped)
# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
## Check pre-aggregated traces don't attach call continuation fallthrough count
## to landing pad (stripped, LP)
# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3

.globl foo
.type foo, %function
Expand All @@ -51,8 +66,9 @@ main:
movl %edi, -0x8(%rbp)
movq %rsi, -0x10(%rbp)
callq puts@PLT
## Target is a call continuation
# PREAGG: B X:0 #Ltmp1# 2 0
## Target is an external-origin call continuation
# PREAGG1: B X:0 #Ltmp1# 2 0
# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
# CHECK: callq puts@PLT
# CHECK-NEXT: count: 2

Expand All @@ -63,14 +79,16 @@ Ltmp1:

Ltmp4:
cmpl $0x0, -0x14(%rbp)
Ltmp4_br:
je Ltmp0
# CHECK2: je .Ltmp0
# CHECK2-NEXT: count: 3

movl $0xa, -0x18(%rbp)
callq foo
## Target is a call continuation
# PREAGG: B #Lfoo_ret# #Ltmp3# 1 0
## Target is a binary-local call continuation
# PREAGG1: B #Lfoo_ret# #Ltmp3# 1 0
# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
# CHECK: callq foo
# CHECK-NEXT: count: 1

Expand All @@ -79,16 +97,12 @@ Ltmp4:
# CHECK2: callq foo
# CHECK2-NEXT: count: 3

## Target is a secondary entry point
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
# PREAGG3: B X:0 #Ltmp3# 2 0
# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
# CHECK3: callq foo
# CHECK3-NEXT: count: 0

## Target is a landing pad
# PREAGG4: B X:0 #Ltmp3# 2 0
# CHECK4: callq puts@PLT
# CHECK4-NEXT: count: 0

Ltmp3:
cmpl $0x0, -0x18(%rbp)
Ltmp3_br:
Expand Down
6 changes: 3 additions & 3 deletions bolt/test/link_fdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
fdata_pat = re.compile(r"([01].*) (?P<exec>\d+) (?P<mispred>\d+)")

# Pre-aggregated profile:
# {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
# [<mispred_count>]
preagg_pat = re.compile(r"(?P<type>[BFf]) (?P<offsets_count>.*)")
# {T|B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> [<ft_end>]
# <count> [<mispred_count>]
preagg_pat = re.compile(r"(?P<type>[TBFf]) (?P<offsets_count>.*)")

# No-LBR profile:
# <is symbol?> <closest elf symbol or DSO name> <relative address> <count>
Expand Down