From f6cadae163f3d5d29a12ec11c92aca90a8d742e4 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 16 Jun 2023 15:16:38 +1000 Subject: [PATCH 1/4] Streamline some comments. --- compiler/rustc_monomorphize/src/partitioning.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 531644f0b8490..cc4ea16cf50c0 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -175,10 +175,9 @@ where debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats); } - // In the next step, we use the inlining map to determine which additional - // monomorphizations have to go into each codegen unit. These additional - // monomorphizations can be drop-glue, functions from external crates, and - // local functions the definition of which is marked with `#[inline]`. + // Use the usage map to put additional mono items in each codegen unit: + // drop-glue, functions from external crates, and local functions the + // definition of which is marked with `#[inline]`. { let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_inline_items"); place_inlined_mono_items(cx, &mut codegen_units); @@ -190,8 +189,8 @@ where debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats); } - // Next we try to make as many symbols "internal" as possible, so LLVM has - // more freedom to optimize. + // Make as many symbols "internal" as possible, so LLVM has more freedom to + // optimize. if !tcx.sess.link_dead_code() { let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_internalize_symbols"); internalize_symbols(cx, &mut codegen_units, internalization_candidates); From 6f228e3420787f123aff9b06247fefcb91a941ed Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 16 Jun 2023 15:17:01 +1000 Subject: [PATCH 2/4] Inline before merging CGUs. Because CGU merging relies on CGU sizes, but the CGU sizes before inlining aren't accurate. This requires tweaking how the sizes are updated during merging: if CGU A and B both have an inlined function F, then `size(A + B)` will be a little less than `size(A) + size(B)`, because `A + B` will only have one copy of F. Also, the minimum CGU size is increased because it now has to account for inlined functions. This change doesn't have much effect on compile perf, but it makes follow-on changes that involve more sophisticated reasoning about CGU sizes much easier. --- compiler/rustc_middle/src/mir/mono.rs | 4 --- .../rustc_monomorphize/src/partitioning.rs | 26 ++++++++++--------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs index 1511e25523559..1cbf1a36283f5 100644 --- a/compiler/rustc_middle/src/mir/mono.rs +++ b/compiler/rustc_middle/src/mir/mono.rs @@ -335,10 +335,6 @@ impl<'tcx> CodegenUnit<'tcx> { .expect("create_size_estimate must be called before getting a size_estimate") } - pub fn modify_size_estimate(&mut self, delta: usize) { - *self.size_estimate.as_mut().unwrap() += delta; - } - pub fn contains_item(&self, item: &MonoItem<'tcx>) -> bool { self.items().contains_key(item) } diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index cc4ea16cf50c0..3be0908fc7f01 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -166,15 +166,6 @@ where placed }; - // Merge until we have at most `max_cgu_count` codegen units. - // `merge_codegen_units` is responsible for updating the CGU size - // estimates. - { - let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus"); - merge_codegen_units(cx, &mut codegen_units); - debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats); - } - // Use the usage map to put additional mono items in each codegen unit: // drop-glue, functions from external crates, and local functions the // definition of which is marked with `#[inline]`. @@ -189,6 +180,15 @@ where debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats); } + // Merge until we have at most `max_cgu_count` codegen units. + // `merge_codegen_units` is responsible for updating the CGU size + // estimates. + { + let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus"); + merge_codegen_units(cx, &mut codegen_units); + debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats); + } + // Make as many symbols "internal" as possible, so LLVM has more freedom to // optimize. if !tcx.sess.link_dead_code() { @@ -313,7 +313,7 @@ fn merge_codegen_units<'tcx>( // worse generated code. So we don't allow CGUs smaller than this (unless // there is just one CGU, of course). Note that CGU sizes of 100,000+ are // common in larger programs, so this isn't all that large. - const NON_INCR_MIN_CGU_SIZE: usize = 1000; + const NON_INCR_MIN_CGU_SIZE: usize = 1800; // Repeatedly merge the two smallest codegen units as long as: // - we have more CGUs than the upper limit, or @@ -337,9 +337,11 @@ fn merge_codegen_units<'tcx>( let mut smallest = codegen_units.pop().unwrap(); let second_smallest = codegen_units.last_mut().unwrap(); - // Move the mono-items from `smallest` to `second_smallest` - second_smallest.modify_size_estimate(smallest.size_estimate()); + // Move the items from `smallest` to `second_smallest`. Some of them + // may be duplicate inlined items, in which case the destination CGU is + // unaffected. Recalculate size estimates afterwards. second_smallest.items_mut().extend(smallest.items_mut().drain()); + second_smallest.create_size_estimate(cx.tcx); // Record that `second_smallest` now contains all the stuff that was // in `smallest` before. From 105ac1c26d5ea9de84f6b4af583e85eb8991e749 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 21 Jun 2023 16:04:48 +1000 Subject: [PATCH 3/4] Merge root and inlined item placement. There's no longer any need for them to be separate, and putting them together reduces the amount of code. --- .../rustc_monomorphize/src/partitioning.rs | 115 +++++++----------- 1 file changed, 44 insertions(+), 71 deletions(-) diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 3be0908fc7f01..4e9a4f91ddf85 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -125,7 +125,7 @@ struct PartitioningCx<'a, 'tcx> { usage_map: &'a UsageMap<'tcx>, } -struct PlacedRootMonoItems<'tcx> { +struct PlacedMonoItems<'tcx> { /// The codegen units, sorted by name to make things deterministic. codegen_units: Vec>, @@ -150,36 +150,20 @@ where let cx = &PartitioningCx { tcx, usage_map }; - // In the first step, we place all regular monomorphizations into their - // respective 'home' codegen unit. Regular monomorphizations are all - // functions and statics defined in the local crate. - let PlacedRootMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = { - let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_roots"); - let mut placed = place_root_mono_items(cx, mono_items); + // Place all mono items into a codegen unit. + let PlacedMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = { + let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_items"); + let mut placed = place_mono_items(cx, mono_items); for cgu in &mut placed.codegen_units { cgu.create_size_estimate(tcx); } - debug_dump(tcx, "ROOTS", &placed.codegen_units, placed.unique_inlined_stats); + debug_dump(tcx, "PLACE", &placed.codegen_units, placed.unique_inlined_stats); placed }; - // Use the usage map to put additional mono items in each codegen unit: - // drop-glue, functions from external crates, and local functions the - // definition of which is marked with `#[inline]`. - { - let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_inline_items"); - place_inlined_mono_items(cx, &mut codegen_units); - - for cgu in &mut codegen_units { - cgu.create_size_estimate(tcx); - } - - debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats); - } - // Merge until we have at most `max_cgu_count` codegen units. // `merge_codegen_units` is responsible for updating the CGU size // estimates. @@ -211,10 +195,7 @@ where codegen_units } -fn place_root_mono_items<'tcx, I>( - cx: &PartitioningCx<'_, 'tcx>, - mono_items: I, -) -> PlacedRootMonoItems<'tcx> +fn place_mono_items<'tcx, I>(cx: &PartitioningCx<'_, 'tcx>, mono_items: I) -> PlacedMonoItems<'tcx> where I: Iterator>, { @@ -235,6 +216,8 @@ where let mut num_unique_inlined_items = 0; let mut unique_inlined_items_size = 0; for mono_item in mono_items { + // Handle only root items directly here. Inlined items are handled at + // the bottom of the loop based on reachability. match mono_item.instantiation_mode(cx.tcx) { InstantiationMode::GloballyShared { .. } => {} InstantiationMode::LocalCopy => { @@ -247,7 +230,7 @@ where let characteristic_def_id = characteristic_def_id_of_mono_item(cx.tcx, mono_item); let is_volatile = is_incremental_build && mono_item.is_generic_fn(); - let codegen_unit_name = match characteristic_def_id { + let cgu_name = match characteristic_def_id { Some(def_id) => compute_codegen_unit_name( cx.tcx, cgu_name_builder, @@ -258,9 +241,7 @@ where None => fallback_cgu_name(cgu_name_builder), }; - let codegen_unit = codegen_units - .entry(codegen_unit_name) - .or_insert_with(|| CodegenUnit::new(codegen_unit_name)); + let cgu = codegen_units.entry(cgu_name).or_insert_with(|| CodegenUnit::new(cgu_name)); let mut can_be_internalized = true; let (linkage, visibility) = mono_item_linkage_and_visibility( @@ -273,23 +254,52 @@ where internalization_candidates.insert(mono_item); } - codegen_unit.items_mut().insert(mono_item, (linkage, visibility)); + cgu.items_mut().insert(mono_item, (linkage, visibility)); + + // Get all inlined items that are reachable from `mono_item` without + // going via another root item. This includes drop-glue, functions from + // external crates, and local functions the definition of which is + // marked with `#[inline]`. + let mut reachable_inlined_items = FxHashSet::default(); + get_reachable_inlined_items(cx.tcx, mono_item, cx.usage_map, &mut reachable_inlined_items); + + // Add those inlined items. It's possible an inlined item is reachable + // from multiple root items within a CGU, which is fine, it just means + // the `insert` will be a no-op. + for inlined_item in reachable_inlined_items { + // This is a CGU-private copy. + cgu.items_mut().insert(inlined_item, (Linkage::Internal, Visibility::Default)); + } } // Always ensure we have at least one CGU; otherwise, if we have a // crate with just types (for example), we could wind up with no CGU. if codegen_units.is_empty() { - let codegen_unit_name = fallback_cgu_name(cgu_name_builder); - codegen_units.insert(codegen_unit_name, CodegenUnit::new(codegen_unit_name)); + let cgu_name = fallback_cgu_name(cgu_name_builder); + codegen_units.insert(cgu_name, CodegenUnit::new(cgu_name)); } let mut codegen_units: Vec<_> = codegen_units.into_values().collect(); codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str())); - PlacedRootMonoItems { + return PlacedMonoItems { codegen_units, internalization_candidates, unique_inlined_stats: (num_unique_inlined_items, unique_inlined_items_size), + }; + + fn get_reachable_inlined_items<'tcx>( + tcx: TyCtxt<'tcx>, + item: MonoItem<'tcx>, + usage_map: &UsageMap<'tcx>, + visited: &mut FxHashSet>, + ) { + usage_map.for_each_inlined_used_item(tcx, item, |inlined_item| { + let is_new = visited.insert(inlined_item); + if is_new { + get_reachable_inlined_items(tcx, inlined_item, usage_map, visited); + } + }); } } @@ -407,43 +417,6 @@ fn merge_codegen_units<'tcx>( codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str())); } -fn place_inlined_mono_items<'tcx>( - cx: &PartitioningCx<'_, 'tcx>, - codegen_units: &mut [CodegenUnit<'tcx>], -) { - for cgu in codegen_units.iter_mut() { - // Collect all inlined items that need to be available in this codegen unit. - let mut reachable_inlined_items = FxHashSet::default(); - for root in cgu.items().keys() { - // Get all inlined items that are reachable from it without going - // via another root item. - get_reachable_inlined_items(cx.tcx, *root, cx.usage_map, &mut reachable_inlined_items); - } - - // Add all monomorphizations that are not already there. - for inlined_item in reachable_inlined_items { - assert!(!cgu.items().contains_key(&inlined_item)); - - // This is a CGU-private copy. - cgu.items_mut().insert(inlined_item, (Linkage::Internal, Visibility::Default)); - } - } - - fn get_reachable_inlined_items<'tcx>( - tcx: TyCtxt<'tcx>, - item: MonoItem<'tcx>, - usage_map: &UsageMap<'tcx>, - visited: &mut FxHashSet>, - ) { - usage_map.for_each_inlined_used_item(tcx, item, |inlined_item| { - let is_new = visited.insert(inlined_item); - if is_new { - get_reachable_inlined_items(tcx, inlined_item, usage_map, visited); - } - }); - } -} - fn internalize_symbols<'tcx>( cx: &PartitioningCx<'_, 'tcx>, codegen_units: &mut [CodegenUnit<'tcx>], From abde9ba527704449ae84848673b52a8ab0f5cbd1 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 21 Jun 2023 16:38:47 +1000 Subject: [PATCH 4/4] Tweak CGU size estimate code. - Rename `create_size_estimate` as `compute_size_estimate`, because that makes more sense for the second and subsequent calls for each CGU. - Change `CodegenUnit::size_estimate` from `Option` to `usize`. We can still assert that `compute_size_estimate` is called first. - Move the size estimation for `place_mono_items` inside the function, for consistency with `merge_codegen_units`. --- compiler/rustc_middle/src/mir/mono.rs | 16 +++++++++------- compiler/rustc_monomorphize/src/partitioning.rs | 15 ++++++++------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs index 1cbf1a36283f5..ca735d5231465 100644 --- a/compiler/rustc_middle/src/mir/mono.rs +++ b/compiler/rustc_middle/src/mir/mono.rs @@ -231,7 +231,7 @@ pub struct CodegenUnit<'tcx> { /// as well as the crate name and disambiguator. name: Symbol, items: FxHashMap, (Linkage, Visibility)>, - size_estimate: Option, + size_estimate: usize, primary: bool, /// True if this is CGU is used to hold code coverage information for dead code, /// false otherwise. @@ -269,7 +269,7 @@ impl<'tcx> CodegenUnit<'tcx> { CodegenUnit { name, items: Default::default(), - size_estimate: None, + size_estimate: 0, primary: false, is_code_coverage_dead_code_cgu: false, } @@ -320,19 +320,21 @@ impl<'tcx> CodegenUnit<'tcx> { base_n::encode(hash, base_n::CASE_INSENSITIVE) } - pub fn create_size_estimate(&mut self, tcx: TyCtxt<'tcx>) { + pub fn compute_size_estimate(&mut self, tcx: TyCtxt<'tcx>) { // Estimate the size of a codegen unit as (approximately) the number of MIR // statements it corresponds to. - self.size_estimate = Some(self.items.keys().map(|mi| mi.size_estimate(tcx)).sum()); + self.size_estimate = self.items.keys().map(|mi| mi.size_estimate(tcx)).sum(); } #[inline] - /// Should only be called if [`create_size_estimate`] has previously been called. + /// Should only be called if [`compute_size_estimate`] has previously been called. /// - /// [`create_size_estimate`]: Self::create_size_estimate + /// [`compute_size_estimate`]: Self::compute_size_estimate pub fn size_estimate(&self) -> usize { + // Items are never zero-sized, so if we have items the estimate must be + // non-zero, unless we forgot to call `compute_size_estimate` first. + assert!(self.items.is_empty() || self.size_estimate != 0); self.size_estimate - .expect("create_size_estimate must be called before getting a size_estimate") } pub fn contains_item(&self, item: &MonoItem<'tcx>) -> bool { diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 4e9a4f91ddf85..97e3748b8b826 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -150,14 +150,11 @@ where let cx = &PartitioningCx { tcx, usage_map }; - // Place all mono items into a codegen unit. + // Place all mono items into a codegen unit. `place_mono_items` is + // responsible for initializing the CGU size estimates. let PlacedMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = { let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_items"); - let mut placed = place_mono_items(cx, mono_items); - - for cgu in &mut placed.codegen_units { - cgu.create_size_estimate(tcx); - } + let placed = place_mono_items(cx, mono_items); debug_dump(tcx, "PLACE", &placed.codegen_units, placed.unique_inlined_stats); @@ -282,6 +279,10 @@ where let mut codegen_units: Vec<_> = codegen_units.into_values().collect(); codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str())); + for cgu in codegen_units.iter_mut() { + cgu.compute_size_estimate(cx.tcx); + } + return PlacedMonoItems { codegen_units, internalization_candidates, @@ -351,7 +352,7 @@ fn merge_codegen_units<'tcx>( // may be duplicate inlined items, in which case the destination CGU is // unaffected. Recalculate size estimates afterwards. second_smallest.items_mut().extend(smallest.items_mut().drain()); - second_smallest.create_size_estimate(cx.tcx); + second_smallest.compute_size_estimate(cx.tcx); // Record that `second_smallest` now contains all the stuff that was // in `smallest` before.