From 278a0ae4e3855dfaa4bb43bd5f35ec292f1d6a23 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 15 Jun 2023 10:54:05 +1000 Subject: [PATCH 1/7] XXX: nicer merging code (extend instead of insert) --- compiler/rustc_monomorphize/src/partitioning.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index a74ba8e4a4be9..531644f0b8490 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -340,9 +340,7 @@ fn merge_codegen_units<'tcx>( // Move the mono-items from `smallest` to `second_smallest` second_smallest.modify_size_estimate(smallest.size_estimate()); - for (k, v) in smallest.items_mut().drain() { - second_smallest.items_mut().insert(k, v); - } + second_smallest.items_mut().extend(smallest.items_mut().drain()); // Record that `second_smallest` now contains all the stuff that was // in `smallest` before. From 3e127061dad7c119c6b3ba77d625b33a4f5a60d8 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 15 Jun 2023 11:30:04 +1000 Subject: [PATCH 2/7] XXX: enable eprintln --- compiler/rustc_monomorphize/src/partitioning.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 531644f0b8490..313c38054851a 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -989,7 +989,7 @@ fn debug_dump<'a, 'tcx: 'a>( } }; - debug!("{}", dump()); + eprintln!("{}", dump()); } #[inline(never)] // give this a place in the profiler From 5d06126d377423b75fdfb2452c3c54b89db0db8f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 15 Jun 2023 10:46:13 +1000 Subject: [PATCH 3/7] XXX: inline before merge (including increase to 1500) XXX: split out comment changes in a precursor, eventually --- compiler/rustc_middle/src/mir/mono.rs | 4 -- .../rustc_monomorphize/src/partitioning.rs | 37 ++++++++++--------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs index 1511e25523559..1cbf1a36283f5 100644 --- a/compiler/rustc_middle/src/mir/mono.rs +++ b/compiler/rustc_middle/src/mir/mono.rs @@ -335,10 +335,6 @@ impl<'tcx> CodegenUnit<'tcx> { .expect("create_size_estimate must be called before getting a size_estimate") } - pub fn modify_size_estimate(&mut self, delta: usize) { - *self.size_estimate.as_mut().unwrap() += delta; - } - pub fn contains_item(&self, item: &MonoItem<'tcx>) -> bool { self.items().contains_key(item) } diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 313c38054851a..6c37b572dc748 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -166,19 +166,9 @@ where placed }; - // Merge until we have at most `max_cgu_count` codegen units. - // `merge_codegen_units` is responsible for updating the CGU size - // estimates. - { - let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus"); - merge_codegen_units(cx, &mut codegen_units); - debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats); - } - - // In the next step, we use the inlining map to determine which additional - // monomorphizations have to go into each codegen unit. These additional - // monomorphizations can be drop-glue, functions from external crates, and - // local functions the definition of which is marked with `#[inline]`. + // Use the usage map to put additional mono items in each codegen unit: + // drop-glue, functions from external crates, and local functions the + // definition of which is marked with `#[inline]`. { let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_inline_items"); place_inlined_mono_items(cx, &mut codegen_units); @@ -190,8 +180,17 @@ where debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats); } - // Next we try to make as many symbols "internal" as possible, so LLVM has - // more freedom to optimize. + // Merge until we have at most `max_cgu_count` codegen units. + // `merge_codegen_units` is responsible for updating the CGU size + // estimates. + { + let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus"); + merge_codegen_units(cx, &mut codegen_units); + debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats); + } + + // Make as many symbols "internal" as possible, so LLVM has more freedom to + // optimize. if !tcx.sess.link_dead_code() { let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_internalize_symbols"); internalize_symbols(cx, &mut codegen_units, internalization_candidates); @@ -314,7 +313,7 @@ fn merge_codegen_units<'tcx>( // worse generated code. So we don't allow CGUs smaller than this (unless // there is just one CGU, of course). Note that CGU sizes of 100,000+ are // common in larger programs, so this isn't all that large. - const NON_INCR_MIN_CGU_SIZE: usize = 1000; + const NON_INCR_MIN_CGU_SIZE: usize = 1500; // Repeatedly merge the two smallest codegen units as long as: // - we have more CGUs than the upper limit, or @@ -338,9 +337,11 @@ fn merge_codegen_units<'tcx>( let mut smallest = codegen_units.pop().unwrap(); let second_smallest = codegen_units.last_mut().unwrap(); - // Move the mono-items from `smallest` to `second_smallest` - second_smallest.modify_size_estimate(smallest.size_estimate()); + // Move the items from `smallest` to `second_smallest`. Some of them + // may be duplicate inlined items, in which case the destination CGU is + // unaffected. Recalculate size estimates afterwards. second_smallest.items_mut().extend(smallest.items_mut().drain()); + second_smallest.create_size_estimate(cx.tcx); // Record that `second_smallest` now contains all the stuff that was // in `smallest` before. From eb1af88fdd40fc0ba6f807e1c50f7c5d087695be Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 15 Jun 2023 12:16:45 +1000 Subject: [PATCH 4/7] XXX: max-merge --- .../rustc_monomorphize/src/partitioning.rs | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 6c37b572dc748..601ed43cb39fd 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -319,30 +319,70 @@ fn merge_codegen_units<'tcx>( // - we have more CGUs than the upper limit, or // - (Non-incremental builds only) the user didn't specify a CGU count, and // there are multiple CGUs, and some are below the minimum size. + // - njn: update this comment // // The "didn't specify a CGU count" condition is because when an explicit // count is requested we observe it as closely as possible. For example, // the `compiler_builtins` crate sets `codegen-units = 10000` and it's // critical they aren't merged. Also, some tests use explicit small values // and likewise won't work if small CGUs are merged. - while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize() - || (cx.tcx.sess.opts.incremental.is_none() - && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_)) - && codegen_units.len() > 1 - && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE)) - { + //eprintln!("-----"); + loop { + // njn: where to put this? // Sort small cgus to the back. codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate())); + //eprintln!("cgus: {:?}", codegen_units.iter().map(|cgu| cgu.size_estimate()).collect::>()); + + let merge1 = codegen_units.len() > cx.tcx.sess.codegen_units().as_usize(); + + let merge2 = cx.tcx.sess.opts.incremental.is_none() + && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_)) + && codegen_units.len() >= 2 + && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE); + + // njn: addition is an imperfect measure, could be overlap + let merge3 = cx.tcx.sess.opts.incremental.is_none() + && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_)) + && codegen_units.len() >= 3 + && { + // eprintln!( + // "sz: {} >= {} + {}?", + // codegen_units[0].size_estimate(), + // codegen_units[codegen_units.len() - 2].size_estimate(), + // codegen_units[codegen_units.len() - 1].size_estimate()); + + codegen_units[0].size_estimate() + >= codegen_units[codegen_units.len() - 2].size_estimate() + + codegen_units[codegen_units.len() - 1].size_estimate() + }; + + if !(merge1 || merge2 || merge3) { + break; + } + let mut smallest = codegen_units.pop().unwrap(); let second_smallest = codegen_units.last_mut().unwrap(); + // let sm_size = smallest.size_estimate(); + // let sec_sm_size = second_smallest.size_estimate(); + // Move the items from `smallest` to `second_smallest`. Some of them // may be duplicate inlined items, in which case the destination CGU is // unaffected. Recalculate size estimates afterwards. second_smallest.items_mut().extend(smallest.items_mut().drain()); second_smallest.create_size_estimate(cx.tcx); + // eprintln!( + // "merge: {} {} {}: {} + {} -> {}", + // merge1, + // merge2, + // merge3, + // sec_sm_size, + // sm_size, + // second_smallest.size_estimate() + // ); + // Record that `second_smallest` now contains all the stuff that was // in `smallest` before. let mut consumed_cgu_names = cgu_contents.remove(&smallest.name()).unwrap(); From ac47a25aa297f31e678df6494acbbdb9b78939ac Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 15 Jun 2023 15:58:17 +1000 Subject: [PATCH 5/7] XXX: process-by-size --- compiler/rustc_codegen_ssa/src/base.rs | 9 +++------ compiler/rustc_monomorphize/src/partitioning.rs | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs index dc4a28c866ff3..023956a6d291d 100644 --- a/compiler/rustc_codegen_ssa/src/base.rs +++ b/compiler/rustc_codegen_ssa/src/base.rs @@ -38,11 +38,10 @@ use rustc_span::symbol::sym; use rustc_span::Symbol; use rustc_target::abi::{Align, FIRST_VARIANT}; +use std::cmp; use std::collections::BTreeSet; use std::time::{Duration, Instant}; -use itertools::Itertools; - pub fn bin_op_to_icmp_predicate(op: hir::BinOpKind, signed: bool) -> IntPredicate { match op { hir::BinOpKind::Eq => IntPredicate::IntEQ, @@ -674,10 +673,8 @@ pub fn codegen_crate( // are large size variations, this can reduce memory usage significantly. let codegen_units: Vec<_> = { let mut sorted_cgus = codegen_units.iter().collect::>(); - sorted_cgus.sort_by_cached_key(|cgu| cgu.size_estimate()); - - let (first_half, second_half) = sorted_cgus.split_at(sorted_cgus.len() / 2); - second_half.iter().rev().interleave(first_half).copied().collect() + sorted_cgus.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate())); + sorted_cgus }; // Calculate the CGU reuse diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 601ed43cb39fd..ff6add24c6ad5 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -1030,7 +1030,7 @@ fn debug_dump<'a, 'tcx: 'a>( } }; - eprintln!("{}", dump()); + debug!("{}", dump()); } #[inline(never)] // give this a place in the profiler From aff730e1a3fbb9ceca6c2da919a7ae050f6810c4 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 15 Jun 2023 16:13:30 +1000 Subject: [PATCH 6/7] XXX: max-merge-90% --- compiler/rustc_monomorphize/src/partitioning.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index ff6add24c6ad5..628b1d506bf4e 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -352,9 +352,9 @@ fn merge_codegen_units<'tcx>( // codegen_units[codegen_units.len() - 2].size_estimate(), // codegen_units[codegen_units.len() - 1].size_estimate()); - codegen_units[0].size_estimate() - >= codegen_units[codegen_units.len() - 2].size_estimate() - + codegen_units[codegen_units.len() - 1].size_estimate() + (codegen_units[0].size_estimate() as f64 * 0.8) + >= (codegen_units[codegen_units.len() - 2].size_estimate() + + codegen_units[codegen_units.len() - 1].size_estimate()) as f64 }; if !(merge1 || merge2 || merge3) { From e8379a70b98aae107ecbbee085538b0b79da145d Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 15 Jun 2023 18:39:57 +1000 Subject: [PATCH 7/7] XXX: more --- compiler/rustc_codegen_ssa/src/base.rs | 9 ++++++--- compiler/rustc_monomorphize/src/partitioning.rs | 9 +++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs index 023956a6d291d..dc4a28c866ff3 100644 --- a/compiler/rustc_codegen_ssa/src/base.rs +++ b/compiler/rustc_codegen_ssa/src/base.rs @@ -38,10 +38,11 @@ use rustc_span::symbol::sym; use rustc_span::Symbol; use rustc_target::abi::{Align, FIRST_VARIANT}; -use std::cmp; use std::collections::BTreeSet; use std::time::{Duration, Instant}; +use itertools::Itertools; + pub fn bin_op_to_icmp_predicate(op: hir::BinOpKind, signed: bool) -> IntPredicate { match op { hir::BinOpKind::Eq => IntPredicate::IntEQ, @@ -673,8 +674,10 @@ pub fn codegen_crate( // are large size variations, this can reduce memory usage significantly. let codegen_units: Vec<_> = { let mut sorted_cgus = codegen_units.iter().collect::>(); - sorted_cgus.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate())); - sorted_cgus + sorted_cgus.sort_by_cached_key(|cgu| cgu.size_estimate()); + + let (first_half, second_half) = sorted_cgus.split_at(sorted_cgus.len() / 2); + second_half.iter().rev().interleave(first_half).copied().collect() }; // Calculate the CGU reuse diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 628b1d506bf4e..155aed4490f10 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -313,7 +313,7 @@ fn merge_codegen_units<'tcx>( // worse generated code. So we don't allow CGUs smaller than this (unless // there is just one CGU, of course). Note that CGU sizes of 100,000+ are // common in larger programs, so this isn't all that large. - const NON_INCR_MIN_CGU_SIZE: usize = 1500; + const NON_INCR_MIN_CGU_SIZE: usize = 2000; // Repeatedly merge the two smallest codegen units as long as: // - we have more CGUs than the upper limit, or @@ -353,9 +353,10 @@ fn merge_codegen_units<'tcx>( // codegen_units[codegen_units.len() - 1].size_estimate()); (codegen_units[0].size_estimate() as f64 * 0.8) - >= (codegen_units[codegen_units.len() - 2].size_estimate() - + codegen_units[codegen_units.len() - 1].size_estimate()) as f64 - }; + >= (codegen_units[codegen_units.len() - 2].size_estimate() + + codegen_units[codegen_units.len() - 1].size_estimate()) + as f64 + }; if !(merge1 || merge2 || merge3) { break;