From 278a0ae4e3855dfaa4bb43bd5f35ec292f1d6a23 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 15 Jun 2023 10:54:05 +1000
Subject: [PATCH 1/7] XXX: nicer merging code (extend instead of insert)

---
 compiler/rustc_monomorphize/src/partitioning.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index a74ba8e4a4be9..531644f0b8490 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -340,9 +340,7 @@ fn merge_codegen_units<'tcx>(
 
         // Move the mono-items from `smallest` to `second_smallest`
         second_smallest.modify_size_estimate(smallest.size_estimate());
-        for (k, v) in smallest.items_mut().drain() {
-            second_smallest.items_mut().insert(k, v);
-        }
+        second_smallest.items_mut().extend(smallest.items_mut().drain());
 
         // Record that `second_smallest` now contains all the stuff that was
         // in `smallest` before.

From 3e127061dad7c119c6b3ba77d625b33a4f5a60d8 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 15 Jun 2023 11:30:04 +1000
Subject: [PATCH 2/7] XXX: enable eprintln

---
 compiler/rustc_monomorphize/src/partitioning.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 531644f0b8490..313c38054851a 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -989,7 +989,7 @@ fn debug_dump<'a, 'tcx: 'a>(
         }
     };
 
-    debug!("{}", dump());
+    eprintln!("{}", dump());
 }
 
 #[inline(never)] // give this a place in the profiler

From 5d06126d377423b75fdfb2452c3c54b89db0db8f Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 15 Jun 2023 10:46:13 +1000
Subject: [PATCH 3/7] XXX: inline before merge (including increase to 1500)

XXX: split out comment changes in a precursor, eventually
---
 compiler/rustc_middle/src/mir/mono.rs         |  4 --
 .../rustc_monomorphize/src/partitioning.rs    | 37 ++++++++++---------
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs
index 1511e25523559..1cbf1a36283f5 100644
--- a/compiler/rustc_middle/src/mir/mono.rs
+++ b/compiler/rustc_middle/src/mir/mono.rs
@@ -335,10 +335,6 @@ impl<'tcx> CodegenUnit<'tcx> {
             .expect("create_size_estimate must be called before getting a size_estimate")
     }
 
-    pub fn modify_size_estimate(&mut self, delta: usize) {
-        *self.size_estimate.as_mut().unwrap() += delta;
-    }
-
     pub fn contains_item(&self, item: &MonoItem<'tcx>) -> bool {
         self.items().contains_key(item)
     }
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 313c38054851a..6c37b572dc748 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -166,19 +166,9 @@ where
         placed
     };
 
-    // Merge until we have at most `max_cgu_count` codegen units.
-    // `merge_codegen_units` is responsible for updating the CGU size
-    // estimates.
-    {
-        let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
-        merge_codegen_units(cx, &mut codegen_units);
-        debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
-    }
-
-    // In the next step, we use the inlining map to determine which additional
-    // monomorphizations have to go into each codegen unit. These additional
-    // monomorphizations can be drop-glue, functions from external crates, and
-    // local functions the definition of which is marked with `#[inline]`.
+    // Use the usage map to put additional mono items in each codegen unit:
+    // drop-glue, functions from external crates, and local functions the
+    // definition of which is marked with `#[inline]`.
     {
         let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_inline_items");
         place_inlined_mono_items(cx, &mut codegen_units);
@@ -190,8 +180,17 @@ where
         debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats);
     }
 
-    // Next we try to make as many symbols "internal" as possible, so LLVM has
-    // more freedom to optimize.
+    // Merge until we have at most `max_cgu_count` codegen units.
+    // `merge_codegen_units` is responsible for updating the CGU size
+    // estimates.
+    {
+        let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
+        merge_codegen_units(cx, &mut codegen_units);
+        debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
+    }
+
+    // Make as many symbols "internal" as possible, so LLVM has more freedom to
+    // optimize.
     if !tcx.sess.link_dead_code() {
         let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_internalize_symbols");
         internalize_symbols(cx, &mut codegen_units, internalization_candidates);
@@ -314,7 +313,7 @@ fn merge_codegen_units<'tcx>(
     // worse generated code. So we don't allow CGUs smaller than this (unless
     // there is just one CGU, of course). Note that CGU sizes of 100,000+ are
     // common in larger programs, so this isn't all that large.
-    const NON_INCR_MIN_CGU_SIZE: usize = 1000;
+    const NON_INCR_MIN_CGU_SIZE: usize = 1500;
 
     // Repeatedly merge the two smallest codegen units as long as:
     // - we have more CGUs than the upper limit, or
@@ -338,9 +337,11 @@ fn merge_codegen_units<'tcx>(
         let mut smallest = codegen_units.pop().unwrap();
         let second_smallest = codegen_units.last_mut().unwrap();
 
-        // Move the mono-items from `smallest` to `second_smallest`
-        second_smallest.modify_size_estimate(smallest.size_estimate());
+        // Move the items from `smallest` to `second_smallest`. Some of them
+        // may be duplicate inlined items, in which case the destination CGU is
+        // unaffected. Recalculate size estimates afterwards.
         second_smallest.items_mut().extend(smallest.items_mut().drain());
+        second_smallest.create_size_estimate(cx.tcx);
 
         // Record that `second_smallest` now contains all the stuff that was
         // in `smallest` before.

From eb1af88fdd40fc0ba6f807e1c50f7c5d087695be Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 15 Jun 2023 12:16:45 +1000
Subject: [PATCH 4/7] XXX: max-merge

---
 .../rustc_monomorphize/src/partitioning.rs    | 52 ++++++++++++++++---
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 6c37b572dc748..601ed43cb39fd 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -319,30 +319,70 @@ fn merge_codegen_units<'tcx>(
     // - we have more CGUs than the upper limit, or
     // - (Non-incremental builds only) the user didn't specify a CGU count, and
     //   there are multiple CGUs, and some are below the minimum size.
+    // - njn: update this comment
     //
     // The "didn't specify a CGU count" condition is because when an explicit
     // count is requested we observe it as closely as possible. For example,
     // the `compiler_builtins` crate sets `codegen-units = 10000` and it's
     // critical they aren't merged. Also, some tests use explicit small values
     // and likewise won't work if small CGUs are merged.
-    while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
-        || (cx.tcx.sess.opts.incremental.is_none()
-            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
-            && codegen_units.len() > 1
-            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
-    {
+    //eprintln!("-----");
+    loop {
+        // njn: where to put this?
         // Sort small cgus to the back.
         codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
 
+        //eprintln!("cgus: {:?}", codegen_units.iter().map(|cgu| cgu.size_estimate()).collect::<Vec<_>>());
+
+        let merge1 = codegen_units.len() > cx.tcx.sess.codegen_units().as_usize();
+
+        let merge2 = cx.tcx.sess.opts.incremental.is_none()
+            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
+            && codegen_units.len() >= 2
+            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE);
+
+        // njn: addition is an imperfect measure, could be overlap
+        let merge3 = cx.tcx.sess.opts.incremental.is_none()
+            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
+            && codegen_units.len() >= 3
+            && {
+                // eprintln!(
+                //     "sz: {} >= {} + {}?",
+                //     codegen_units[0].size_estimate(),
+                //     codegen_units[codegen_units.len() - 2].size_estimate(),
+                //     codegen_units[codegen_units.len() - 1].size_estimate());
+
+                codegen_units[0].size_estimate()
+                >= codegen_units[codegen_units.len() - 2].size_estimate()
+                    + codegen_units[codegen_units.len() - 1].size_estimate()
+               };
+
+        if !(merge1 || merge2 || merge3) {
+            break;
+        }
+
         let mut smallest = codegen_units.pop().unwrap();
         let second_smallest = codegen_units.last_mut().unwrap();
 
+        // let sm_size = smallest.size_estimate();
+        // let sec_sm_size = second_smallest.size_estimate();
+
         // Move the items from `smallest` to `second_smallest`. Some of them
         // may be duplicate inlined items, in which case the destination CGU is
         // unaffected. Recalculate size estimates afterwards.
         second_smallest.items_mut().extend(smallest.items_mut().drain());
         second_smallest.create_size_estimate(cx.tcx);
 
+        // eprintln!(
+        //     "merge: {} {} {}: {} + {} -> {}",
+        //     merge1,
+        //     merge2,
+        //     merge3,
+        //     sec_sm_size,
+        //     sm_size,
+        //     second_smallest.size_estimate()
+        // );
+
         // Record that `second_smallest` now contains all the stuff that was
         // in `smallest` before.
         let mut consumed_cgu_names = cgu_contents.remove(&smallest.name()).unwrap();

From ac47a25aa297f31e678df6494acbbdb9b78939ac Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 15 Jun 2023 15:58:17 +1000
Subject: [PATCH 5/7] XXX: process-by-size

---
 compiler/rustc_codegen_ssa/src/base.rs          | 9 +++------
 compiler/rustc_monomorphize/src/partitioning.rs | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs
index dc4a28c866ff3..023956a6d291d 100644
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@@ -38,11 +38,10 @@ use rustc_span::symbol::sym;
 use rustc_span::Symbol;
 use rustc_target::abi::{Align, FIRST_VARIANT};
 
+use std::cmp;
 use std::collections::BTreeSet;
 use std::time::{Duration, Instant};
 
-use itertools::Itertools;
-
 pub fn bin_op_to_icmp_predicate(op: hir::BinOpKind, signed: bool) -> IntPredicate {
     match op {
         hir::BinOpKind::Eq => IntPredicate::IntEQ,
@@ -674,10 +673,8 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
     // are large size variations, this can reduce memory usage significantly.
     let codegen_units: Vec<_> = {
         let mut sorted_cgus = codegen_units.iter().collect::<Vec<_>>();
-        sorted_cgus.sort_by_cached_key(|cgu| cgu.size_estimate());
-
-        let (first_half, second_half) = sorted_cgus.split_at(sorted_cgus.len() / 2);
-        second_half.iter().rev().interleave(first_half).copied().collect()
+        sorted_cgus.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
+        sorted_cgus
     };
 
     // Calculate the CGU reuse
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 601ed43cb39fd..ff6add24c6ad5 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -1030,7 +1030,7 @@ fn debug_dump<'a, 'tcx: 'a>(
         }
     };
 
-    eprintln!("{}", dump());
+    debug!("{}", dump());
 }
 
 #[inline(never)] // give this a place in the profiler

From aff730e1a3fbb9ceca6c2da919a7ae050f6810c4 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 15 Jun 2023 16:13:30 +1000
Subject: [PATCH 6/7] XXX: max-merge-90%

---
 compiler/rustc_monomorphize/src/partitioning.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index ff6add24c6ad5..628b1d506bf4e 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -352,9 +352,9 @@ fn merge_codegen_units<'tcx>(
                 //     codegen_units[codegen_units.len() - 2].size_estimate(),
                 //     codegen_units[codegen_units.len() - 1].size_estimate());
 
-                codegen_units[0].size_estimate()
-                >= codegen_units[codegen_units.len() - 2].size_estimate()
-                    + codegen_units[codegen_units.len() - 1].size_estimate()
+                (codegen_units[0].size_estimate() as f64 * 0.8)
+                >= (codegen_units[codegen_units.len() - 2].size_estimate()
+                    + codegen_units[codegen_units.len() - 1].size_estimate()) as f64
                };
 
         if !(merge1 || merge2 || merge3) {

From e8379a70b98aae107ecbbee085538b0b79da145d Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 15 Jun 2023 18:39:57 +1000
Subject: [PATCH 7/7] XXX: more

---
 compiler/rustc_codegen_ssa/src/base.rs          | 9 ++++++---
 compiler/rustc_monomorphize/src/partitioning.rs | 9 +++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs
index 023956a6d291d..dc4a28c866ff3 100644
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@@ -38,10 +38,11 @@ use rustc_span::symbol::sym;
 use rustc_span::Symbol;
 use rustc_target::abi::{Align, FIRST_VARIANT};
 
-use std::cmp;
 use std::collections::BTreeSet;
 use std::time::{Duration, Instant};
 
+use itertools::Itertools;
+
 pub fn bin_op_to_icmp_predicate(op: hir::BinOpKind, signed: bool) -> IntPredicate {
     match op {
         hir::BinOpKind::Eq => IntPredicate::IntEQ,
@@ -673,8 +674,10 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
     // are large size variations, this can reduce memory usage significantly.
     let codegen_units: Vec<_> = {
         let mut sorted_cgus = codegen_units.iter().collect::<Vec<_>>();
-        sorted_cgus.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
-        sorted_cgus
+        sorted_cgus.sort_by_cached_key(|cgu| cgu.size_estimate());
+
+        let (first_half, second_half) = sorted_cgus.split_at(sorted_cgus.len() / 2);
+        second_half.iter().rev().interleave(first_half).copied().collect()
     };
 
     // Calculate the CGU reuse
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 628b1d506bf4e..155aed4490f10 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -313,7 +313,7 @@ fn merge_codegen_units<'tcx>(
     // worse generated code. So we don't allow CGUs smaller than this (unless
     // there is just one CGU, of course). Note that CGU sizes of 100,000+ are
     // common in larger programs, so this isn't all that large.
-    const NON_INCR_MIN_CGU_SIZE: usize = 1500;
+    const NON_INCR_MIN_CGU_SIZE: usize = 2000;
 
     // Repeatedly merge the two smallest codegen units as long as:
     // - we have more CGUs than the upper limit, or
@@ -353,9 +353,10 @@ fn merge_codegen_units<'tcx>(
                 //     codegen_units[codegen_units.len() - 1].size_estimate());
 
                 (codegen_units[0].size_estimate() as f64 * 0.8)
-                >= (codegen_units[codegen_units.len() - 2].size_estimate()
-                    + codegen_units[codegen_units.len() - 1].size_estimate()) as f64
-               };
+                    >= (codegen_units[codegen_units.len() - 2].size_estimate()
+                        + codegen_units[codegen_units.len() - 1].size_estimate())
+                        as f64
+            };
 
         if !(merge1 || merge2 || merge3) {
             break;