Description
Hi, I am working with code similar to this:
use std::any::type_name;
use std::mem::ManuallyDrop;
use std::mem::MaybeUninit;
/// The result of a record conversion in a call to [try_convert_vec_in_place].
pub enum VecElementConversionResult<T> {
    /// The record has been converted to the attached value.
    Converted(T),
    /// The record has been abandoned: no output element is produced for it.
    /// (Note: the identifier keeps the original "Abandonned" spelling because
    /// it is part of the public API.)
    Abandonned,
}
/// Converts a `Vec<T>` into a `Vec<U>` in place, reusing the original heap
/// allocation (no reallocation, no per-element move to a new buffer).
///
/// `convert` is called once for each element, receiving the element by value
/// and, when one exists, a mutable reference to the most recently produced `U`.
/// It returns either [VecElementConversionResult::Converted] (the value is
/// stored in the output) or [VecElementConversionResult::Abandonned] (the
/// element is dropped from the output), or an error `E` which aborts the
/// conversion.
///
/// # Errors
///
/// Returns the first `Err` produced by `convert`; all already-converted `U`s
/// and all not-yet-converted `T`s are dropped before returning.
///
/// # Panics
///
/// Panics if `T` and `U` do not have identical size and alignment, or if
/// `convert` panics (in which case live elements are dropped first and the
/// panic is re-raised with a `Debug` rendering of the payload).
pub fn try_convert_vec_in_place<T, U, C, E>(input: Vec<T>, convert: C) -> Result<Vec<U>, E>
where
    C: Fn(T, Option<&mut U>) -> Result<VecElementConversionResult<U>, E>
        + std::panic::RefUnwindSafe,
{
    // It would be nice to assert that statically. We could use a trait that indicates the
    // invariant but this would have two drawbacks:
    //
    // - you have to trust the implementations of the trait
    // - this would prevent from allowing conversions from any type T to any other type U where
    //   they both have the same memory layout
    //
    // Side note: those runtime assertions are optimised statically: either code without
    // assertion code (the happy path), or pure panic (the incorrect path).
    assert_eq!(
        std::mem::size_of::<T>(),
        std::mem::size_of::<U>(),
        "size_of {} vs {}",
        type_name::<T>(),
        type_name::<U>()
    );
    assert_eq!(
        std::mem::align_of::<T>(),
        std::mem::align_of::<U>(),
        "align_of {} vs {}",
        type_name::<T>(),
        type_name::<U>()
    );
    // Let's take control, we know what we're doing: from here on, no element is
    // dropped automatically; drops are performed explicitly below.
    let mut manually_drop = ManuallyDrop::new(input);
    let slice = manually_drop.as_mut_slice();
    // From now on, slice is divided into 3 areas:
    //
    // - 0..first_moved: elements of type U (to be dropped by the panic handler)
    // - first_moved..first_ttt: dropped elements (moved out, nothing to drop)
    // - first_ttt..: elements of type T (to be dropped by the panic handler)
    //
    // This must remain true until the end so that the panic handler drops elements correctly.
    let mut first_moved = 0;
    let mut first_ttt = 0;
    // AssertUnwindSafe is justified because on unwind we only ever read
    // first_moved / first_ttt, which together describe a consistent state of
    // `slice` after every statement of the loop.
    let maybe_panic =
        std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| -> Result<(), E> {
            while first_ttt < slice.len() {
                // Bring one T back into auto-drop land: bit-copy it out of the
                // slice into a local, then treat the slot as moved-from.
                let ttt = {
                    let mut ttt = MaybeUninit::<T>::uninit();
                    // SAFETY: slice[first_ttt] is an initialized T that is read
                    // exactly once; the slot is considered moved-out afterwards.
                    unsafe {
                        std::ptr::copy_nonoverlapping(&slice[first_ttt], ttt.as_mut_ptr(), 1);
                    }
                    // The element in the slice is now moved
                    first_ttt += 1;
                    // SAFETY: just written by the copy above.
                    unsafe { ttt.assume_init() }
                };
                // Convert it
                let converted = convert(
                    ttt,
                    // Pass a mutable reference on the preceding converted element if it exists
                    if first_moved > 0 {
                        // SAFETY: slot first_moved - 1 holds an initialized U;
                        // the cast relies on the size/align assertions above.
                        Some(unsafe { &mut *(&mut slice[first_moved - 1] as *mut T).cast() })
                    } else {
                        None
                    },
                )?;
                // Store the result
                match converted {
                    VecElementConversionResult::Converted(uuu) => {
                        // SAFETY: slot first_moved is in the moved-out region
                        // (first_moved <= first_ttt), so overwriting it leaks
                        // nothing; ptr::write avoids dropping the old bytes.
                        unsafe {
                            std::ptr::write((&mut slice[first_moved] as *mut T).cast(), uuu);
                        }
                        // The element is now converted
                        first_moved += 1;
                    }
                    VecElementConversionResult::Abandonned => {
                        // The element has been abandoned by the converter
                        // (it was dropped by `convert` taking it by value).
                    }
                }
            }
            Ok(())
        }));
    // Shared failure path: explicitly drop everything still alive in the slice
    // so that neither the Us nor the remaining Ts leak.
    let clean_on_error = || {
        // Bring Us back into auto-drop land
        for element in &slice[0..first_moved] {
            let mut uuu = MaybeUninit::<U>::uninit();
            // SAFETY: the prefix holds initialized Us; assume_init() produces a
            // temporary that is dropped at the end of the statement.
            unsafe {
                std::ptr::copy_nonoverlapping(&*(element as *const T).cast(), uuu.as_mut_ptr(), 1);
                uuu.assume_init();
            }
        }
        // Bring Ts back into auto-drop land
        for element in &slice[first_ttt..slice.len()] {
            let mut ttt = MaybeUninit::<T>::uninit();
            // SAFETY: the suffix holds initialized, not-yet-moved Ts.
            unsafe {
                std::ptr::copy_nonoverlapping(element, ttt.as_mut_ptr(), 1);
                ttt.assume_init();
            }
        }
    };
    match maybe_panic {
        Ok(Ok(())) => {
            // SAFETY: exactly first_moved leading slots hold initialized Us;
            // shrinking the length forgets the moved-out tail slots.
            unsafe {
                manually_drop.set_len(first_moved);
            }
            // NOTE(review): transmuting Vec<T> to Vec<U> assumes the two Vec
            // layouts are identical, which the language does not formally
            // guarantee even for same-layout element types; rebuilding via
            // Vec::from_raw_parts would be the guaranteed-sound form — confirm.
            Ok(unsafe {
                std::mem::transmute::<Vec<T>, Vec<U>>(ManuallyDrop::into_inner(manually_drop))
            })
        }
        Ok(Err(err)) => {
            clean_on_error();
            Err(err)
        }
        Err(err) => {
            clean_on_error();
            // NOTE(review): this re-panics with a Debug rendering of the boxed
            // payload (typically "Any { .. }"), losing the original message;
            // std::panic::resume_unwind(err) would preserve it — confirm intent.
            panic!("{:?}", err);
        }
    }
}
fn main() {
    /// Identity conversion: each element is passed through unchanged, so the
    /// optimizer should reduce the whole call to returning the input vector.
    #[unsafe(no_mangle)]
    fn should_convert_vec_with_noop(input: Vec<usize>) -> Vec<usize> {
        let passthrough =
            |element: usize, _: Option<&mut usize>| -> Result<VecElementConversionResult<usize>, ()> {
                Ok(VecElementConversionResult::Converted(element))
            };
        try_convert_vec_in_place(input, passthrough).unwrap()
    }
    let source: Vec<usize> = (0..42).collect();
    let vec_1 = should_convert_vec_with_noop(source);
    dbg!(vec_1);
}
This code is complex, but if you analyze it deeply it basically takes a vector and returns it "as is", due to the specific closure passed to `try_convert_vec_in_place`. This is intended.
I am trying to ensure the generated assembly is optimal in that case. For that I am using `cargo-show-asm`.
If I run `RUSTFLAGS="-C opt-level=z" cargo asm should_convert_vec_with_noop`, I see a lot of panic-handling code remaining in the assembly.
Now I transform the `clean_on_error` closure into a free function taking explicit arguments, like this:
/// Drop-recovery helper used when a conversion fails or panics partway through.
///
/// `slice` is the raw element storage, partitioned as:
/// - `0..first_moved`: initialized `U` values (already converted),
/// - `first_moved..first_ttt`: moved-out slots (nothing to drop),
/// - `first_ttt..`: initialized `T` values (not yet converted).
///
/// Each live element is bit-copied into a `MaybeUninit` temporary which is then
/// `assume_init()`ed, so it is dropped normally at the end of the statement.
///
/// NOTE(review): soundness assumes `T` and `U` have identical size and
/// alignment, which the caller asserts before invoking this — confirm every
/// call site runs after those assertions.
fn clean_on_error<T, U>(slice: &mut [T], first_moved: usize, first_ttt: usize) {
    // Bring Us back into auto-drop land
    for element in &slice[0..first_moved] {
        let mut uuu = MaybeUninit::<U>::uninit();
        // SAFETY: the prefix slots hold initialized Us behind T-typed storage;
        // the cast relies on matching size/align, and each slot is read once.
        unsafe {
            std::ptr::copy_nonoverlapping(&*(element as *const T).cast(), uuu.as_mut_ptr(), 1);
            uuu.assume_init();
        }
    }
    // Bring Ts back into auto-drop land
    for element in &slice[first_ttt..slice.len()] {
        let mut ttt = MaybeUninit::<T>::uninit();
        // SAFETY: the suffix slots hold initialized, not-yet-moved Ts.
        unsafe {
            std::ptr::copy_nonoverlapping(element, ttt.as_mut_ptr(), 1);
            ttt.assume_init();
        }
    }
}
And I call it like this: clean_on_error::<T, U>(slice, first_moved, first_ttt);
where needed.
If I rerun cargo asm, this time I get an extremely optimized version:
section .text.should_convert_vec_with_noop,"ax",@progbits
.globl should_convert_vec_with_noop
.type should_convert_vec_with_noop,@function
should_convert_vec_with_noop:
.cfi_startproc
mov rax, qword ptr [rsi]
mov rcx, rax
neg rcx
jo .LBB9_2
mov rcx, qword ptr [rsi + 8]
mov rdx, qword ptr [rsi + 16]
mov qword ptr [rdi], rax
mov qword ptr [rdi + 8], rcx
mov qword ptr [rdi + 16], rdx
mov rax, rdi
ret
.LBB9_2:
push rax
.cfi_def_cfa_offset 16
lea rdi, [rip + .Lanon.9284ee2a77900d7817deb55a6ed67d77.6]
lea rcx, [rip + .Lanon.9284ee2a77900d7817deb55a6ed67d77.5]
lea r8, [rip + .Lanon.9284ee2a77900d7817deb55a6ed67d77.11]
push 43
.cfi_adjust_cfa_offset 8
pop rsi
.cfi_adjust_cfa_offset -8
lea rdx, [rsp + 7]
call qword ptr [rip + core::result::unwrap_failed@GOTPCREL]
The question is: why, sometimes and for unclear reasons, does the compiler fail to detect that the code can never enter the error branches, and therefore miss this optimization? And are there cases the compiler should/could detect, so that the optimization depends less on small implementation details?
In my real project I tried with a closure taking all data as arguments, it worked like the function, but as soon as one variable is referenced by the closure the code becomes sub-optimal.
Also in my real project I did not need opt-level=z
, the default release optimization was enough, it's when I wanted to reduce the size of the test case that I needed it.
For reference, my real case is at arnodb/truc@024330f but the commit might not exist any more in the future. It is basically the same as the above but embedded in some test code where the NOOP is even more complex.
Meta
rustc --version --verbose
:
rustc 1.87.0 (17067e9ac 2025-05-09)
binary: rustc
commit-hash: 17067e9ac6d7ecb70e50f92c1944e545188d2359
commit-date: 2025-05-09
host: x86_64-unknown-linux-gnu
release: 1.87.0
LLVM version: 20.1.1
Same behaviour with beta
and nightly
at the time of writing.