From 64c2a7412b73768081d35354fd959a8ca028bccb Mon Sep 17 00:00:00 2001
From: Stepan Koltsov <stepan.koltsov@gmail.com>
Date: Sat, 5 Apr 2014 22:14:46 +0000
Subject: [PATCH] Optimize Once::doit when initialization is already completed

* A plain load is much cheaper than `fetch_add`, at least on x86_64.
* The common path of `doit` can now be inlined.

Verified with this test:

```
// A single shared `Once`; every `doit` call below goes through it.
static mut o: one::Once = one::ONCE_INIT;
// Repeat the measurement indefinitely, printing one result per pass.
loop {
    // `unsafe` is required because `o` is a `static mut`.
    unsafe {
        // Timestamps are presumably in nanoseconds (per the function name).
        let start = time::precise_time_ns();
        let iters = 50000000u64;
        for _ in range(0, iters) {
            o.doit(|| { println!("once!"); });
        }
        let end = time::precise_time_ns();
        // Scale elapsed time by 1000 (ns -> ps, assuming ns timestamps), then average.
        let ps_per_iter = 1000 * (end - start) / iters;
        println!("{} ps per iter", ps_per_iter);

        // confuse the optimizer
        o.doit(|| { println!("once!"); });
    }
}
```

The test was run on a Mac with a 2 GHz Intel Core i7. The result is
700 ps per iteration with the patch applied, and 17000 ps per iteration
without it.
---
 src/libsync/one.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/libsync/one.rs b/src/libsync/one.rs
index 7da6f39b840e5..388c63becbfd2 100644
--- a/src/libsync/one.rs
+++ b/src/libsync/one.rs
@@ -63,7 +63,17 @@ impl Once {
     ///
     /// When this function returns, it is guaranteed that some initialization
     /// has run and completed (it may not be the closure specified).
+    #[inline(always)]
     pub fn doit(&self, f: ||) {
+        // Fast path: `cnt` < 0 once init has completed; load is much cheaper than fetch_add.
+        if self.cnt.load(atomics::SeqCst) < 0 {
+            return
+        }
+
+        self.doit_slow(f);
+    }
+
+    fn doit_slow(&self, f: ||) {
         // Implementation-wise, this would seem like a fairly trivial primitive.
         // The stickler part is where our mutexes currently require an
         // allocation, and usage of a `Once` should't leak this allocation.