author | 2022-05-10 12:47:14 +0200
---|---
committer | 2022-05-10 12:49:02 +0200
commit | b2268e7885fbddeb17a9eae5d9f4b9aba099ff7d
tree | fa8bb4a22671bf742261e443a55030270e015862 /src/vec.rs
parent | aacc35997b3eceabd4c9d6226dcc1c9ae4b551f4
optimize the codegen of Vec::clone
these changes optimize `Vec<u8, 1024>::clone` down to these operations (a hand-written equivalent is sketched after the list):
1. reserve the stack space (1028 bytes on 32-bit ARM) and leave it uninitialized
2. zero the `len` field
3. memcpy `len` bytes of data from the parent
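as a hand-written illustration, the three steps above amount to roughly the following for the concrete `Vec<u8, 1024>` case (`VecU8x1024` and `clone_by_hand` are made-up names for this sketch; the real generic implementation is in the diff at the bottom)
``` rust
use core::mem::MaybeUninit;
use core::ptr;

// illustrative stand-in; the field names and order mirror `heapless::Vec`
struct VecU8x1024 {
    len: usize,
    buffer: [MaybeUninit<u8>; 1024],
}

fn clone_by_hand(parent: &VecU8x1024) -> VecU8x1024 {
    // (1) reserve stack space; `buffer` stays uninitialized
    let mut new = VecU8x1024 {
        // (2) zero only the `len` field
        len: 0,
        buffer: [MaybeUninit::uninit(); 1024],
    };
    // (3) copy `len` initialized bytes from the parent
    unsafe {
        ptr::copy_nonoverlapping(parent.buffer.as_ptr(), new.buffer.as_mut_ptr(), parent.len);
    }
    new.len = parent.len;
    new
}
```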
analyzed source code
``` rust
use core::arch::asm; // `asm!` needs this import on stable Rust (1.59+)
use heapless::Vec;

fn clone(vec: &Vec<u8, 1024>) {
    let mut vec = vec.clone();
    black_box(&mut vec);
}

// opaque sink: keeps the cloned value (and thus the `clone` call) from being optimized away
fn black_box<T>(val: &mut T) {
    unsafe { asm!("// {0}", in(reg) val) }
}
```
machine code with `lto = 'fat'`, `codegen-units = 1` and `opt-level = 'z'` ('z' instead of 3 to avoid loop unrolling and keep the machine code readable):
``` armasm
00020100 <clone>:
20100: b5d0 push {r4, r6, r7, lr}
20102: af02 add r7, sp, #8
20104: f5ad 6d81 sub.w sp, sp, #1032 ; 0x408
20108: 2300 movs r3, #0
2010a: c802 ldmia r0!, {r1}
2010c: 9301 str r3, [sp, #4]
2010e: aa01 add r2, sp, #4
20110: /--/-X b141 cbz r1, 20124 <clone+0x24>
20112: | | 4413 add r3, r2
20114: | | f810 4b01 ldrb.w r4, [r0], #1
20118: | | 3901 subs r1, #1
2011a: | | 711c strb r4, [r3, #4]
2011c: | | 9b01 ldr r3, [sp, #4]
2011e: | | 3301 adds r3, #1
20120: | | 9301 str r3, [sp, #4]
20122: | \-- e7f5 b.n 20110 <clone+0x10>
20124: \----> a801 add r0, sp, #4
20126: f50d 6d81 add.w sp, sp, #1032 ; 0x408
2012a: bdd0 pop {r4, r6, r7, pc}
```
note that step (3) is not optimized into an actual `memcpy` call because heapless lacks the 'trait specialization' code that libstd uses
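for reference, a minimal sketch of the bulk-copy path that such specialization would select for `Copy` element types (`copy_into_uninit` is a made-up helper, not part of heapless or libstd)
``` rust
use core::mem::MaybeUninit;
use core::ptr;

/// Hypothetical helper: bulk-copy a slice of `Copy` elements into an
/// uninitialized buffer. For plain-old-data element types this lowers to a
/// single `memcpy`.
fn copy_into_uninit<T: Copy>(src: &[T], dst: &mut [MaybeUninit<T>]) {
    assert!(dst.len() >= src.len(), "destination too small");
    unsafe {
        ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr().cast::<T>(), src.len());
    }
}
```
on stable Rust this path cannot be chosen automatically from a generic `T: Clone` context, which is why the per-element loop remains in the generated code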
---
before this change, `clone` was optimized to (a simplified sketch of the old path follows this list)
1. reserve and zero (`memclr`) 1028 (!?) bytes of stack space
2. (unnecessarily) runtime-check that `len` is less than or equal to 1024 (the capacity) -- this included a panicking branch
3. memcpy `len` bytes of data from the parent
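for illustration, a simplified model of where that runtime check and panicking branch came from (`extend_from_slice_model` is a made-up name, not the exact heapless source)
``` rust
use core::mem::MaybeUninit;

// Simplified model of the old code path. The capacity check below is the
// runtime check the new `clone` avoids; the old `clone` then called
// `.unwrap()` on the result, keeping a reachable panic even though
// `self.len() <= N` always holds when cloning into a fresh `Vec<T, N>`.
fn extend_from_slice_model<T: Clone, const N: usize>(
    len: &mut usize,
    buffer: &mut [MaybeUninit<T>; N],
    other: &[T],
) -> Result<(), ()> {
    if *len + other.len() > N {
        return Err(()); // capacity exceeded: the source of the panicking branch
    }
    for elem in other {
        buffer[*len].write(elem.clone());
        *len += 1;
    }
    Ok(())
}
```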
Diffstat

    src/vec.rs | 19 +++++++++++++++----
    1 file changed, 15 insertions(+), 4 deletions(-)
```` diff
@@ -34,12 +34,18 @@ use hash32;
 /// assert_eq!(*vec, [7, 1, 2, 3]);
 /// ```
 pub struct Vec<T, const N: usize> {
-    buffer: [MaybeUninit<T>; N],
+    // NOTE order is important for optimizations. the `len` first layout lets the compiler optimize
+    // `new` to: reserve stack space and zero the first word. With the fields in the reverse order
+    // the compiler optimizes `new` to `memclr`-ing the *entire* stack space, including the `buffer`
+    // field which should be left uninitialized. Optimizations were last checked with Rust 1.60
     len: usize,
+
+    buffer: [MaybeUninit<T>; N],
 }
 
 impl<T, const N: usize> Vec<T, N> {
-    const INIT: MaybeUninit<T> = MaybeUninit::uninit();
+    const ELEM: MaybeUninit<T> = MaybeUninit::uninit();
+    const INIT: [MaybeUninit<T>; N] = [Self::ELEM; N]; // important for optimization of `new`
 
     /// Constructs a new, empty vector with a fixed capacity of `N`
     ///
@@ -60,8 +66,8 @@ impl<T, const N: usize> Vec<T, N> {
         crate::sealed::greater_than_eq_0::<N>();
 
         Self {
-            buffer: [Self::INIT; N],
             len: 0,
+            buffer: Self::INIT,
         }
     }
 
@@ -92,7 +98,12 @@ impl<T, const N: usize> Vec<T, N> {
         T: Clone,
     {
         let mut new = Self::new();
-        new.extend_from_slice(self.as_slice()).unwrap();
+        // avoid `extend_from_slice` as that introduces a runtime check / panicking branch
+        for elem in self {
+            unsafe {
+                new.push_unchecked(elem.clone());
+            }
+        }
         new
     }
 
````
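for completeness, a small usage example of the API whose codegen this commit improves (only the public `heapless::Vec` API is assumed)
``` rust
use heapless::Vec;

fn main() {
    let mut original: Vec<u8, 8> = Vec::new();
    original.extend_from_slice(&[1, 2, 3]).unwrap();

    // `push_unchecked` inside the new `clone` is sound because both vectors
    // share the same capacity `N`, so `original.len() <= N` always holds.
    let copy = original.clone();
    assert_eq!(*copy, [1, 2, 3]);
}
```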