author | 2022-05-10 12:47:14 +0200
---|---
committer | 2022-05-10 12:49:02 +0200
commit | b2268e7885fbddeb17a9eae5d9f4b9aba099ff7d
tree | fa8bb4a22671bf742261e443a55030270e015862 /src/vec.rs
parent | aacc35997b3eceabd4c9d6226dcc1c9ae4b551f4
optimize the codegen of Vec::clone
these changes optimize `Vec<u8, 1024>::clone` down to these operations (a hand-written equivalent is sketched after the list):
1. reserve the stack space (1028 bytes on 32-bit ARM) and leave it uninitialized
2. zero the `len` field
3. memcpy `len` bytes of data from the parent
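as a hand-written illustration, the three steps above amount to roughly the following for the concrete `Vec<u8, 1024>` case (`VecU8x1024` and `clone_by_hand` are made-up names for this sketch; the real generic implementation is in the diff at the bottom)
``` rust
use core::mem::MaybeUninit;
use core::ptr;

// illustrative stand-in; the field names and order mirror `heapless::Vec`
struct VecU8x1024 {
    len: usize,
    buffer: [MaybeUninit<u8>; 1024],
}

fn clone_by_hand(parent: &VecU8x1024) -> VecU8x1024 {
    // (1) reserve stack space; `buffer` stays uninitialized
    let mut new = VecU8x1024 {
        // (2) zero only the `len` field
        len: 0,
        buffer: [MaybeUninit::uninit(); 1024],
    };
    // (3) copy `len` initialized bytes from the parent
    unsafe {
        ptr::copy_nonoverlapping(parent.buffer.as_ptr(), new.buffer.as_mut_ptr(), parent.len);
    }
    new.len = parent.len;
    new
}
```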
analyzed source code
``` rust
use core::arch::asm; // `asm!` needs this import on stable Rust (1.59+)
use heapless::Vec;

fn clone(vec: &Vec<u8, 1024>) {
    let mut vec = vec.clone();
    black_box(&mut vec);
}

// opaque sink: keeps the cloned value (and thus the `clone` call) from being optimized away
fn black_box<T>(val: &mut T) {
    unsafe { asm!("// {0}", in(reg) val) }
}
```
machine code with `lto = 'fat'`, `codegen-units = 1` and `opt-level = 'z'` ('z' instead of 3 to avoid loop unrolling and keep the machine code readable):
``` armasm
00020100 <clone>:
20100: b5d0 push {r4, r6, r7, lr}
20102: af02 add r7, sp, #8
20104: f5ad 6d81 sub.w sp, sp, #1032 ; 0x408
20108: 2300 movs r3, #0
2010a: c802 ldmia r0!, {r1}
2010c: 9301 str r3, [sp, #4]
2010e: aa01 add r2, sp, #4
20110: /--/-X b141 cbz r1, 20124 <clone+0x24>
20112: | | 4413 add r3, r2
20114: | | f810 4b01 ldrb.w r4, [r0], #1
20118: | | 3901 subs r1, #1
2011a: | | 711c strb r4, [r3, #4]
2011c: | | 9b01 ldr r3, [sp, #4]
2011e: | | 3301 adds r3, #1
20120: | | 9301 str r3, [sp, #4]
20122: | \-- e7f5 b.n 20110 <clone+0x10>
20124: \----> a801 add r0, sp, #4
20126: f50d 6d81 add.w sp, sp, #1032 ; 0x408
2012a: bdd0 pop {r4, r6, r7, pc}
```
note that step (3) is not optimized into an actual `memcpy` call because heapless lacks the 'trait specialization' code that libstd uses
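for reference, a minimal sketch of the bulk-copy path that such specialization would select for `Copy` element types (`copy_into_uninit` is a made-up helper, not part of heapless or libstd)
``` rust
use core::mem::MaybeUninit;
use core::ptr;

/// Hypothetical helper: bulk-copy a slice of `Copy` elements into an
/// uninitialized buffer. For plain-old-data element types this lowers to a
/// single `memcpy`.
fn copy_into_uninit<T: Copy>(src: &[T], dst: &mut [MaybeUninit<T>]) {
    assert!(dst.len() >= src.len(), "destination too small");
    unsafe {
        ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr().cast::<T>(), src.len());
    }
}
```
on stable Rust this path cannot be chosen automatically from a generic `T: Clone` context, which is why the per-element loop remains in the generated code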
---
before this change, `clone` was optimized to (a simplified sketch of the old path follows this list)
1. reserve and zero (`memclr`) 1028 (!?) bytes of stack space
2. (unnecessarily) runtime-check that `len` is less than or equal to 1024 (the capacity) -- this included a panicking branch
3. memcpy `len` bytes of data from the parent
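for illustration, a simplified model of where that runtime check and panicking branch came from (`extend_from_slice_model` is a made-up name, not the exact heapless source)
``` rust
use core::mem::MaybeUninit;

// Simplified model of the old code path. The capacity check below is the
// runtime check the new `clone` avoids; the old `clone` then called
// `.unwrap()` on the result, keeping a reachable panic even though
// `self.len() <= N` always holds when cloning into a fresh `Vec<T, N>`.
fn extend_from_slice_model<T: Clone, const N: usize>(
    len: &mut usize,
    buffer: &mut [MaybeUninit<T>; N],
    other: &[T],
) -> Result<(), ()> {
    if *len + other.len() > N {
        return Err(()); // capacity exceeded: the source of the panicking branch
    }
    for elem in other {
        buffer[*len].write(elem.clone());
        *len += 1;
    }
    Ok(())
}
```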
Diffstat

    src/vec.rs | 19 +++++++++++++++----
    1 file changed, 15 insertions(+), 4 deletions(-)
```` diff
@@ -34,12 +34,18 @@ use hash32;
 /// assert_eq!(*vec, [7, 1, 2, 3]);
 /// ```
 pub struct Vec<T, const N: usize> {
-    buffer: [MaybeUninit<T>; N],
+    // NOTE order is important for optimizations. the `len` first layout lets the compiler optimize
+    // `new` to: reserve stack space and zero the first word. With the fields in the reverse order
+    // the compiler optimizes `new` to `memclr`-ing the *entire* stack space, including the `buffer`
+    // field which should be left uninitialized. Optimizations were last checked with Rust 1.60
     len: usize,
+
+    buffer: [MaybeUninit<T>; N],
 }
 
 impl<T, const N: usize> Vec<T, N> {
-    const INIT: MaybeUninit<T> = MaybeUninit::uninit();
+    const ELEM: MaybeUninit<T> = MaybeUninit::uninit();
+    const INIT: [MaybeUninit<T>; N] = [Self::ELEM; N]; // important for optimization of `new`
 
     /// Constructs a new, empty vector with a fixed capacity of `N`
     ///
@@ -60,8 +66,8 @@ impl<T, const N: usize> Vec<T, N> {
         crate::sealed::greater_than_eq_0::<N>();
 
         Self {
-            buffer: [Self::INIT; N],
             len: 0,
+            buffer: Self::INIT,
         }
     }
 
@@ -92,7 +98,12 @@ impl<T, const N: usize> Vec<T, N> {
         T: Clone,
     {
         let mut new = Self::new();
-        new.extend_from_slice(self.as_slice()).unwrap();
+        // avoid `extend_from_slice` as that introduces a runtime check / panicking branch
+        for elem in self {
+            unsafe {
+                new.push_unchecked(elem.clone());
+            }
+        }
         new
     }
 
````
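for completeness, a small usage example of the API whose codegen this commit improves (only the public `heapless::Vec` API is assumed)
``` rust
use heapless::Vec;

fn main() {
    let mut original: Vec<u8, 8> = Vec::new();
    original.extend_from_slice(&[1, 2, 3]).unwrap();

    // `push_unchecked` inside the new `clone` is sound because both vectors
    // share the same capacity `N`, so `original.len() <= N` always holds.
    let copy = original.clone();
    assert_eq!(*copy, [1, 2, 3]);
}
```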