path: root/src/vec.rs
author    Jorge Aparicio <jorge.aparicio@ferrous-systems.com> 2022-05-10 12:47:14 +0200
committer Jorge Aparicio <jorge.aparicio@ferrous-systems.com> 2022-05-10 12:49:02 +0200
commit    b2268e7885fbddeb17a9eae5d9f4b9aba099ff7d (patch)
tree      fa8bb4a22671bf742261e443a55030270e015862 /src/vec.rs
parent    aacc35997b3eceabd4c9d6226dcc1c9ae4b551f4 (diff)
optimize the codegen of Vec::clone
these changes optimize `Vec<u8, 1024>::clone` down to these operations:

1. reserve the stack space (1028 bytes on 32-bit ARM) and leave it uninitialized
2. zero the `len` field
3. memcpy `len` bytes of data from the parent

analyzed source code

``` rust
use core::arch::asm;
use heapless::Vec;

fn clone(vec: &Vec<u8, 1024>) {
    let mut vec = vec.clone();
    black_box(&mut vec);
}

fn black_box<T>(val: &mut T) {
    unsafe { asm!("// {0}", in(reg) val) }
}
```

machine code with `lto = fat`, `codegen-units = 1` and `opt-level = 'z'` ('z' instead of 3 to avoid loop unrolling and keep the machine code readable)

``` armasm
00020100 <clone>:
   20100:          b5d0       push    {r4, r6, r7, lr}
   20102:          af02       add     r7, sp, #8
   20104:          f5ad 6d81  sub.w   sp, sp, #1032  ; 0x408
   20108:          2300       movs    r3, #0
   2010a:          c802       ldmia   r0!, {r1}
   2010c:          9301       str     r3, [sp, #4]
   2010e:          aa01       add     r2, sp, #4
   20110: /--/-X   b141       cbz     r1, 20124 <clone+0x24>
   20112: |  |     4413       add     r3, r2
   20114: |  |     f810 4b01  ldrb.w  r4, [r0], #1
   20118: |  |     3901       subs    r1, #1
   2011a: |  |     711c       strb    r4, [r3, #4]
   2011c: |  |     9b01       ldr     r3, [sp, #4]
   2011e: |  |     3301       adds    r3, #1
   20120: |  |     9301       str     r3, [sp, #4]
   20122: |  \--   e7f5       b.n     20110 <clone+0x10>
   20124: \---->   a801       add     r0, sp, #4
   20126:          f50d 6d81  add.w   sp, sp, #1032  ; 0x408
   2012a:          bdd0       pop     {r4, r6, r7, pc}
```

note that it's not optimizing step (3) to an actual `memcpy` because we lack the 'trait specialization' code that libstd uses

---

before these changes, `clone` was optimized to:

1. reserve and zero (`memclr`) 1028 (!?) bytes of stack space
2. (unnecessarily) runtime check that `len` is less than or equal to 1024 (the capacity) -- this included a panicking branch
3. memcpy `len` bytes of data from the parent
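for reference, the `Copy` fast path that trait specialization would unlock looks roughly like the sketch below; `copy_into` is a hypothetical helper, not heapless API, and the `T: Copy` bound is what lets LLVM collapse the per-element loop into a single `memcpy`

``` rust
use core::ptr;

// hypothetical helper (not part of this commit): with `T: Copy`, per-element
// `clone` is a bitwise copy, so the loop can be expressed as one bulk copy
// that LLVM lowers to a real `memcpy`
fn copy_into<T: Copy>(src: &[T], dst: &mut [T]) {
    assert!(dst.len() >= src.len());
    // SAFETY: `src` and `dst` come from distinct borrows so they cannot
    // overlap, and the length was checked above
    unsafe {
        ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr(), src.len());
    }
}
```

libstd dispatches between this bulk copy and the generic `clone` loop via specialization, which is unstable, so heapless keeps the loop for all `T`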
Diffstat (limited to 'src/vec.rs')

-rw-r--r--  src/vec.rs | 19 +++++++++++++++----

1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/src/vec.rs b/src/vec.rs
index 31d98ca7..bef2edd0 100644
--- a/src/vec.rs
+++ b/src/vec.rs
@@ -34,12 +34,18 @@ use hash32;
/// assert_eq!(*vec, [7, 1, 2, 3]);
/// ```
pub struct Vec<T, const N: usize> {
- buffer: [MaybeUninit<T>; N],
+ // NOTE order is important for optimizations. the `len` first layout lets the compiler optimize
+ // `new` to: reserve stack space and zero the first word. With the fields in the reverse order
+ // the compiler optimizes `new` to `memclr`-ing the *entire* stack space, including the `buffer`
+ // field which should be left uninitialized. Optimizations were last checked with Rust 1.60
len: usize,
+
+ buffer: [MaybeUninit<T>; N],
}
impl<T, const N: usize> Vec<T, N> {
- const INIT: MaybeUninit<T> = MaybeUninit::uninit();
+ const ELEM: MaybeUninit<T> = MaybeUninit::uninit();
+ const INIT: [MaybeUninit<T>; N] = [Self::ELEM; N]; // important for optimization of `new`
/// Constructs a new, empty vector with a fixed capacity of `N`
///
@@ -60,8 +66,8 @@ impl<T, const N: usize> Vec<T, N> {
crate::sealed::greater_than_eq_0::<N>();
Self {
- buffer: [Self::INIT; N],
len: 0,
+ buffer: Self::INIT,
}
}
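as an aside on the `ELEM`/`INIT` pair above: an array repeat expression `[e; N]` requires `e: Copy` unless `e` is a `const` item, and `MaybeUninit<T>` is only `Copy` when `T` is. a standalone sketch of the same pattern (simplified, not the crate's exact code):

``` rust
use core::mem::MaybeUninit;

struct Buf<T, const N: usize> {
    len: usize,
    buffer: [MaybeUninit<T>; N],
}

impl<T, const N: usize> Buf<T, N> {
    // a const item may be used as a repeat operand even though
    // `MaybeUninit<T>` is not `Copy` for arbitrary `T`
    const ELEM: MaybeUninit<T> = MaybeUninit::uninit();
    const INIT: [MaybeUninit<T>; N] = [Self::ELEM; N];

    const fn new() -> Self {
        // the whole buffer comes from a const, so `new` only has to
        // reserve stack space and zero the `len` word
        Self { len: 0, buffer: Self::INIT }
    }
}
```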
@@ -92,7 +98,12 @@ impl<T, const N: usize> Vec<T, N> {
T: Clone,
{
let mut new = Self::new();
- new.extend_from_slice(self.as_slice()).unwrap();
+ // avoid `extend_from_slice` as that introduces a runtime check / panicking branch
+ for elem in self {
+ unsafe {
+ new.push_unchecked(elem.clone());
+ }
+ }
new
}
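a quick usage check of the rewritten `clone` (the capacity 8 is arbitrary); the `push_unchecked` calls are sound because source and destination share the same capacity `N`, so `self.len()` can never exceed the space available in `new`:

``` rust
use heapless::Vec;

fn main() {
    let mut a: Vec<u8, 8> = Vec::new();
    a.extend_from_slice(&[1, 2, 3]).unwrap();

    // element-by-element clone: no capacity check, no panicking branch
    let b = a.clone();
    assert_eq!(a, b);
}
```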