Diffstat (limited to 'src/thread_pool.zig')
-rw-r--r-- | src/thread_pool.zig | 794
1 file changed, 794 insertions, 0 deletions
diff --git a/src/thread_pool.zig b/src/thread_pool.zig
new file mode 100644
index 000000000..38c102a77
--- /dev/null
+++ b/src/thread_pool.zig
@@ -0,0 +1,794 @@
+// Thank you @kprotty.
+// https://github.com/kprotty/zap/blob/blog/src/thread_pool.zig
+
+const std = @import("std");
+const ThreadPool = @This();
+const Futex = @import("./futex.zig");
+
+const assert = std.debug.assert;
+const Atomic = std.atomic.Atomic;
+
+stack_size: u32,
+max_threads: u32,
+sync: Atomic(u32) = Atomic(u32).init(@bitCast(u32, Sync{})),
+idle_event: Event = .{},
+join_event: Event = .{},
+run_queue: Node.Queue = .{},
+threads: Atomic(?*Thread) = Atomic(?*Thread).init(null),
+
+const Sync = packed struct {
+    /// Tracks the number of threads not searching for Tasks
+    idle: u14 = 0,
+    /// Tracks the number of threads spawned
+    spawned: u14 = 0,
+    /// What you see is what you get
+    unused: bool = false,
+    /// Used to not miss notifications while state = waking
+    notified: bool = false,
+    /// The current state of the thread pool
+    state: enum(u2) {
+        /// A notification can be issued to wake up a sleeping thread as the "waking thread".
+        pending = 0,
+        /// The state was notified with a signal. A thread is woken up.
+        /// The first thread to transition to `waking` becomes the "waking thread".
+        signaled,
+        /// There is a "waking thread" among us.
+        /// No other thread should be woken up until the waking thread transitions the state.
+        waking,
+        /// The thread pool was terminated. Start decrementing `spawned` so that it can be joined.
+        shutdown,
+    } = .pending,
+};
+
+/// Configuration options for the thread pool.
+/// TODO: add CPU core affinity?
+pub const Config = struct {
+    stack_size: u32 = (std.Thread.SpawnConfig{}).stack_size,
+    max_threads: u32,
+};
+
+/// Statically initialize the thread pool using the configuration.
+pub fn init(config: Config) ThreadPool {
+    return .{
+        .stack_size = std.math.max(1, config.stack_size),
+        .max_threads = std.math.max(1, config.max_threads),
+    };
+}
+
+/// Wait for a thread to call shutdown() on the thread pool and kill the worker threads.
+pub fn deinit(self: *ThreadPool) void {
+    self.join();
+    self.* = undefined;
+}
+
+/// A Task represents the unit of Work / Job / Execution that the ThreadPool schedules.
+/// The user provides a `callback` which is invoked when the Task can run on a thread.
+pub const Task = struct {
+    node: Node = .{},
+    callback: fn (*Task) void,
+};
+
+/// An unordered collection of Tasks which can be submitted for scheduling as a group.
+pub const Batch = struct {
+    len: usize = 0,
+    head: ?*Task = null,
+    tail: ?*Task = null,
+
+    /// Create a batch from a single task.
+    pub fn from(task: *Task) Batch {
+        return Batch{
+            .len = 1,
+            .head = task,
+            .tail = task,
+        };
+    }
+
+    /// Push another batch into this one, taking ownership of its tasks.
+    pub fn push(self: *Batch, batch: Batch) void {
+        if (batch.len == 0) return;
+        if (self.len == 0) {
+            self.* = batch;
+        } else {
+            self.tail.?.node.next = if (batch.head) |h| &h.node else null;
+            self.tail = batch.tail;
+            self.len += batch.len;
+        }
+    }
+};
+
+/// Schedule a batch of tasks to be executed by some thread on the thread pool.
+pub fn schedule(self: *ThreadPool, batch: Batch) void {
+    // Sanity check
+    if (batch.len == 0) {
+        return;
+    }
+
+    // Extract the Nodes from the Tasks
+    var list = Node.List{
+        .head = &batch.head.?.node,
+        .tail = &batch.tail.?.node,
+    };
+
+    // Push the task Nodes to the most appropriate queue
+    if (Thread.current) |thread| {
+        thread.run_buffer.push(&list) catch thread.run_queue.push(list);
+    } else {
+        self.run_queue.push(list);
+    }
+
+    // Try to notify a thread
+    const is_waking = false;
+    return self.notify(is_waking);
+}
+
+inline fn notify(self: *ThreadPool, is_waking: bool) void {
+    // Fast path to check the Sync state to avoid calling into notifySlow().
+    // If we're waking, then we need to update the state regardless.
+    if (!is_waking) {
+        const sync = @bitCast(Sync, self.sync.load(.Monotonic));
+        if (sync.notified) {
+            return;
+        }
+    }
+
+    return self.notifySlow(is_waking);
+}
+
+noinline fn notifySlow(self: *ThreadPool, is_waking: bool) void {
+    var sync = @bitCast(Sync, self.sync.load(.Monotonic));
+    while (sync.state != .shutdown) {
+        const can_wake = is_waking or (sync.state == .pending);
+        if (is_waking) {
+            assert(sync.state == .waking);
+        }
+
+        var new_sync = sync;
+        new_sync.notified = true;
+        if (can_wake and sync.idle > 0) { // wake up an idle thread
+            new_sync.state = .signaled;
+        } else if (can_wake and sync.spawned < self.max_threads) { // spawn a new thread
+            new_sync.state = .signaled;
+            new_sync.spawned += 1;
+        } else if (is_waking) { // no other thread to pass on "waking" status
+            new_sync.state = .pending;
+        } else if (sync.notified) { // nothing to update
+            return;
+        }
+
+        // Release barrier synchronizes with Acquire in wait()
+        // to ensure pushes to run queues happen before observing a posted notification.
+        sync = @bitCast(Sync, self.sync.tryCompareAndSwap(
+            @bitCast(u32, sync),
+            @bitCast(u32, new_sync),
+            .Release,
+            .Monotonic,
+        ) orelse {
+            // We signaled to notify an idle thread
+            if (can_wake and sync.idle > 0) {
+                return self.idle_event.notify();
+            }
+
+            // We signaled to spawn a new thread
+            if (can_wake and sync.spawned < self.max_threads) {
+                const spawn_config = std.Thread.SpawnConfig{ .stack_size = self.stack_size };
+                const thread = std.Thread.spawn(spawn_config, Thread.run, .{self}) catch return self.unregister(null);
+                return thread.detach();
+            }
+
+            return;
+        });
+    }
+}
+
+noinline fn wait(self: *ThreadPool, _is_waking: bool) error{Shutdown}!bool {
+    var is_idle = false;
+    var is_waking = _is_waking;
+    var sync = @bitCast(Sync, self.sync.load(.Monotonic));
+
+    while (true) {
+        if (sync.state == .shutdown) return error.Shutdown;
+        if (is_waking) assert(sync.state == .waking);
+
+        // Consume a notification made by notify().
+        if (sync.notified) {
+            var new_sync = sync;
+            new_sync.notified = false;
+            if (is_idle)
+                new_sync.idle -= 1;
+            if (sync.state == .signaled)
+                new_sync.state = .waking;
+
+            // Acquire barrier synchronizes with notify()
+            // to ensure that pushes to the run queue are observed after wait() returns.
+            sync = @bitCast(Sync, self.sync.tryCompareAndSwap(
+                @bitCast(u32, sync),
+                @bitCast(u32, new_sync),
+                .Acquire,
+                .Monotonic,
+            ) orelse {
+                return is_waking or (sync.state == .signaled);
+            });
+
+        // No notification to consume.
+        // Mark this thread as idle before sleeping on the idle_event.
+        } else if (!is_idle) {
+            var new_sync = sync;
+            new_sync.idle += 1;
+            if (is_waking)
+                new_sync.state = .pending;
+
+            sync = @bitCast(Sync, self.sync.tryCompareAndSwap(
+                @bitCast(u32, sync),
+                @bitCast(u32, new_sync),
+                .Monotonic,
+                .Monotonic,
+            ) orelse {
+                is_waking = false;
+                is_idle = true;
+                continue;
+            });
+
+        // Wait for a signal by either notify() or shutdown() without wasting cpu cycles.
+        // TODO: Add I/O polling here.
+        } else {
+            self.idle_event.wait();
+            sync = @bitCast(Sync, self.sync.load(.Monotonic));
+        }
+    }
+}
+
+/// Marks the thread pool as shutdown
+pub noinline fn shutdown(self: *ThreadPool) void {
+    var sync = @bitCast(Sync, self.sync.load(.Monotonic));
+    while (sync.state != .shutdown) {
+        var new_sync = sync;
+        new_sync.notified = true;
+        new_sync.state = .shutdown;
+        new_sync.idle = 0;
+
+        // Full barrier to synchronize with both wait() and notify()
+        sync = @bitCast(Sync, self.sync.tryCompareAndSwap(
+            @bitCast(u32, sync),
+            @bitCast(u32, new_sync),
+            .AcqRel,
+            .Monotonic,
+        ) orelse {
+            // Wake up any threads sleeping on the idle_event.
+            // TODO: I/O polling notification here.
+            if (sync.idle > 0) self.idle_event.shutdown();
+            return;
+        });
+    }
+}
+
+fn register(noalias self: *ThreadPool, noalias thread: *Thread) void {
+    // Push the thread onto the threads stack in a lock-free manner.
+    var threads = self.threads.load(.Monotonic);
+    while (true) {
+        thread.next = threads;
+        threads = self.threads.tryCompareAndSwap(
+            threads,
+            thread,
+            .Release,
+            .Monotonic,
+        ) orelse break;
+    }
+}
+
+fn unregister(noalias self: *ThreadPool, noalias maybe_thread: ?*Thread) void {
+    // Un-spawn one thread, either due to a failed OS thread spawning or because the thread is exiting.
+    const one_spawned = @bitCast(u32, Sync{ .spawned = 1 });
+    const sync = @bitCast(Sync, self.sync.fetchSub(one_spawned, .Release));
+    assert(sync.spawned > 0);
+
+    // The last thread to exit must wake up the thread pool join()er
+    // who will start the chain to shutdown all the threads.
+    if (sync.state == .shutdown and sync.spawned == 1) {
+        self.join_event.notify();
+    }
+
+    // If this is a thread pool thread, wait for a shutdown signal by the thread pool join()er.
+    const thread = maybe_thread orelse return;
+    thread.join_event.wait();
+
+    // After receiving the shutdown signal, shutdown the next thread in the pool.
+    // We have to do that without touching the thread pool itself since its memory is invalidated by now.
+    // So just follow our .next link.
+    const next_thread = thread.next orelse return;
+    next_thread.join_event.notify();
+}
+
+fn join(self: *ThreadPool) void {
+    // Wait for the thread pool to be shutdown(), then for all threads to enter a joinable state.
+    var sync = @bitCast(Sync, self.sync.load(.Monotonic));
+    if (!(sync.state == .shutdown and sync.spawned == 0)) {
+        self.join_event.wait();
+        sync = @bitCast(Sync, self.sync.load(.Monotonic));
+    }
+
+    assert(sync.state == .shutdown);
+    assert(sync.spawned == 0);
+
+    // If there are threads, start off the chain by sending the first one the shutdown signal.
+    // That thread receives the shutdown signal and sends it to the next thread, and the next..
+    const thread = self.threads.load(.Acquire) orelse return;
+    thread.join_event.notify();
+}
+
+const Thread = struct {
+    next: ?*Thread = null,
+    target: ?*Thread = null,
+    join_event: Event = .{},
+    run_queue: Node.Queue = .{},
+    run_buffer: Node.Buffer = .{},
+
+    threadlocal var current: ?*Thread = null;
+
+    /// Thread entry point which runs a worker for the ThreadPool
+    fn run(thread_pool: *ThreadPool) void {
+        var self = Thread{};
+        current = &self;
+
+        thread_pool.register(&self);
+        defer thread_pool.unregister(&self);
+
+        var is_waking = false;
+        while (true) {
+            is_waking = thread_pool.wait(is_waking) catch return;
+
+            while (self.pop(thread_pool)) |result| {
+                if (result.pushed or is_waking)
+                    thread_pool.notify(is_waking);
+                is_waking = false;
+
+                const task = @fieldParentPtr(Task, "node", result.node);
+                (task.callback)(task);
+            }
+        }
+    }
+
+    /// Try to dequeue a Node/Task from the ThreadPool.
+    /// Spurious reports of dequeue() returning empty are allowed.
+    fn pop(noalias self: *Thread, noalias thread_pool: *ThreadPool) ?Node.Buffer.Stole {
+        // Check our local buffer first
+        if (self.run_buffer.pop()) |node| {
+            return Node.Buffer.Stole{
+                .node = node,
+                .pushed = false,
+            };
+        }
+
+        // Then check our local queue
+        if (self.run_buffer.consume(&self.run_queue)) |stole| {
+            return stole;
+        }
+
+        // Then the global queue
+        if (self.run_buffer.consume(&thread_pool.run_queue)) |stole| {
+            return stole;
+        }
+
+        // TODO: add optimistic I/O polling here
+
+        // Then try work stealing from other threads
+        var num_threads: u32 = @bitCast(Sync, thread_pool.sync.load(.Monotonic)).spawned;
+        while (num_threads > 0) : (num_threads -= 1) {
+            // Traverse the stack of registered threads on the thread pool
+            const target = self.target orelse thread_pool.threads.load(.Acquire) orelse unreachable;
+            self.target = target.next;
+
+            // Try to steal from their queue first to avoid contention (the target steals from its own queue last).
+            if (self.run_buffer.consume(&target.run_queue)) |stole| {
+                return stole;
+            }
+
+            // Skip stealing from the buffer if we're the target.
+            // We still steal from our own queue above given it may have just been locked the first time we tried.
+            if (target == self) {
+                continue;
+            }
+
+            // Steal from the buffer of a remote thread as a last resort
+            if (self.run_buffer.steal(&target.run_buffer)) |stole| {
+                return stole;
+            }
+        }
+
+        return null;
+    }
+};
+
+/// An event which stores 1 semaphore token and is thread-safe.
+/// The event can be shutdown(), waking up all wait()ing threads and
+/// making subsequent wait()'s return immediately.
+const Event = struct {
+    state: Atomic(u32) = Atomic(u32).init(EMPTY),
+
+    const EMPTY = 0;
+    const WAITING = 1;
+    const NOTIFIED = 2;
+    const SHUTDOWN = 3;
+
+    /// Wait for and consume a notification
+    /// or wait for the event to be shutdown entirely.
+    noinline fn wait(self: *Event) void {
+        var acquire_with: u32 = EMPTY;
+        var state = self.state.load(.Monotonic);
+
+        while (true) {
+            // If we're shutdown then exit early.
+            // Acquire barrier to ensure operations before the shutdown() are seen after the wait().
+            // Shutdown is rare, so it's better to have an Acquire barrier here instead of on the CAS failure + load which are common.
+            if (state == SHUTDOWN) {
+                std.atomic.fence(.Acquire);
+                return;
+            }
+
+            // Consume a notification when it pops up.
+            // Acquire barrier to ensure operations before the notify() appear after the wait().
+            if (state == NOTIFIED) {
+                state = self.state.tryCompareAndSwap(
+                    state,
+                    acquire_with,
+                    .Acquire,
+                    .Monotonic,
+                ) orelse return;
+                continue;
+            }
+
+            // There is no notification to consume; we should wait on the event by ensuring it's WAITING.
+            if (state != WAITING) blk: {
+                state = self.state.tryCompareAndSwap(
+                    state,
+                    WAITING,
+                    .Monotonic,
+                    .Monotonic,
+                ) orelse break :blk;
+                continue;
+            }
+
+            // Wait on the event until a notify() or shutdown().
+            // If we wake up to a notification, we must acquire it with WAITING instead of EMPTY
+            // since there may be other threads sleeping on the Futex who haven't been woken up yet.
+            //
+            // Acquiring to WAITING will make the next notify() or shutdown() wake a sleeping futex thread
+            // who will either exit on SHUTDOWN or acquire with WAITING again, ensuring all threads are awoken.
+            // This unfortunately results in the last notify() or shutdown() doing an extra futex wake, but that's fine.
+            Futex.wait(&self.state, WAITING, null) catch unreachable;
+            state = self.state.load(.Monotonic);
+            acquire_with = WAITING;
+        }
+    }
+
+    /// Post a notification to the event if it doesn't have one already,
+    /// then wake up a waiting thread if there is one as well.
+    fn notify(self: *Event) void {
+        return self.wake(NOTIFIED, 1);
+    }
+
+    /// Marks the event as shutdown, making all future wait()'s return immediately.
+    /// Then wakes up any threads currently waiting on the Event.
+    fn shutdown(self: *Event) void {
+        return self.wake(SHUTDOWN, std.math.maxInt(u32));
+    }
+
+    fn wake(self: *Event, release_with: u32, wake_threads: u32) void {
+        // Update the Event to notify it with the new `release_with` state (either NOTIFIED or SHUTDOWN).
+        // Release barrier to ensure any operations before this happen before other threads return from wait().
+        const state = self.state.swap(release_with, .Release);
+
+        // Only wake threads sleeping in futex if the state is WAITING.
+        // Avoids unnecessary wake ups.
+        if (state == WAITING) {
+            Futex.wake(&self.state, wake_threads);
+        }
+    }
+};
+
+/// Linked list intrusive memory node and lock-free data structures to operate with it
+const Node = struct {
+    next: ?*Node = null,
+
+    /// A linked list of Nodes
+    const List = struct {
+        head: *Node,
+        tail: *Node,
+    };
+
+    /// An unbounded multi-producer-(non blocking)-multi-consumer queue of Node pointers.
+    const Queue = struct {
+        stack: Atomic(usize) = Atomic(usize).init(0),
+        cache: ?*Node = null,
+
+        const HAS_CACHE: usize = 0b01;
+        const IS_CONSUMING: usize = 0b10;
+        const PTR_MASK: usize = ~(HAS_CACHE | IS_CONSUMING);
+
+        comptime {
+            assert(@alignOf(Node) >= ((IS_CONSUMING | HAS_CACHE) + 1));
+        }
+
+        fn push(noalias self: *Queue, list: List) void {
+            var stack = self.stack.load(.Monotonic);
+            while (true) {
+                // Attach the list to the stack (pt. 1)
+                list.tail.next = @intToPtr(?*Node, stack & PTR_MASK);
+
+                // Update the stack with the list (pt. 2).
+                // Don't change the HAS_CACHE and IS_CONSUMING bits of the consumer.
+                var new_stack = @ptrToInt(list.head);
+                assert(new_stack & ~PTR_MASK == 0);
+                new_stack |= (stack & ~PTR_MASK);
+
+                // Push to the stack with a release barrier for the consumer to see the proper list links.
+                stack = self.stack.tryCompareAndSwap(
+                    stack,
+                    new_stack,
+                    .Release,
+                    .Monotonic,
+                ) orelse break;
+            }
+        }
+
+        fn tryAcquireConsumer(self: *Queue) error{ Empty, Contended }!?*Node {
+            var stack = self.stack.load(.Monotonic);
+            while (true) {
+                if (stack & IS_CONSUMING != 0)
+                    return error.Contended; // The queue already has a consumer.
+                if (stack & (HAS_CACHE | PTR_MASK) == 0)
+                    return error.Empty; // The queue is empty when there's nothing cached and nothing in the stack.
+
+                // When we acquire the consumer, also consume the pushed stack if the cache is empty.
+                var new_stack = stack | HAS_CACHE | IS_CONSUMING;
+                if (stack & HAS_CACHE == 0) {
+                    assert(stack & PTR_MASK != 0);
+                    new_stack &= ~PTR_MASK;
+                }
+
+                // Acquire barrier on getting the consumer to see cache/Node updates done by previous consumers
+                // and to ensure our cache/Node updates in pop() happen after those of previous consumers.
+                stack = self.stack.tryCompareAndSwap(
+                    stack,
+                    new_stack,
+                    .Acquire,
+                    .Monotonic,
+                ) orelse return self.cache orelse @intToPtr(*Node, stack & PTR_MASK);
+            }
+        }
+
+        fn releaseConsumer(noalias self: *Queue, noalias consumer: ?*Node) void {
+            // Stop consuming and remove the HAS_CACHE bit as well if the consumer's cache is empty.
+            // When the HAS_CACHE bit is zeroed, the next consumer will acquire the pushed stack nodes.
+            var remove = IS_CONSUMING;
+            if (consumer == null)
+                remove |= HAS_CACHE;
+
+            // Release the consumer with a release barrier to ensure cache/node accesses
+            // happen before the consumer is released and before the next consumer starts using the cache.
+            self.cache = consumer;
+            const stack = self.stack.fetchSub(remove, .Release);
+            assert(stack & remove != 0);
+        }
+
+        fn pop(noalias self: *Queue, noalias consumer_ref: *?*Node) ?*Node {
+            // Check the consumer cache (fast path)
+            if (consumer_ref.*) |node| {
+                consumer_ref.* = node.next;
+                return node;
+            }
+
+            // Load the stack to see if there was anything pushed that we could grab.
+            var stack = self.stack.load(.Monotonic);
+            assert(stack & IS_CONSUMING != 0);
+            if (stack & PTR_MASK == 0) {
+                return null;
+            }
+
+            // Nodes have been pushed to the stack; grab them with an Acquire barrier to see the Node links.
+            stack = self.stack.swap(HAS_CACHE | IS_CONSUMING, .Acquire);
+            assert(stack & IS_CONSUMING != 0);
+            assert(stack & PTR_MASK != 0);
+
+            const node = @intToPtr(*Node, stack & PTR_MASK);
+            consumer_ref.* = node.next;
+            return node;
+        }
+    };
+
+    /// A bounded single-producer, multi-consumer ring buffer for node pointers.
+    const Buffer = struct {
+        head: Atomic(Index) = Atomic(Index).init(0),
+        tail: Atomic(Index) = Atomic(Index).init(0),
+        array: [capacity]Atomic(*Node) = undefined,
+
+        const Index = u32;
+        const capacity = 256; // Appears to be a pretty good trade-off in space vs contended throughput
+        comptime {
+            assert(std.math.maxInt(Index) >= capacity);
+            assert(std.math.isPowerOfTwo(capacity));
+        }
+
+        fn push(noalias self: *Buffer, noalias list: *List) error{Overflow}!void {
+            var head = self.head.load(.Monotonic);
+            var tail = self.tail.loadUnchecked(); // we're the only thread that can change this
+
+            while (true) {
+                var size = tail -% head;
+                assert(size <= capacity);
+
+                // Push nodes from the list to the buffer if it's not full..
+                if (size < capacity) {
+                    var nodes: ?*Node = list.head;
+                    while (size < capacity) : (size += 1) {
+                        const node = nodes orelse break;
+                        nodes = node.next;
+
+                        // Array written atomically with weakest ordering since it could be getting atomically read by steal().
+                        self.array[tail % capacity].store(node, .Unordered);
+                        tail +%= 1;
+                    }
+
+                    // Release barrier synchronizes with Acquire loads for steal()ers to see the array writes.
+                    self.tail.store(tail, .Release);
+
+                    // Update the list with the nodes we pushed to the buffer and try again if there's more.
+                    list.head = nodes orelse return;
+                    std.atomic.spinLoopHint();
+                    head = self.head.load(.Monotonic);
+                    continue;
+                }
+
+                // Try to steal/overflow half of the tasks in the buffer to make room for future push()es.
+                // Migrating half amortizes the cost of stealing while requiring future pops to still use the buffer.
+                // Acquire barrier to ensure the linked list creation after the steal only happens after we successfully steal.
+                var migrate = size / 2;
+                head = self.head.tryCompareAndSwap(
+                    head,
+                    head +% migrate,
+                    .Acquire,
+                    .Monotonic,
+                ) orelse {
+                    // Link the migrated Nodes together
+                    const first = self.array[head % capacity].loadUnchecked();
+                    while (migrate > 0) : (migrate -= 1) {
+                        const prev = self.array[head % capacity].loadUnchecked();
+                        head +%= 1;
+                        prev.next = self.array[head % capacity].loadUnchecked();
+                    }
+
+                    // Append the list that was supposed to be pushed to the end of the migrated Nodes
+                    const last = self.array[(head -% 1) % capacity].loadUnchecked();
+                    last.next = list.head;
+                    list.tail.next = null;
+
+                    // Return the migrated nodes + the original list as overflowed
+                    list.head = first;
+                    return error.Overflow;
+                };
+            }
+        }
+
+        fn pop(self: *Buffer) ?*Node {
+            var head = self.head.load(.Monotonic);
+            var tail = self.tail.loadUnchecked(); // we're the only thread that can change this
+
+            while (true) {
+                // Quick sanity check; return null when empty
+                var size = tail -% head;
+                assert(size <= capacity);
+                if (size == 0) {
+                    return null;
+                }
+
+                // Dequeue with an acquire barrier to ensure any writes done to the Node
+                // only happen after we successfully claim it from the array.
+                head = self.head.tryCompareAndSwap(
+                    head,
+                    head +% 1,
+                    .Acquire,
+                    .Monotonic,
+                ) orelse return self.array[head % capacity].loadUnchecked();
+            }
+        }
+
+        const Stole = struct {
+            node: *Node,
+            pushed: bool,
+        };
+
+        fn consume(noalias self: *Buffer, noalias queue: *Queue) ?Stole {
+            var consumer = queue.tryAcquireConsumer() catch return null;
+            defer queue.releaseConsumer(consumer);
+
+            const head = self.head.load(.Monotonic);
+            const tail = self.tail.loadUnchecked(); // we're the only thread that can change this
+
+            const size = tail -% head;
+            assert(size <= capacity);
+            assert(size == 0); // we should only be consuming if our array is empty
+
+            // Pop nodes from the queue and push them to our array.
+            // Atomic stores to the array as steal() threads may be atomically reading from it.
+            var pushed: Index = 0;
+            while (pushed < capacity) : (pushed += 1) {
+                const node = queue.pop(&consumer) orelse break;
+                self.array[(tail +% pushed) % capacity].store(node, .Unordered);
+            }
+
+            // We will be returning one node that we stole from the queue.
+            // Get an extra one, and if that's not possible, take one from our array.
+            const node = queue.pop(&consumer) orelse blk: {
+                if (pushed == 0) return null;
+                pushed -= 1;
+                break :blk self.array[(tail +% pushed) % capacity].loadUnchecked();
+            };
+
+            // Update the array tail with the nodes we pushed to it.
+            // Release barrier to synchronize with the Acquire barrier in steal() to see the written array Nodes.
+            if (pushed > 0) self.tail.store(tail +% pushed, .Release);
+            return Stole{
+                .node = node,
+                .pushed = pushed > 0,
+            };
+        }
+
+        fn steal(noalias self: *Buffer, noalias buffer: *Buffer) ?Stole {
+            const head = self.head.load(.Monotonic);
+            const tail = self.tail.loadUnchecked(); // we're the only thread that can change this
+
+            const size = tail -% head;
+            assert(size <= capacity);
+            assert(size == 0); // we should only be stealing if our array is empty
+
+            while (true) : (std.atomic.spinLoopHint()) {
+                const buffer_head = buffer.head.load(.Acquire);
+                const buffer_tail = buffer.tail.load(.Acquire);
+
+                // An overly large size indicates that the tail was updated a lot after the head was loaded.
+                // Reload both and try again.
+                const buffer_size = buffer_tail -% buffer_head;
+                if (buffer_size > capacity) {
+                    continue;
+                }
+
+                // Try to steal half (divCeil) to amortize the cost of stealing from other threads.
+                const steal_size = buffer_size - (buffer_size / 2);
+                if (steal_size == 0) {
+                    return null;
+                }
+
+                // Copy the nodes we will steal from the target's array to our own.
+                // Atomically load from the target buffer array as it may be pushing and atomically storing to it.
+                // Atomically store to our array as other steal() threads may be atomically loading from it as above.
+                var i: Index = 0;
+                while (i < steal_size) : (i += 1) {
+                    const node = buffer.array[(buffer_head +% i) % capacity].load(.Unordered);
+                    self.array[(tail +% i) % capacity].store(node, .Unordered);
+                }
+
+                // Try to commit the steal from the target buffer using:
+                // - an Acquire barrier to ensure that we only interact with the stolen Nodes after the steal was committed.
+                // - a Release barrier to ensure that the Nodes are copied above prior to the committing of the steal,
+                //   because if they're copied after the steal, they could be getting rewritten by the target's push().
+                _ = buffer.head.compareAndSwap(
+                    buffer_head,
+                    buffer_head +% steal_size,
+                    .AcqRel,
+                    .Monotonic,
+                ) orelse {
+                    // Pop one of the nodes we stole as we'll be returning it
+                    const pushed = steal_size - 1;
+                    const node = self.array[(tail +% pushed) % capacity].loadUnchecked();
+
+                    // Update the array tail with the nodes we pushed to it.
+                    // Release barrier to synchronize with the Acquire barrier in steal() to see the written array Nodes.
+                    if (pushed > 0) self.tail.store(tail +% pushed, .Release);
+                    return Stole{
+                        .node = node,
+                        .pushed = pushed > 0,
+                    };
+                };
+            }
+        }
+    };
+};
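For reference, below is a minimal usage sketch of the API this commit adds (not part of the diff): init() a pool, embed the intrusive Task in your own struct, schedule() it via a Batch, then shutdown() and deinit(). It assumes the Zig-0.9-era std this file targets; the Counter wrapper, main(), and the spin-wait are illustrative assumptions, since the pool itself provides no completion-waiting primitive and shutdown() does not drain pending tasks.

const std = @import("std");
const ThreadPool = @import("./thread_pool.zig");

// Hypothetical task wrapper: embeds the intrusive Task node next to its own state,
// then recovers the wrapper from the Task pointer inside the callback.
const Counter = struct {
    task: ThreadPool.Task = .{ .callback = run },
    value: std.atomic.Atomic(u32) = std.atomic.Atomic(u32).init(0),

    fn run(task: *ThreadPool.Task) void {
        const self = @fieldParentPtr(Counter, "task", task);
        _ = self.value.fetchAdd(1, .Monotonic);
    }
};

pub fn main() void {
    var pool = ThreadPool.init(.{ .max_threads = 4 });
    var counter = Counter{};

    // schedule() takes a Batch; Batch.from() wraps a single Task.
    pool.schedule(ThreadPool.Batch.from(&counter.task));

    // Crude completion wait: real callers pair the pool with their own
    // event or wait group, since workers exit on shutdown() regardless
    // of any tasks still queued.
    while (counter.value.load(.Monotonic) == 0) std.atomic.spinLoopHint();

    pool.shutdown(); // wake all workers and mark the pool terminated
    pool.deinit(); // join the worker threads
}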