diff --git a/.cspell.dict/cpython.txt b/.cspell.dict/cpython.txt index a35c7c8834e..b8081e25a9b 100644 --- a/.cspell.dict/cpython.txt +++ b/.cspell.dict/cpython.txt @@ -43,6 +43,7 @@ CONVFUNC convparam copyslot cpucount +datastack defaultdict denom deopt @@ -118,6 +119,7 @@ mult multibytecodec nameobj nameop +ncells nconsts newargs newfree @@ -125,6 +127,7 @@ NEWLOCALS newsemlockobject nfrees nkwargs +nlocalsplus nkwelts Nondescriptor noninteger @@ -192,6 +195,7 @@ testconsole ticketer tmptype tok_oldval +tstate tvars typeobject typeparam diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index 054e52ae81a..555336f059a 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -11,7 +11,7 @@ license.workspace = true [features] default = ["std"] std = [] -threading = ["parking_lot"] +threading = ["parking_lot", "std"] wasm_js = ["getrandom/wasm_js"] [dependencies] diff --git a/crates/common/src/lock.rs b/crates/common/src/lock.rs index f1d00627e90..af680010821 100644 --- a/crates/common/src/lock.rs +++ b/crates/common/src/lock.rs @@ -10,18 +10,28 @@ cfg_if::cfg_if! { if #[cfg(feature = "threading")] { pub use parking_lot::{RawMutex, RawRwLock, RawThreadId}; - pub use std::sync::{LazyLock, OnceLock as OnceCell}; + pub use std::sync::OnceLock as OnceCell; pub use core::cell::LazyCell; } else { mod cell_lock; pub use cell_lock::{RawCellMutex as RawMutex, RawCellRwLock as RawRwLock, SingleThreadId as RawThreadId}; pub use core::cell::{LazyCell, OnceCell}; + } +} - /// `core::cell::LazyCell` with `Sync` for use in `static` items. - /// SAFETY: Without threading, there can be no concurrent access. +// LazyLock: uses std::sync::LazyLock when std is available (even without +// threading, because Rust test runner uses parallel threads). +// Without std, uses a LazyCell wrapper (truly single-threaded only). +cfg_if::cfg_if! { + if #[cfg(any(feature = "threading", feature = "std"))] { + pub use std::sync::LazyLock; + } else { pub struct LazyLock T>(core::cell::LazyCell); - // SAFETY: Without threading, there can be no concurrent access. + // SAFETY: This branch is only active when both "std" and "threading" + // features are absent — i.e., truly single-threaded no_std environments + // (e.g., embedded or bare-metal WASM). Without std, the Rust runtime + // cannot spawn threads, so Sync is trivially satisfied. unsafe impl Sync for LazyLock {} impl T> LazyLock { diff --git a/crates/vm/src/builtins/frame.rs b/crates/vm/src/builtins/frame.rs index 04abe4772d7..4a8549239e7 100644 --- a/crates/vm/src/builtins/frame.rs +++ b/crates/vm/src/builtins/frame.rs @@ -692,7 +692,7 @@ impl Py { // Clear fastlocals // SAFETY: Frame is not executing (detached or stopped). { - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = unsafe { self.fastlocals_mut() }; for slot in fastlocals.iter_mut() { *slot = None; } diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs index 316f739c9c9..1316dd7b725 100644 --- a/crates/vm/src/builtins/function.rs +++ b/crates/vm/src/builtins/function.rs @@ -236,7 +236,7 @@ impl PyFunction { // https://github.com/python/cpython/blob/main/Python/ceval.c#L3681 // SAFETY: Frame was just created and not yet executing. - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; let mut args_iter = func_args.args.into_iter(); @@ -562,6 +562,7 @@ impl Py { let is_gen = code.flags.contains(bytecode::CodeFlags::GENERATOR); let is_coro = code.flags.contains(bytecode::CodeFlags::COROUTINE); + let use_datastack = !(is_gen || is_coro); // Construct frame: let frame = Frame::new( @@ -570,6 +571,7 @@ impl Py { self.builtins.clone(), self.closure.as_ref().map_or(&[], |c| c.as_slice()), Some(self.to_owned().into()), + use_datastack, vm, ) .into_ref(&vm.ctx); @@ -594,7 +596,16 @@ impl Py { frame.set_generator(&obj); Ok(obj) } - (false, false) => vm.run_frame(frame), + (false, false) => { + let result = vm.run_frame(frame.clone()); + // Release data stack memory after frame execution completes. + unsafe { + if let Some(base) = frame.materialize_localsplus() { + vm.datastack_pop(base); + } + } + result + } } } @@ -665,13 +676,14 @@ impl Py { self.builtins.clone(), self.closure.as_ref().map_or(&[], |c| c.as_slice()), Some(self.to_owned().into()), + true, // Always use datastack (invoke_exact_args is never gen/coro) vm, ) .into_ref(&vm.ctx); // Move args directly into fastlocals (no clone/refcount needed) { - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..)) { *slot = Some(arg); } @@ -679,14 +691,20 @@ impl Py { // Handle cell2arg if let Some(cell2arg) = code.cell2arg.as_deref() { - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) { let x = fastlocals[*arg_idx as usize].take(); frame.set_cell_contents(cell_idx, x); } } - vm.run_frame(frame) + let result = vm.run_frame(frame.clone()); + unsafe { + if let Some(base) = frame.materialize_localsplus() { + vm.datastack_pop(base); + } + } + result } } @@ -1291,26 +1309,33 @@ pub(crate) fn vectorcall_function( zelf.builtins.clone(), zelf.closure.as_ref().map_or(&[], |c| c.as_slice()), Some(zelf.to_owned().into()), + true, // Always use datastack (is_simple excludes gen/coro) vm, ) .into_ref(&vm.ctx); { - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..nargs)) { *slot = Some(arg); } } if let Some(cell2arg) = code.cell2arg.as_deref() { - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) { let x = fastlocals[*arg_idx as usize].take(); frame.set_cell_contents(cell_idx, x); } } - return vm.run_frame(frame); + let result = vm.run_frame(frame.clone()); + unsafe { + if let Some(base) = frame.materialize_localsplus() { + vm.datastack_pop(base); + } + } + return result; } // SLOW PATH: construct FuncArgs from owned Vec and delegate to invoke() diff --git a/crates/vm/src/builtins/super.rs b/crates/vm/src/builtins/super.rs index 4fb81b6e591..01bdfa6749e 100644 --- a/crates/vm/src/builtins/super.rs +++ b/crates/vm/src/builtins/super.rs @@ -86,7 +86,7 @@ impl Initializer for PySuper { return Err(vm.new_runtime_error("super(): no arguments")); } // SAFETY: Frame is current and not concurrently mutated. - let obj = unsafe { frame.fastlocals.borrow() }[0] + let obj = unsafe { frame.fastlocals() }[0] .clone() .or_else(|| { if let Some(cell2arg) = frame.code.cell2arg.as_deref() { diff --git a/crates/vm/src/datastack.rs b/crates/vm/src/datastack.rs new file mode 100644 index 00000000000..b00f1b6dc19 --- /dev/null +++ b/crates/vm/src/datastack.rs @@ -0,0 +1,256 @@ +/// Thread data stack for interpreter frames (`_PyStackChunk` / +/// `tstate->datastack_*`). +/// +/// A linked list of chunks providing bump allocation for frame-local data +/// (localsplus arrays). Normal function calls allocate via `push()` +/// (pointer bump). Generators and coroutines use heap-allocated storage. +use alloc::alloc::{alloc, dealloc}; +use core::alloc::Layout; +use core::ptr; + +/// Minimum chunk size in bytes (`_PY_DATA_STACK_CHUNK_SIZE`). +const MIN_CHUNK_SIZE: usize = 16 * 1024; + +/// Extra headroom (in bytes) to avoid allocating a new chunk for the next +/// frame right after growing. +const MINIMUM_OVERHEAD: usize = 1000 * core::mem::size_of::(); + +/// Alignment for all data stack allocations. +const ALIGN: usize = 16; + +/// Header for a data stack chunk. The usable data region starts right after +/// this header (aligned to `ALIGN`). +#[repr(C)] +struct DataStackChunk { + /// Previous chunk in the linked list (NULL for the root chunk). + previous: *mut DataStackChunk, + /// Total allocation size in bytes (including this header). + size: usize, + /// Saved `top` offset when a newer chunk was pushed. Used to restore + /// `DataStack::top` when popping back to this chunk. + saved_top: usize, +} + +impl DataStackChunk { + /// Pointer to the first usable byte after the header (aligned). + #[inline(always)] + fn data_start(&self) -> *mut u8 { + let header_end = (self as *const Self as usize) + core::mem::size_of::(); + let aligned = (header_end + ALIGN - 1) & !(ALIGN - 1); + aligned as *mut u8 + } + + /// Pointer past the last usable byte. + #[inline(always)] + fn data_limit(&self) -> *mut u8 { + unsafe { (self as *const Self as *mut u8).add(self.size) } + } +} + +/// Per-thread data stack for bump-allocating frame-local data. +pub struct DataStack { + /// Current chunk. + chunk: *mut DataStackChunk, + /// Current allocation position within the current chunk. + top: *mut u8, + /// End of usable space in the current chunk. + limit: *mut u8, +} + +impl DataStack { + /// Create a new data stack with an initial root chunk. + pub fn new() -> Self { + let chunk = Self::alloc_chunk(MIN_CHUNK_SIZE, ptr::null_mut()); + let top = unsafe { (*chunk).data_start() }; + let limit = unsafe { (*chunk).data_limit() }; + // Skip one ALIGN-sized slot in the root chunk so that `pop()` never + // frees it (`push_chunk` convention). + let top = unsafe { top.add(ALIGN) }; + Self { chunk, top, limit } + } + + /// Check if the current chunk has at least `size` bytes available. + #[inline(always)] + pub fn has_space(&self, size: usize) -> bool { + let aligned_size = (size + ALIGN - 1) & !(ALIGN - 1); + (self.limit as usize).saturating_sub(self.top as usize) >= aligned_size + } + + /// Allocate `size` bytes from the data stack. + /// + /// Returns a pointer to the allocated region (aligned to `ALIGN`). + /// The caller must call `pop()` with the returned pointer when done + /// (LIFO order). + #[inline(always)] + pub fn push(&mut self, size: usize) -> *mut u8 { + let aligned_size = (size + ALIGN - 1) & !(ALIGN - 1); + unsafe { + if self.top.add(aligned_size) <= self.limit { + let ptr = self.top; + self.top = self.top.add(aligned_size); + ptr + } else { + self.push_slow(aligned_size) + } + } + } + + /// Slow path: allocate a new chunk and push from it. + #[cold] + #[inline(never)] + fn push_slow(&mut self, aligned_size: usize) -> *mut u8 { + let mut chunk_size = MIN_CHUNK_SIZE; + let needed = aligned_size + .checked_add(MINIMUM_OVERHEAD) + .and_then(|v| v.checked_add(core::mem::size_of::())) + .and_then(|v| v.checked_add(ALIGN)) + .expect("DataStack chunk size overflow"); + while chunk_size < needed { + chunk_size = chunk_size + .checked_mul(2) + .expect("DataStack chunk size overflow"); + } + // Save current position in old chunk. + unsafe { + (*self.chunk).saved_top = self.top as usize - self.chunk as usize; + } + let new_chunk = Self::alloc_chunk(chunk_size, self.chunk); + self.chunk = new_chunk; + let start = unsafe { (*new_chunk).data_start() }; + self.limit = unsafe { (*new_chunk).data_limit() }; + self.top = unsafe { start.add(aligned_size) }; + start + } + + /// Pop a previous allocation. `base` must be the pointer returned by + /// `push()`. Calls must be in LIFO order. + /// + /// # Safety + /// `base` must be a valid pointer returned by `push()` on this data stack, + /// and all allocations made after it must already have been popped. + #[inline(always)] + pub unsafe fn pop(&mut self, base: *mut u8) { + debug_assert!(!base.is_null()); + if self.is_in_current_chunk(base) { + // Common case: base is within the current chunk. + self.top = base; + } else { + // base is in a previous chunk — free the current chunk. + unsafe { self.pop_slow(base) }; + } + } + + /// Check if `ptr` falls within the current chunk's data area. + /// Both bounds are checked to handle non-monotonic allocation addresses + /// (e.g. on Windows where newer chunks may be at lower addresses). + #[inline(always)] + fn is_in_current_chunk(&self, ptr: *mut u8) -> bool { + let chunk_start = unsafe { (*self.chunk).data_start() }; + ptr >= chunk_start && ptr <= self.limit + } + + /// Slow path: pop back to a previous chunk. + #[cold] + #[inline(never)] + unsafe fn pop_slow(&mut self, base: *mut u8) { + loop { + let old_chunk = self.chunk; + let prev = unsafe { (*old_chunk).previous }; + debug_assert!(!prev.is_null(), "tried to pop past the root chunk"); + unsafe { Self::free_chunk(old_chunk) }; + self.chunk = prev; + self.limit = unsafe { (*prev).data_limit() }; + if self.is_in_current_chunk(base) { + self.top = base; + return; + } + } + } + + /// Allocate a new chunk. + fn alloc_chunk(size: usize, previous: *mut DataStackChunk) -> *mut DataStackChunk { + let layout = Layout::from_size_align(size, ALIGN).expect("invalid chunk layout"); + let ptr = unsafe { alloc(layout) }; + if ptr.is_null() { + alloc::alloc::handle_alloc_error(layout); + } + let chunk = ptr as *mut DataStackChunk; + unsafe { + (*chunk).previous = previous; + (*chunk).size = size; + (*chunk).saved_top = 0; + } + chunk + } + + /// Free a chunk. + unsafe fn free_chunk(chunk: *mut DataStackChunk) { + let size = unsafe { (*chunk).size }; + let layout = Layout::from_size_align(size, ALIGN).expect("invalid chunk layout"); + unsafe { dealloc(chunk as *mut u8, layout) }; + } +} + +// SAFETY: DataStack is per-thread and not shared. The raw pointers +// it contains point to memory exclusively owned by this DataStack. +unsafe impl Send for DataStack {} + +impl Default for DataStack { + fn default() -> Self { + Self::new() + } +} + +impl Drop for DataStack { + fn drop(&mut self) { + let mut chunk = self.chunk; + while !chunk.is_null() { + let prev = unsafe { (*chunk).previous }; + unsafe { Self::free_chunk(chunk) }; + chunk = prev; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn basic_push_pop() { + let mut ds = DataStack::new(); + let p1 = ds.push(64); + assert!(!p1.is_null()); + let p2 = ds.push(128); + assert!(!p2.is_null()); + assert!(p2 > p1); + unsafe { + ds.pop(p2); + ds.pop(p1); + } + } + + #[test] + fn cross_chunk_push_pop() { + let mut ds = DataStack::new(); + // Push enough to force a new chunk + let mut ptrs = Vec::new(); + for _ in 0..100 { + ptrs.push(ds.push(1024)); + } + // Pop all in reverse + for p in ptrs.into_iter().rev() { + unsafe { ds.pop(p) }; + } + } + + #[test] + fn alignment() { + let mut ds = DataStack::new(); + for size in [1, 7, 15, 16, 17, 31, 32, 33, 64, 100] { + let p = ds.push(size); + assert_eq!(p as usize % ALIGN, 0, "alignment violated for size {size}"); + unsafe { ds.pop(p) }; + } + } +} diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index c1a45fc10ce..29626d104da 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -46,7 +46,6 @@ use malachite_bigint::BigInt; use num_traits::Zero; use rustpython_common::atomic::{PyAtomic, Radium}; use rustpython_common::{ - boxvec::BoxVec, lock::{OnceCell, PyMutex}, wtf8::{Wtf8, Wtf8Buf, wtf8_concat}, }; @@ -67,19 +66,6 @@ enum UnwindReason { Raising { exception: PyBaseExceptionRef }, } -#[derive(Debug)] -struct FrameState { - // We need 1 stack per frame - /// The main data frame of the stack machine - stack: BoxVec>, - /// Cell and free variable references (cellvars + freevars). - cells_frees: Box<[PyCellRef]>, - /// Previous line number for LINE event suppression. - /// Stored here (not on ExecutingFrame) so it persists across - /// generator/coroutine suspend and resume. - prev_line: u32, -} - /// Tracks who owns a frame. // = `_PyFrameOwner` #[repr(i8)] @@ -104,53 +90,398 @@ impl FrameOwner { } } -/// Lock-free storage for local variables (localsplus). +/// Lock-free mutable storage for frame-internal data. /// /// # Safety -/// Mutable access is serialized by the frame's state mutex in `with_exec()`. -/// External readers (e.g. `f_locals`) must use `try_lock` on the state mutex: -/// if acquired, the frame is not executing and access is exclusive; if not, -/// the caller is on the same thread as `with_exec()` (trace callback) and -/// access is safe because frame execution is single-threaded. -pub struct FastLocals { - inner: UnsafeCell]>>, +/// Frame execution is single-threaded: only one thread at a time executes +/// a given frame (enforced by the owner field and generator running flag). +/// External readers (e.g. `f_locals`) are on the same thread as execution +/// (trace callback) or the frame is not executing. +struct FrameUnsafeCell(UnsafeCell); + +impl FrameUnsafeCell { + fn new(value: T) -> Self { + Self(UnsafeCell::new(value)) + } + + /// # Safety + /// Caller must ensure no concurrent mutable access. + #[inline(always)] + unsafe fn get(&self) -> *mut T { + self.0.get() + } +} + +// SAFETY: Frame execution is single-threaded. See FrameUnsafeCell doc. +#[cfg(feature = "threading")] +unsafe impl Send for FrameUnsafeCell {} +#[cfg(feature = "threading")] +unsafe impl Sync for FrameUnsafeCell {} + +/// Unified storage for local variables and evaluation stack. +/// +/// Memory layout (each slot is `usize`-sized): +/// `[0..nlocalsplus)` — fastlocals (`Option`) +/// `[nlocalsplus..nlocalsplus+stack_top)` — active evaluation stack (`Option`) +/// `[nlocalsplus+stack_top..capacity)` — unused stack capacity +/// +/// Both `Option` and `Option` are `usize`-sized +/// (niche optimization on NonNull / NonZeroUsize). The raw storage is +/// `usize` to unify them; typed access is provided through methods. +pub struct LocalsPlus { + /// Backing storage. + data: LocalsPlusData, + /// Number of fastlocals slots (nlocals + ncells + nfrees). + nlocalsplus: u32, + /// Current evaluation stack depth. + stack_top: u32, } -// SAFETY: Frame execution is serialized by the state mutex. +enum LocalsPlusData { + /// Heap-allocated storage (generators, coroutines, exec/eval frames). + Heap(Box<[usize]>), + /// Data stack allocated storage (normal function calls). + /// The pointer is valid while the enclosing data stack frame is alive. + DataStack { ptr: *mut usize, capacity: usize }, +} + +// SAFETY: DataStack variant points to thread-local DataStack memory. +// Frame execution is single-threaded (enforced by owner field). #[cfg(feature = "threading")] -unsafe impl Send for FastLocals {} +unsafe impl Send for LocalsPlusData {} #[cfg(feature = "threading")] -unsafe impl Sync for FastLocals {} +unsafe impl Sync for LocalsPlusData {} -impl FastLocals { - fn new(data: Box<[Option]>) -> Self { +const _: () = { + assert!(core::mem::size_of::>() == core::mem::size_of::()); + // PyStackRef size is checked in object/core.rs +}; + +impl LocalsPlus { + /// Create a new heap-backed LocalsPlus. All slots start as None (0). + fn new(nlocalsplus: usize, stacksize: usize) -> Self { + let capacity = nlocalsplus + .checked_add(stacksize) + .expect("LocalsPlus capacity overflow"); + let nlocalsplus_u32 = u32::try_from(nlocalsplus).expect("nlocalsplus exceeds u32"); Self { - inner: UnsafeCell::new(data), + data: LocalsPlusData::Heap(vec![0usize; capacity].into_boxed_slice()), + nlocalsplus: nlocalsplus_u32, + stack_top: 0, } } - /// # Safety - /// Caller must ensure exclusive access (frame state locked or frame - /// not executing). + /// Create a new LocalsPlus backed by the thread data stack. + /// All slots are zero-initialized. + /// + /// The caller must call `materialize_localsplus()` when the frame finishes + /// to migrate data to the heap, then `datastack_pop()` to free the memory. + fn new_on_datastack(nlocalsplus: usize, stacksize: usize, vm: &VirtualMachine) -> Self { + let capacity = nlocalsplus + .checked_add(stacksize) + .expect("LocalsPlus capacity overflow"); + let byte_size = capacity + .checked_mul(core::mem::size_of::()) + .expect("LocalsPlus byte size overflow"); + let nlocalsplus_u32 = u32::try_from(nlocalsplus).expect("nlocalsplus exceeds u32"); + let ptr = vm.datastack_push(byte_size) as *mut usize; + // Zero-initialize all slots (0 = None for both PyObjectRef and PyStackRef). + unsafe { core::ptr::write_bytes(ptr, 0, capacity) }; + Self { + data: LocalsPlusData::DataStack { ptr, capacity }, + nlocalsplus: nlocalsplus_u32, + stack_top: 0, + } + } + + /// Migrate data-stack-backed storage to the heap, preserving all values. + /// Returns the data stack base pointer for `DataStack::pop()`. + /// Returns `None` if already heap-backed. + fn materialize_to_heap(&mut self) -> Option<*mut u8> { + if let LocalsPlusData::DataStack { ptr, capacity } = &self.data { + let base = *ptr as *mut u8; + let heap_data = unsafe { core::slice::from_raw_parts(*ptr, *capacity) } + .to_vec() + .into_boxed_slice(); + self.data = LocalsPlusData::Heap(heap_data); + Some(base) + } else { + None + } + } + + /// Drop all contained values without freeing the backing storage. + fn drop_values(&mut self) { + self.stack_clear(); + let fastlocals = self.fastlocals_mut(); + for slot in fastlocals.iter_mut() { + let _ = slot.take(); + } + } + + // -- Data access helpers -- + #[inline(always)] - pub unsafe fn borrow(&self) -> &[Option] { - unsafe { &*self.inner.get() } + fn data_as_slice(&self) -> &[usize] { + match &self.data { + LocalsPlusData::Heap(b) => b, + LocalsPlusData::DataStack { ptr, capacity } => unsafe { + core::slice::from_raw_parts(*ptr, *capacity) + }, + } } - /// # Safety - /// Caller must ensure exclusive mutable access. #[inline(always)] - #[allow(clippy::mut_from_ref)] - pub unsafe fn borrow_mut(&self) -> &mut [Option] { - unsafe { &mut *self.inner.get() } + fn data_as_mut_slice(&mut self) -> &mut [usize] { + match &mut self.data { + LocalsPlusData::Heap(b) => b, + LocalsPlusData::DataStack { ptr, capacity } => unsafe { + core::slice::from_raw_parts_mut(*ptr, *capacity) + }, + } + } + + /// Total capacity (fastlocals + stack). + #[inline(always)] + fn capacity(&self) -> usize { + match &self.data { + LocalsPlusData::Heap(b) => b.len(), + LocalsPlusData::DataStack { capacity, .. } => *capacity, + } + } + + /// Stack capacity (max stack depth). + #[inline(always)] + fn stack_capacity(&self) -> usize { + self.capacity() - self.nlocalsplus as usize + } + + // -- Fastlocals access -- + + /// Immutable access to fastlocals as `Option` slice. + #[inline(always)] + fn fastlocals(&self) -> &[Option] { + let data = self.data_as_slice(); + let ptr = data.as_ptr() as *const Option; + unsafe { core::slice::from_raw_parts(ptr, self.nlocalsplus as usize) } + } + + /// Mutable access to fastlocals as `Option` slice. + #[inline(always)] + fn fastlocals_mut(&mut self) -> &mut [Option] { + let nlocalsplus = self.nlocalsplus as usize; + let data = self.data_as_mut_slice(); + let ptr = data.as_mut_ptr() as *mut Option; + unsafe { core::slice::from_raw_parts_mut(ptr, nlocalsplus) } + } + + // -- Stack access -- + + /// Current stack depth. + #[inline(always)] + fn stack_len(&self) -> usize { + self.stack_top as usize + } + + /// Whether the stack is empty. + #[inline(always)] + fn stack_is_empty(&self) -> bool { + self.stack_top == 0 + } + + /// Push a value onto the evaluation stack. + #[inline(always)] + fn stack_push(&mut self, val: Option) { + let idx = self.nlocalsplus as usize + self.stack_top as usize; + debug_assert!( + idx < self.capacity(), + "stack overflow: stack_top={}, capacity={}", + self.stack_top, + self.stack_capacity() + ); + let data = self.data_as_mut_slice(); + data[idx] = unsafe { core::mem::transmute::, usize>(val) }; + self.stack_top += 1; + } + + /// Try to push; returns Err if stack is full. + #[inline(always)] + fn stack_try_push(&mut self, val: Option) -> Result<(), Option> { + let idx = self.nlocalsplus as usize + self.stack_top as usize; + if idx >= self.capacity() { + return Err(val); + } + let data = self.data_as_mut_slice(); + data[idx] = unsafe { core::mem::transmute::, usize>(val) }; + self.stack_top += 1; + Ok(()) + } + + /// Pop a value from the evaluation stack. + #[inline(always)] + fn stack_pop(&mut self) -> Option { + debug_assert!(self.stack_top > 0, "stack underflow"); + self.stack_top -= 1; + let idx = self.nlocalsplus as usize + self.stack_top as usize; + let data = self.data_as_mut_slice(); + let raw = core::mem::replace(&mut data[idx], 0); + unsafe { core::mem::transmute::>(raw) } + } + + /// Immutable view of the active stack as `Option` slice. + #[inline(always)] + fn stack_as_slice(&self) -> &[Option] { + let data = self.data_as_slice(); + let base = self.nlocalsplus as usize; + let ptr = unsafe { (data.as_ptr().add(base)) as *const Option }; + unsafe { core::slice::from_raw_parts(ptr, self.stack_top as usize) } + } + + /// Get a reference to a stack slot by index from the bottom. + #[inline(always)] + fn stack_index(&self, idx: usize) -> &Option { + debug_assert!(idx < self.stack_top as usize); + let data = self.data_as_slice(); + let raw_idx = self.nlocalsplus as usize + idx; + unsafe { &*(data.as_ptr().add(raw_idx) as *const Option) } + } + + /// Get a mutable reference to a stack slot by index from the bottom. + #[inline(always)] + fn stack_index_mut(&mut self, idx: usize) -> &mut Option { + debug_assert!(idx < self.stack_top as usize); + let raw_idx = self.nlocalsplus as usize + idx; + let data = self.data_as_mut_slice(); + unsafe { &mut *(data.as_mut_ptr().add(raw_idx) as *mut Option) } + } + + /// Get the last stack element (top of stack). + #[inline(always)] + fn stack_last(&self) -> Option<&Option> { + if self.stack_top == 0 { + None + } else { + Some(self.stack_index(self.stack_top as usize - 1)) + } + } + + /// Get mutable reference to the last stack element. + #[inline(always)] + fn stack_last_mut(&mut self) -> Option<&mut Option> { + if self.stack_top == 0 { + None + } else { + let idx = self.stack_top as usize - 1; + Some(self.stack_index_mut(idx)) + } + } + + /// Swap two stack elements. + #[inline(always)] + fn stack_swap(&mut self, a: usize, b: usize) { + let base = self.nlocalsplus as usize; + let data = self.data_as_mut_slice(); + data.swap(base + a, base + b); + } + + /// Truncate the stack to `new_len` elements, dropping excess values. + fn stack_truncate(&mut self, new_len: usize) { + debug_assert!(new_len <= self.stack_top as usize); + while self.stack_top as usize > new_len { + let _ = self.stack_pop(); + } + } + + /// Clear the stack, dropping all values. + fn stack_clear(&mut self) { + while self.stack_top > 0 { + let _ = self.stack_pop(); + } + } + + /// Drain stack elements from `from` to the end, returning an iterator + /// that yields `Option` in forward order and shrinks the stack. + fn stack_drain( + &mut self, + from: usize, + ) -> impl ExactSizeIterator> + '_ { + let end = self.stack_top as usize; + debug_assert!(from <= end); + // Reduce stack_top now; the drain iterator owns the elements. + self.stack_top = from as u32; + LocalsPlusStackDrain { + localsplus: self, + current: from, + end, + } + } + + /// Extend the stack with values from an iterator. + fn stack_extend(&mut self, iter: impl Iterator>) { + for val in iter { + self.stack_push(val); + } + } +} + +/// Iterator for draining stack elements in forward order. +struct LocalsPlusStackDrain<'a> { + localsplus: &'a mut LocalsPlus, + /// Current read position (stack-relative index). + current: usize, + /// End position (exclusive, stack-relative index). + end: usize, +} + +impl Iterator for LocalsPlusStackDrain<'_> { + type Item = Option; + + fn next(&mut self) -> Option { + if self.current >= self.end { + return None; + } + let idx = self.localsplus.nlocalsplus as usize + self.current; + let data = self.localsplus.data_as_mut_slice(); + let raw = core::mem::replace(&mut data[idx], 0); + self.current += 1; + Some(unsafe { core::mem::transmute::>(raw) }) + } + + fn size_hint(&self) -> (usize, Option) { + let remaining = self.end - self.current; + (remaining, Some(remaining)) } } -unsafe impl Traverse for FastLocals { - fn traverse(&self, traverse_fn: &mut TraverseFn<'_>) { - // SAFETY: GC runs on the same thread; no concurrent mutation. - let data = unsafe { &*self.inner.get() }; - data.traverse(traverse_fn); +impl ExactSizeIterator for LocalsPlusStackDrain<'_> {} + +impl Drop for LocalsPlusStackDrain<'_> { + fn drop(&mut self) { + while self.current < self.end { + let idx = self.localsplus.nlocalsplus as usize + self.current; + let data = self.localsplus.data_as_mut_slice(); + let raw = core::mem::replace(&mut data[idx], 0); + let _ = unsafe { core::mem::transmute::>(raw) }; + self.current += 1; + } + } +} + +impl Drop for LocalsPlus { + fn drop(&mut self) { + // drop_values handles both stack and fastlocals. + // For DataStack-backed storage, the caller should have called + // materialize_localsplus() + datastack_pop() before drop. + // If not (e.g. panic), the DataStack memory is leaked but + // values are still dropped safely. + self.drop_values(); + } +} + +unsafe impl Traverse for LocalsPlus { + fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { + self.fastlocals().traverse(tracer_fn); + self.stack_as_slice().traverse(tracer_fn); } } @@ -239,7 +570,8 @@ pub struct Frame { pub code: PyRef, pub func_obj: Option, - pub fastlocals: FastLocals, + /// Unified storage for local variables and evaluation stack. + localsplus: FrameUnsafeCell, pub locals: FrameLocals, pub globals: PyDictRef, pub builtins: PyObjectRef, @@ -248,7 +580,11 @@ pub struct Frame { pub lasti: PyAtomic, /// tracer function for this frame (usually is None) pub trace: PyMutex, - state: PyMutex, + + /// Cell and free variable references (cellvars + freevars). + cells_frees: FrameUnsafeCell>, + /// Previous line number for LINE event suppression. + prev_line: FrameUnsafeCell, // member pub trace_lines: PyMutex, @@ -284,25 +620,20 @@ impl PyPayload for Frame { } } -unsafe impl Traverse for FrameState { - fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { - self.stack.traverse(tracer_fn); - self.cells_frees.traverse(tracer_fn); - } -} - unsafe impl Traverse for Frame { fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { self.code.traverse(tracer_fn); self.func_obj.traverse(tracer_fn); - self.fastlocals.traverse(tracer_fn); + // SAFETY: GC traversal does not run concurrently with frame execution. + unsafe { + (*self.localsplus.get()).traverse(tracer_fn); + (*self.cells_frees.get()).traverse(tracer_fn); + } self.locals.traverse(tracer_fn); self.globals.traverse(tracer_fn); self.builtins.traverse(tracer_fn); self.trace.traverse(tracer_fn); - self.state.traverse(tracer_fn); self.temporary_refs.traverse(tracer_fn); - // generator is a borrowed reference, not traversed } } @@ -322,6 +653,7 @@ impl Frame { builtins: PyObjectRef, closure: &[PyCellRef], func_obj: Option, + use_datastack: bool, vm: &VirtualMachine, ) -> Self { let nlocals = code.varnames.len(); @@ -334,23 +666,24 @@ impl Frame { .chain(closure.iter().cloned()) .collect(); - // Extend fastlocals to include varnames + cellvars + freevars (localsplus) - let total_locals = nlocals + num_cells + nfrees; - let mut fastlocals_vec: Vec> = vec![None; total_locals]; + let nlocalsplus = nlocals + .checked_add(num_cells) + .and_then(|v| v.checked_add(nfrees)) + .expect("Frame::new: nlocalsplus overflow"); + let max_stackdepth = code.max_stackdepth as usize; + let mut localsplus = if use_datastack { + LocalsPlus::new_on_datastack(nlocalsplus, max_stackdepth, vm) + } else { + LocalsPlus::new(nlocalsplus, max_stackdepth) + }; // Store cell objects at cellvars and freevars positions for (i, cell) in cells_frees.iter().enumerate() { - fastlocals_vec[nlocals + i] = Some(cell.clone().into()); + localsplus.fastlocals_mut()[nlocals + i] = Some(cell.clone().into()); } - let state = FrameState { - stack: BoxVec::new(code.max_stackdepth as usize), - cells_frees, - prev_line: 0, - }; - Self { - fastlocals: FastLocals::new(fastlocals_vec.into_boxed_slice()), + localsplus: FrameUnsafeCell::new(localsplus), locals: match scope.locals { Some(locals) => FrameLocals::with_locals(locals), None if code.flags.contains(bytecode::CodeFlags::NEWLOCALS) => FrameLocals::lazy(), @@ -363,7 +696,8 @@ impl Frame { code, func_obj, lasti: Radium::new(0), - state: PyMutex::new(state), + cells_frees: FrameUnsafeCell::new(cells_frees), + prev_line: FrameUnsafeCell::new(0), trace: PyMutex::new(vm.ctx.none()), trace_lines: PyMutex::new(true), trace_opcodes: PyMutex::new(false), @@ -377,12 +711,45 @@ impl Frame { } } + /// Access fastlocals immutably. + /// + /// # Safety + /// Caller must ensure no concurrent mutable access (frame not executing, + /// or called from the same thread during trace callback). + #[inline(always)] + pub unsafe fn fastlocals(&self) -> &[Option] { + unsafe { (*self.localsplus.get()).fastlocals() } + } + + /// Access fastlocals mutably. + /// + /// # Safety + /// Caller must ensure exclusive access (frame not executing). + #[inline(always)] + #[allow(clippy::mut_from_ref)] + pub unsafe fn fastlocals_mut(&self) -> &mut [Option] { + unsafe { (*self.localsplus.get()).fastlocals_mut() } + } + + /// Migrate data-stack-backed storage to the heap, preserving all values, + /// and return the data stack base pointer for `DataStack::pop()`. + /// Returns `None` if already heap-backed. + /// + /// # Safety + /// Caller must ensure the frame is not executing and the returned + /// pointer is passed to `VirtualMachine::datastack_pop()`. + pub(crate) unsafe fn materialize_localsplus(&self) -> Option<*mut u8> { + unsafe { (*self.localsplus.get()).materialize_to_heap() } + } + /// Clear evaluation stack and state-owned cell/free references. /// For full local/cell cleanup, call `clear_locals_and_stack()`. pub(crate) fn clear_stack_and_cells(&self) { - let mut state = self.state.lock(); - state.stack.clear(); - let _old = core::mem::take(&mut state.cells_frees); + // SAFETY: Called when frame is not executing (generator closed). + unsafe { + (*self.localsplus.get()).stack_clear(); + let _old = core::mem::take(&mut *self.cells_frees.get()); + } } /// Clear locals and stack after generator/coroutine close. @@ -390,7 +757,7 @@ impl Frame { pub(crate) fn clear_locals_and_stack(&self) { self.clear_stack_and_cells(); // SAFETY: Frame is not executing (generator closed). - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = unsafe { (*self.localsplus.get()).fastlocals_mut() }; for slot in fastlocals.iter_mut() { *slot = None; } @@ -400,7 +767,7 @@ impl Frame { pub(crate) fn get_cell_contents(&self, cell_idx: usize) -> Option { let nlocals = self.code.varnames.len(); // SAFETY: Frame not executing; no concurrent mutation. - let fastlocals = unsafe { self.fastlocals.borrow() }; + let fastlocals = unsafe { (*self.localsplus.get()).fastlocals() }; fastlocals .get(nlocals + cell_idx) .and_then(|slot| slot.as_ref()) @@ -410,7 +777,8 @@ impl Frame { /// Set cell contents by cell index. Only safe to call before frame execution starts. pub(crate) fn set_cell_contents(&self, cell_idx: usize, value: Option) { - self.state.lock().cells_frees[cell_idx].set(value); + // SAFETY: Called before frame execution starts. + unsafe { (*self.cells_frees.get())[cell_idx].set(value) }; } /// Store a borrowed back-reference to the owning generator/coroutine. @@ -469,7 +837,7 @@ impl Frame { } let code = &**self.code; // SAFETY: Called before generator resume; no concurrent access. - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = unsafe { (*self.localsplus.get()).fastlocals_mut() }; let locals_map = self.locals.mapping(vm); for (i, &varname) in code.varnames.iter().enumerate() { if i >= fastlocals.len() { @@ -486,19 +854,15 @@ impl Frame { } pub fn locals(&self, vm: &VirtualMachine) -> PyResult { - // Acquire the state mutex to synchronize with frame execution. - // If try_lock fails, the frame is executing on this thread (e.g. - // trace callback accessing f_locals), so fastlocals access is safe. - let _guard = self.state.try_lock(); + // SAFETY: Either the frame is not executing (caller checked owner), + // or we're in a trace callback on the same thread that's executing. let locals = &self.locals; let code = &**self.code; let map = &code.varnames; let j = core::cmp::min(map.len(), code.varnames.len()); let locals_map = locals.mapping(vm); if !code.varnames.is_empty() { - // SAFETY: Either _guard holds the state mutex (frame not executing), - // or we're in a trace callback on the same thread that holds it. - let fastlocals = unsafe { self.fastlocals.borrow() }; + let fastlocals = unsafe { (*self.localsplus.get()).fastlocals() }; for (&k, v) in zip(&map[..j], fastlocals) { match locals_map.ass_subscript(k, v.clone(), vm) { Ok(()) => {} @@ -534,10 +898,12 @@ impl Frame { impl Py { #[inline(always)] fn with_exec(&self, vm: &VirtualMachine, f: impl FnOnce(ExecutingFrame<'_>) -> R) -> R { - let mut state = self.state.lock(); + // SAFETY: Frame execution is single-threaded. Only one thread at a time + // executes a given frame (enforced by the owner field and generator + // running flag). Same safety argument as FastLocals (UnsafeCell). let exec = ExecutingFrame { code: &self.code, - fastlocals: &self.fastlocals, + localsplus: unsafe { &mut *self.localsplus.get() }, locals: &self.locals, globals: &self.globals, builtins: &self.builtins, @@ -551,7 +917,8 @@ impl Py { }, lasti: &self.lasti, object: self, - state: &mut state, + cells_frees: unsafe { &mut *self.cells_frees.get() }, + prev_line: unsafe { &mut *self.prev_line.get() }, monitoring_mask: 0, }; f(exec) @@ -586,19 +953,24 @@ impl Py { } pub fn yield_from_target(&self) -> Option { - // Use try_lock to avoid deadlock when the frame is currently executing. - // A running coroutine has no yield-from target. - let mut state = self.state.try_lock()?; + // If the frame is currently executing (owned by thread), it has no + // yield-from target to report. + let owner = FrameOwner::from_i8(self.owner.load(atomic::Ordering::Acquire)); + if owner == FrameOwner::Thread { + return None; + } + // SAFETY: Frame is not executing, so UnsafeCell access is safe. let exec = ExecutingFrame { code: &self.code, - fastlocals: &self.fastlocals, + localsplus: unsafe { &mut *self.localsplus.get() }, locals: &self.locals, globals: &self.globals, builtins: &self.builtins, builtins_dict: None, lasti: &self.lasti, object: self, - state: &mut state, + cells_frees: unsafe { &mut *self.cells_frees.get() }, + prev_line: unsafe { &mut *self.prev_line.get() }, monitoring_mask: 0, }; exec.yield_from_target().map(PyObject::to_owned) @@ -623,11 +995,11 @@ impl Py { } } -/// An executing frame; essentially just a struct to combine the immutable data outside the mutex -/// with the mutable data inside +/// An executing frame; borrows mutable frame-internal data for the duration +/// of bytecode execution. struct ExecutingFrame<'a> { code: &'a PyRef, - fastlocals: &'a FastLocals, + localsplus: &'a mut LocalsPlus, locals: &'a FrameLocals, globals: &'a PyDictRef, builtins: &'a PyObjectRef, @@ -638,7 +1010,8 @@ struct ExecutingFrame<'a> { builtins_dict: Option<&'a PyExact>, object: &'a Py, lasti: &'a PyAtomic, - state: &'a mut FrameState, + cells_frees: &'a mut Box<[PyCellRef]>, + prev_line: &'a mut u32, /// Cached monitoring events mask. Reloaded at Resume instruction only, monitoring_mask: u32, } @@ -647,8 +1020,7 @@ impl fmt::Debug for ExecutingFrame<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("ExecutingFrame") .field("code", self.code) - // .field("scope", self.scope) - .field("state", self.state) + .field("stack_len", &self.localsplus.stack_len()) .finish() } } @@ -733,9 +1105,9 @@ impl ExecutingFrame<'_> { Instruction::Resume { .. } | Instruction::InstrumentedResume ) && let Some((loc, _)) = self.code.locations.get(idx) - && loc.line.get() as u32 != self.state.prev_line + && loc.line.get() as u32 != *self.prev_line { - self.state.prev_line = loc.line.get() as u32; + *self.prev_line = loc.line.get() as u32; vm.trace_event(crate::protocol::TraceEvent::Line, None)?; // Trace callback may have changed lasti via set_f_lineno. // Re-read and restart the loop from the new position. @@ -767,7 +1139,7 @@ impl ExecutingFrame<'_> { | Instruction::InstrumentedLine ) && let Some((loc, _)) = self.code.locations.get(idx) { - self.state.prev_line = loc.line.get() as u32; + *self.prev_line = loc.line.get() as u32; } // Fire 'opcode' trace event for sys.settrace when f_trace_opcodes @@ -987,7 +1359,7 @@ impl ExecutingFrame<'_> { // 3. Stack top is the delegate (receiver) // // First check if stack is empty - if so, we can't be in yield-from - if self.state.stack.is_empty() { + if self.localsplus.stack_is_empty() { return None; } let lasti = self.lasti() as usize; @@ -1032,7 +1404,7 @@ impl ExecutingFrame<'_> { // In CPython, _Py_call_instrumentation_line has a special case // for RESUME: it fires LINE even when prev_line == current_line. // Since gen_throw bypasses RESUME, we reset prev_line instead. - self.state.prev_line = 0; + *self.prev_line = 0; if let Some(jen) = self.yield_from_target() { // Check if the exception is GeneratorExit (type or instance). // For GeneratorExit, close the sub-iterator instead of throwing. @@ -1453,9 +1825,9 @@ impl ExecutingFrame<'_> { // CopyItem { index: 2 } copies second from top // This is 1-indexed to match CPython let idx = index.get(arg) as usize; - let stack_len = self.state.stack.len(); + let stack_len = self.localsplus.stack_len(); debug_assert!(stack_len >= idx, "CopyItem: stack underflow"); - let value = self.state.stack[stack_len - idx].clone(); + let value = self.localsplus.stack_index(stack_len - idx).clone(); self.push_stackref_opt(value); Ok(None) } @@ -1465,11 +1837,11 @@ impl ExecutingFrame<'_> { } Instruction::DeleteAttr { namei: idx } => self.delete_attr(vm, idx.get(arg)), Instruction::DeleteDeref { i } => { - self.state.cells_frees[i.get(arg) as usize].set(None); + self.cells_frees[i.get(arg) as usize].set(None); Ok(None) } Instruction::DeleteFast { var_num: idx } => { - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = self.localsplus.fastlocals_mut(); let idx = idx.get(arg) as usize; if fastlocals[idx].is_none() { return Err(vm.new_exception_msg( @@ -1645,7 +2017,7 @@ impl ExecutingFrame<'_> { } Instruction::GetANext => { #[cfg(debug_assertions)] // remove when GetANext is fully implemented - let orig_stack_len = self.state.stack.len(); + let orig_stack_len = self.localsplus.stack_len(); let aiter = self.top_value(); let awaitable = if aiter.class().is(vm.ctx.types.async_generator) { @@ -1685,7 +2057,7 @@ impl ExecutingFrame<'_> { }; self.push_value(awaitable); #[cfg(debug_assertions)] - debug_assert_eq!(orig_stack_len + 1, self.state.stack.len()); + debug_assert_eq!(orig_stack_len + 1, self.localsplus.stack_len()); Ok(None) } Instruction::GetAwaitable { r#where: oparg } => { @@ -1900,7 +2272,7 @@ impl ExecutingFrame<'_> { }; self.push_value(match value { Some(v) => v, - None => self.state.cells_frees[i] + None => self.cells_frees[i] .get() .ok_or_else(|| self.unbound_cell_exception(i, vm))?, }); @@ -1957,7 +2329,7 @@ impl ExecutingFrame<'_> { } Instruction::LoadDeref { i } => { let idx = i.get(arg) as usize; - let x = self.state.cells_frees[idx] + let x = self.cells_frees[idx] .get() .ok_or_else(|| self.unbound_cell_exception(idx, vm))?; self.push_value(x); @@ -1975,7 +2347,7 @@ impl ExecutingFrame<'_> { ) } let idx = idx.get(arg) as usize; - let x = unsafe { self.fastlocals.borrow() }[idx] + let x = self.localsplus.fastlocals()[idx] .clone() .ok_or_else(|| reference_error(self.code.varnames[idx], vm))?; self.push_value(x); @@ -1985,7 +2357,7 @@ impl ExecutingFrame<'_> { // Load value and clear the slot (for inlined comprehensions) // If slot is empty, push None (not an error - variable may not exist yet) let idx = idx.get(arg) as usize; - let x = unsafe { self.fastlocals.borrow_mut() }[idx] + let x = self.localsplus.fastlocals_mut()[idx] .take() .unwrap_or_else(|| vm.ctx.none()); self.push_value(x); @@ -1995,18 +2367,16 @@ impl ExecutingFrame<'_> { // Same as LoadFast but explicitly checks for unbound locals // (LoadFast in RustPython already does this check) let idx = idx.get(arg) as usize; - let x = unsafe { self.fastlocals.borrow() }[idx] - .clone() - .ok_or_else(|| { - vm.new_exception_msg( - vm.ctx.exceptions.unbound_local_error.to_owned(), - format!( - "local variable '{}' referenced before assignment", - self.code.varnames[idx] - ) - .into(), + let x = self.localsplus.fastlocals()[idx].clone().ok_or_else(|| { + vm.new_exception_msg( + vm.ctx.exceptions.unbound_local_error.to_owned(), + format!( + "local variable '{}' referenced before assignment", + self.code.varnames[idx] ) - })?; + .into(), + ) + })?; self.push_value(x); Ok(None) } @@ -2016,7 +2386,7 @@ impl ExecutingFrame<'_> { let oparg = packed.get(arg); let idx1 = (oparg >> 4) as usize; let idx2 = (oparg & 15) as usize; - let fastlocals = unsafe { self.fastlocals.borrow() }; + let fastlocals = self.localsplus.fastlocals(); let x1 = fastlocals[idx1].clone().ok_or_else(|| { vm.new_exception_msg( vm.ctx.exceptions.unbound_local_error.to_owned(), @@ -2046,18 +2416,16 @@ impl ExecutingFrame<'_> { // lifetime issues at yield/exception points are resolved. Instruction::LoadFastBorrow { var_num: idx } => { let idx = idx.get(arg) as usize; - let x = unsafe { self.fastlocals.borrow() }[idx] - .clone() - .ok_or_else(|| { - vm.new_exception_msg( - vm.ctx.exceptions.unbound_local_error.to_owned(), - format!( - "local variable '{}' referenced before assignment", - self.code.varnames[idx] - ) - .into(), + let x = self.localsplus.fastlocals()[idx].clone().ok_or_else(|| { + vm.new_exception_msg( + vm.ctx.exceptions.unbound_local_error.to_owned(), + format!( + "local variable '{}' referenced before assignment", + self.code.varnames[idx] ) - })?; + .into(), + ) + })?; self.push_value(x); Ok(None) } @@ -2065,7 +2433,7 @@ impl ExecutingFrame<'_> { let oparg = packed.get(arg); let idx1 = (oparg >> 4) as usize; let idx2 = (oparg & 15) as usize; - let fastlocals = unsafe { self.fastlocals.borrow() }; + let fastlocals = self.localsplus.fastlocals(); let x1 = fastlocals[idx1].clone().ok_or_else(|| { vm.new_exception_msg( vm.ctx.exceptions.unbound_local_error.to_owned(), @@ -2574,18 +2942,18 @@ impl ExecutingFrame<'_> { } Instruction::StoreDeref { i } => { let value = self.pop_value(); - self.state.cells_frees[i.get(arg) as usize].set(Some(value)); + self.cells_frees[i.get(arg) as usize].set(Some(value)); Ok(None) } Instruction::StoreFast { var_num: idx } => { let value = self.pop_value(); - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = self.localsplus.fastlocals_mut(); fastlocals[idx.get(arg) as usize] = Some(value); Ok(None) } Instruction::StoreFastLoadFast { var_nums } => { let value = self.pop_value(); - let locals = unsafe { self.fastlocals.borrow_mut() }; + let locals = self.localsplus.fastlocals_mut(); let oparg = var_nums.get(arg); locals[oparg.store_idx() as usize] = Some(value); let load_value = locals[oparg.load_idx() as usize] @@ -2600,7 +2968,7 @@ impl ExecutingFrame<'_> { let idx2 = (oparg & 15) as usize; let value1 = self.pop_value(); let value2 = self.pop_value(); - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = self.localsplus.fastlocals_mut(); fastlocals[idx1] = Some(value1); fastlocals[idx2] = Some(value2); Ok(None) @@ -2640,7 +3008,7 @@ impl ExecutingFrame<'_> { self.execute_store_subscript(vm) } Instruction::Swap { i: index } => { - let len = self.state.stack.len(); + let len = self.localsplus.stack_len(); debug_assert!(len > 0, "stack underflow in SWAP"); let i = len - 1; // TOS index let index_val = index.get(arg) as usize; @@ -2653,7 +3021,7 @@ impl ExecutingFrame<'_> { len ); let j = len - index_val; - self.state.stack.swap(i, j); + self.localsplus.stack_swap(i, j); Ok(None) } Instruction::ToBool => { @@ -2677,9 +3045,9 @@ impl ExecutingFrame<'_> { // __exit__ is at TOS-3 (below lasti, prev_exc, and exc) let exc = vm.current_exception(); - let stack_len = self.state.stack.len(); + let stack_len = self.localsplus.stack_len(); let exit = expect_unchecked( - self.state.stack[stack_len - 4].clone(), + self.localsplus.stack_index(stack_len - 4).clone(), "WithExceptStart: __exit__ is NULL", ); @@ -2696,8 +3064,8 @@ impl ExecutingFrame<'_> { } Instruction::YieldValue { arg: oparg } => { debug_assert!( - self.state - .stack + self.localsplus + .stack_as_slice() .iter() .flatten() .all(|sr| !sr.is_borrowed()), @@ -3820,9 +4188,8 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); if nargs == 0 { // Stack: [callable, self_or_null] — peek to get func ptr - let stack = &self.state.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - 1).is_some(); let callable = self.nth_value(1); let callable_tag = callable as *const PyObject as u32; let func = if cached_tag == callable_tag && self_or_null_is_some { @@ -3856,9 +4223,8 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); if nargs == 1 { // Stack: [callable, self_or_null, arg1] - let stack = &self.state.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - 2].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - 2).is_some(); let callable = self.nth_value(2); let callable_tag = callable as *const PyObject as u32; let func = if cached_tag == callable_tag && self_or_null_is_some { @@ -3893,9 +4259,11 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 1); let callable_tag = callable as *const PyObject as u32; - let stack = &self.state.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 1) + .is_some(); let func = if cached_tag == callable_tag && self_or_null_is_some { callable .downcast_ref::() @@ -3944,9 +4312,11 @@ impl ExecutingFrame<'_> { let cached_version = self.code.instructions.read_cache_u32(cache_base + 1); let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 1); - let stack = &self.state.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 1) + .is_some(); if !self_or_null_is_some && cached_version != 0 && let Some(cls) = callable.downcast_ref::() @@ -4006,9 +4376,11 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 1); let callable_tag = callable as *const PyObject as u32; - let stack = &self.state.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 1) + .is_some(); let func = if cached_tag == callable_tag && self_or_null_is_some { callable .downcast_ref::() @@ -4833,8 +5205,8 @@ impl ExecutingFrame<'_> { } Instruction::InstrumentedYieldValue => { debug_assert!( - self.state - .stack + self.localsplus + .stack_as_slice() .iter() .flatten() .all(|sr| !sr.is_borrowed()), @@ -5099,8 +5471,8 @@ impl ExecutingFrame<'_> { // Fire LINE event only if line changed if let Some((loc, _)) = self.code.locations.get(idx) { let line = loc.line.get() as u32; - if line != self.state.prev_line && line > 0 { - self.state.prev_line = line; + if line != *self.prev_line && line > 0 { + *self.prev_line = line; monitoring::fire_line(vm, self.code, offset, line)?; } } @@ -5394,8 +5766,8 @@ impl ExecutingFrame<'_> { } // 1. Pop stack to entry.depth - while self.state.stack.len() > entry.depth as usize { - self.state.stack.pop(); + while self.localsplus.stack_len() > entry.depth as usize { + let _ = self.localsplus.stack_pop(); } // 2. If push_lasti=true (SETUP_CLEANUP), push lasti before exception @@ -5599,7 +5971,7 @@ impl ExecutingFrame<'_> { #[inline] fn execute_call_vectorcall(&mut self, nargs: u32, vm: &VirtualMachine) -> FrameResult { let nargs_usize = nargs as usize; - let stack_len = self.state.stack.len(); + let stack_len = self.localsplus.stack_len(); debug_assert!( stack_len >= nargs_usize + 2, "CALL stack underflow: need callable + self_or_null + {nargs_usize} args, have {stack_len}" @@ -5609,7 +5981,9 @@ impl ExecutingFrame<'_> { let args_start = stack_len - nargs_usize; // Build args: [self?, arg1, ..., argN] - let self_or_null = self.state.stack[self_or_null_idx] + let self_or_null = self + .localsplus + .stack_index_mut(self_or_null_idx) .take() .map(|sr| sr.to_pyobj()); let has_self = self_or_null.is_some(); @@ -5624,12 +5998,22 @@ impl ExecutingFrame<'_> { args_vec.push(self_val); } for stack_idx in args_start..stack_len { - let val = self.state.stack[stack_idx].take().unwrap().to_pyobj(); + let val = self + .localsplus + .stack_index_mut(stack_idx) + .take() + .unwrap() + .to_pyobj(); args_vec.push(val); } - let callable_obj = self.state.stack[callable_idx].take().unwrap().to_pyobj(); - self.state.stack.truncate(callable_idx); + let callable_obj = self + .localsplus + .stack_index_mut(callable_idx) + .take() + .unwrap() + .to_pyobj(); + self.localsplus.stack_truncate(callable_idx); // invoke_vectorcall falls back to FuncArgs if no vectorcall slot let result = callable_obj.vectorcall(args_vec, effective_nargs, None, vm)?; @@ -5650,7 +6034,7 @@ impl ExecutingFrame<'_> { let kw_count = kwarg_names_tuple.len(); debug_assert!(kw_count <= nargs_usize, "CALL_KW kw_count exceeds nargs"); - let stack_len = self.state.stack.len(); + let stack_len = self.localsplus.stack_len(); debug_assert!( stack_len >= nargs_usize + 2, "CALL_KW stack underflow: need callable + self_or_null + {nargs_usize} args, have {stack_len}" @@ -5660,7 +6044,9 @@ impl ExecutingFrame<'_> { let args_start = stack_len - nargs_usize; // Build args: [self?, pos_arg1, ..., pos_argM, kw_val1, ..., kw_valK] - let self_or_null = self.state.stack[self_or_null_idx] + let self_or_null = self + .localsplus + .stack_index_mut(self_or_null_idx) .take() .map(|sr| sr.to_pyobj()); let has_self = self_or_null.is_some(); @@ -5677,12 +6063,22 @@ impl ExecutingFrame<'_> { args_vec.push(self_val); } for stack_idx in args_start..stack_len { - let val = self.state.stack[stack_idx].take().unwrap().to_pyobj(); + let val = self + .localsplus + .stack_index_mut(stack_idx) + .take() + .unwrap() + .to_pyobj(); args_vec.push(val); } - let callable_obj = self.state.stack[callable_idx].take().unwrap().to_pyobj(); - self.state.stack.truncate(callable_idx); + let callable_obj = self + .localsplus + .stack_index_mut(callable_idx) + .take() + .unwrap() + .to_pyobj(); + self.localsplus.stack_truncate(callable_idx); // invoke_vectorcall falls back to FuncArgs if no vectorcall slot let kwnames = kwarg_names_tuple.as_slice(); @@ -5870,7 +6266,7 @@ impl ExecutingFrame<'_> { let mut elements = elements; // Elements on stack from right-to-left: - self.state.stack.extend( + self.localsplus.stack_extend( elements .drain(before + middle..) .rev() @@ -5882,7 +6278,7 @@ impl ExecutingFrame<'_> { self.push_value(t.into()); // Lastly the first reversed values: - self.state.stack.extend( + self.localsplus.stack_extend( elements .into_iter() .rev() @@ -6214,7 +6610,7 @@ impl ExecutingFrame<'_> { Err(vm.new_value_error(msg)) } PyIterReturn::StopIteration(_) => { - self.state.stack.extend( + self.localsplus.stack_extend( elements .into_iter() .rev() @@ -6863,9 +7259,11 @@ impl ExecutingFrame<'_> { // Stack: [callable, self_or_null, arg1, ..., argN] // callable is at position nargs + 1 from top // self_or_null is at position nargs from top - let stack = &self.state.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 1) + .is_some(); let callable = self.nth_value(nargs + 1); if let Some(func) = callable.downcast_ref::() { @@ -7035,9 +7433,11 @@ impl ExecutingFrame<'_> { } // Stack: [callable, self_or_null, arg1, ..., argN, kwarg_names] // callable is at position nargs + 2 from top - let stack = &self.state.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 2].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 2) + .is_some(); let callable = self.nth_value(nargs + 2); if let Some(func) = callable.downcast_ref::() { @@ -7566,7 +7966,7 @@ impl ExecutingFrame<'_> { #[inline] #[track_caller] fn push_stackref_opt(&mut self, obj: Option) { - match self.state.stack.try_push(obj) { + match self.localsplus.stack_try_push(obj) { Ok(()) => {} Err(_e) => self.fatal("tried to push value onto stack but overflowed max_stackdepth"), } @@ -7604,10 +8004,10 @@ impl ExecutingFrame<'_> { /// Pop a raw stackref from the stack, returning None if the stack slot is NULL. #[inline] fn pop_stackref_opt(&mut self) -> Option { - match self.state.stack.pop() { - Some(slot) => slot, - None => self.fatal("tried to pop from empty stack"), + if self.localsplus.stack_is_empty() { + self.fatal("tried to pop from empty stack"); } + self.localsplus.stack_pop() } /// Pop a raw stackref from the stack. Panics if NULL. @@ -7784,7 +8184,7 @@ impl ExecutingFrame<'_> { /// Pop multiple values from the stack. Panics if any slot is NULL. fn pop_multiple(&mut self, count: usize) -> impl ExactSizeIterator + '_ { - let stack_len = self.state.stack.len(); + let stack_len = self.localsplus.stack_len(); if count > stack_len { let instr = self.code.instructions.get(self.lasti() as usize); let op_name = instr @@ -7800,7 +8200,7 @@ impl ExecutingFrame<'_> { self.code.source_path() ); } - self.state.stack.drain(stack_len - count..).map(|obj| { + self.localsplus.stack_drain(stack_len - count).map(|obj| { expect_unchecked(obj, "pop_multiple but null found. This is a compiler bug.").to_pyobj() }) } @@ -7808,7 +8208,7 @@ impl ExecutingFrame<'_> { #[inline] fn replace_top(&mut self, top: Option) -> Option { let mut slot = top.map(PyStackRef::new_owned); - let last = self.state.stack.last_mut().unwrap(); + let last = self.localsplus.stack_last_mut().unwrap(); core::mem::swap(last, &mut slot); slot.map(|sr| sr.to_pyobj()) } @@ -7816,18 +8216,18 @@ impl ExecutingFrame<'_> { #[inline] #[track_caller] fn top_value(&self) -> &PyObject { - match &*self.state.stack { - [.., Some(last)] => last.as_object(), - [.., None] => self.fatal("tried to get top of stack but got NULL"), - [] => self.fatal("tried to get top of stack but stack is empty"), + match self.localsplus.stack_last() { + Some(Some(last)) => last.as_object(), + Some(None) => self.fatal("tried to get top of stack but got NULL"), + None => self.fatal("tried to get top of stack but stack is empty"), } } #[inline] #[track_caller] fn nth_value(&self, depth: u32) -> &PyObject { - let stack = &self.state.stack; - match &stack[stack.len() - depth as usize - 1] { + let idx = self.localsplus.stack_len() - depth as usize - 1; + match self.localsplus.stack_index(idx) { Some(obj) => obj.as_object(), None => unsafe { core::hint::unreachable_unchecked() }, } @@ -7844,21 +8244,26 @@ impl ExecutingFrame<'_> { impl fmt::Debug for Frame { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let state = self.state.lock(); - let stack_str = state.stack.iter().fold(String::new(), |mut s, slot| { - match slot { - Some(elem) if elem.downcastable::() => { - s.push_str("\n > {frame}"); - } - Some(elem) => { - core::fmt::write(&mut s, format_args!("\n > {elem:?}")).unwrap(); - } - None => { - s.push_str("\n > NULL"); + // SAFETY: Debug is best-effort; concurrent mutation is unlikely + // and would only affect debug output. + let localsplus = unsafe { &*self.localsplus.get() }; + let stack_str = localsplus + .stack_as_slice() + .iter() + .fold(String::new(), |mut s, slot| { + match slot { + Some(elem) if elem.downcastable::() => { + s.push_str("\n > {frame}"); + } + Some(elem) => { + core::fmt::write(&mut s, format_args!("\n > {elem:?}")).unwrap(); + } + None => { + s.push_str("\n > NULL"); + } } - } - s - }); + s + }); // TODO: fix this up write!( f, diff --git a/crates/vm/src/gc_state.rs b/crates/vm/src/gc_state.rs index 28cf09f5ff2..a25c8d6c42a 100644 --- a/crates/vm/src/gc_state.rs +++ b/crates/vm/src/gc_state.rs @@ -134,10 +134,13 @@ pub struct GcState { finalized_objects: PyRwLock>, } -// SAFETY: All fields are either inherently Send/Sync (atomics, RwLock, Mutex) or protected by PyMutex. -// PyMutex> is safe to share/send across threads because access is synchronized. -// PyObjectRef itself is Send, and interior mutability is guarded by the mutex. +// SAFETY: GcObjectPtr wraps NonNull which is !Send/!Sync, but we only use it as an opaque +// hash key. PyObjectRef is Send. In threading mode, PyRwLock/PyMutex are parking_lot based +// and genuinely Sync. In non-threading mode, gc_state() is thread-local so neither Send +// nor Sync is needed. +#[cfg(feature = "threading")] unsafe impl Send for GcState {} +#[cfg(feature = "threading")] unsafe impl Sync for GcState {} impl Default for GcState { @@ -827,14 +830,15 @@ impl GcState { } } -use std::sync::OnceLock; - -/// Global GC state instance -/// Using a static because GC needs to be accessible from object allocation/deallocation -static GC_STATE: OnceLock = OnceLock::new(); - -/// Get a reference to the global GC state +/// Get a reference to the GC state. +/// +/// In threading mode this is a true global (OnceLock). +/// In non-threading mode this is thread-local, because PyRwLock/PyMutex +/// use Cell-based locks that are not Sync. pub fn gc_state() -> &'static GcState { + rustpython_common::static_cell! { + static GC_STATE: GcState; + } GC_STATE.get_or_init(GcState::new) } diff --git a/crates/vm/src/lib.rs b/crates/vm/src/lib.rs index df347124f43..21762466f13 100644 --- a/crates/vm/src/lib.rs +++ b/crates/vm/src/lib.rs @@ -51,6 +51,7 @@ mod codecs; pub mod compiler; pub mod convert; mod coroutine; +pub mod datastack; mod dict_inner; #[cfg(feature = "rustpython-compiler")] diff --git a/crates/vm/src/vm/mod.rs b/crates/vm/src/vm/mod.rs index d4e7f5b0536..ef65bf59d65 100644 --- a/crates/vm/src/vm/mod.rs +++ b/crates/vm/src/vm/mod.rs @@ -74,6 +74,9 @@ pub struct VirtualMachine { pub sys_module: PyRef, pub ctx: PyRc, pub frames: RefCell>, + /// Thread-local data stack for bump-allocating frame-local data + /// (localsplus arrays for non-generator frames). + datastack: core::cell::UnsafeCell, pub wasm_id: Option, exceptions: RefCell, pub import_func: PyObjectRef, @@ -172,6 +175,25 @@ pub fn process_hash_secret_seed() -> u32 { } impl VirtualMachine { + /// Bump-allocate `size` bytes from the thread data stack. + /// + /// # Safety + /// The returned pointer must be freed by calling `datastack_pop` in LIFO order. + #[inline(always)] + pub(crate) fn datastack_push(&self, size: usize) -> *mut u8 { + unsafe { (*self.datastack.get()).push(size) } + } + + /// Pop a previous data stack allocation. + /// + /// # Safety + /// `base` must be a pointer returned by `datastack_push` on this VM, + /// and all allocations made after it must already have been popped. + #[inline(always)] + pub(crate) unsafe fn datastack_pop(&self, base: *mut u8) { + unsafe { (*self.datastack.get()).pop(base) } + } + /// Check whether the current thread is the main thread. /// Mirrors `_Py_ThreadCanHandleSignals`. #[allow(dead_code)] @@ -215,6 +237,7 @@ impl VirtualMachine { sys_module, ctx, frames: RefCell::new(vec![]), + datastack: core::cell::UnsafeCell::new(crate::datastack::DataStack::new()), wasm_id: None, exceptions: RefCell::default(), import_func, @@ -591,7 +614,7 @@ impl VirtualMachine { }; let frame = - Frame::new(code, scope, builtins, &[], Some(func_obj), self).into_ref(&self.ctx); + Frame::new(code, scope, builtins, &[], Some(func_obj), false, self).into_ref(&self.ctx); self.run_frame(frame) } diff --git a/crates/vm/src/vm/thread.rs b/crates/vm/src/vm/thread.rs index c164883ddd9..8dd8e0312ee 100644 --- a/crates/vm/src/vm/thread.rs +++ b/crates/vm/src/vm/thread.rs @@ -309,6 +309,7 @@ impl VirtualMachine { sys_module: self.sys_module.clone(), ctx: self.ctx.clone(), frames: RefCell::new(vec![]), + datastack: core::cell::UnsafeCell::new(crate::datastack::DataStack::new()), wasm_id: self.wasm_id.clone(), exceptions: RefCell::default(), import_func: self.import_func.clone(),