From f869c353cd0c379fbc198c33093010d43103c890 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Tue, 3 Mar 2026 21:17:39 +0900 Subject: [PATCH 01/14] Remove PyMutex from Frame, use UnsafeCell fields directly Move stack, cells_frees, prev_line out of the mutex-protected FrameState into Frame as FrameUnsafeCell fields. This eliminates mutex lock/unlock overhead on every frame execution (with_exec). Safety relies on the same single-threaded execution guarantee that FastLocals already uses. --- crates/vm/src/frame.rs | 236 +++++++++++++++++++++++------------------ 1 file changed, 131 insertions(+), 105 deletions(-) diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index c1a45fc10ce..a548b0a7298 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -67,18 +67,8 @@ enum UnwindReason { Raising { exception: PyBaseExceptionRef }, } -#[derive(Debug)] -struct FrameState { - // We need 1 stack per frame - /// The main data frame of the stack machine - stack: BoxVec>, - /// Cell and free variable references (cellvars + freevars). - cells_frees: Box<[PyCellRef]>, - /// Previous line number for LINE event suppression. - /// Stored here (not on ExecutingFrame) so it persists across - /// generator/coroutine suspend and resume. - prev_line: u32, -} +// FrameState fields are stored directly on Frame as UnsafeCell. +// See Frame.stack, Frame.cells_frees, Frame.prev_line. /// Tracks who owns a frame. // = `_PyFrameOwner` @@ -104,19 +94,40 @@ impl FrameOwner { } } -/// Lock-free storage for local variables (localsplus). +/// Lock-free mutable storage for frame-internal data. /// /// # Safety -/// Mutable access is serialized by the frame's state mutex in `with_exec()`. -/// External readers (e.g. 
`f_locals`) must use `try_lock` on the state mutex: -/// if acquired, the frame is not executing and access is exclusive; if not, -/// the caller is on the same thread as `with_exec()` (trace callback) and -/// access is safe because frame execution is single-threaded. +/// Frame execution is single-threaded: only one thread at a time executes +/// a given frame (enforced by the owner field and generator running flag). +/// External readers (e.g. `f_locals`) are on the same thread as execution +/// (trace callback) or the frame is not executing. +struct FrameUnsafeCell(UnsafeCell); + +impl FrameUnsafeCell { + fn new(value: T) -> Self { + Self(UnsafeCell::new(value)) + } + + /// # Safety + /// Caller must ensure no concurrent mutable access. + #[inline(always)] + unsafe fn get(&self) -> *mut T { + self.0.get() + } +} + +// SAFETY: Frame execution is single-threaded. See FrameUnsafeCell doc. +#[cfg(feature = "threading")] +unsafe impl Send for FrameUnsafeCell {} +#[cfg(feature = "threading")] +unsafe impl Sync for FrameUnsafeCell {} + +/// Lock-free storage for local variables (localsplus). pub struct FastLocals { inner: UnsafeCell]>>, } -// SAFETY: Frame execution is serialized by the state mutex. +// SAFETY: Frame execution is single-threaded. See FrameUnsafeCell doc. #[cfg(feature = "threading")] unsafe impl Send for FastLocals {} #[cfg(feature = "threading")] @@ -248,7 +259,13 @@ pub struct Frame { pub lasti: PyAtomic, /// tracer function for this frame (usually is None) pub trace: PyMutex, - state: PyMutex, + + /// The main data frame of the stack machine. + stack: FrameUnsafeCell>>, + /// Cell and free variable references (cellvars + freevars). + cells_frees: FrameUnsafeCell>, + /// Previous line number for LINE event suppression. 
+ prev_line: FrameUnsafeCell, // member pub trace_lines: PyMutex, @@ -284,13 +301,6 @@ impl PyPayload for Frame { } } -unsafe impl Traverse for FrameState { - fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { - self.stack.traverse(tracer_fn); - self.cells_frees.traverse(tracer_fn); - } -} - unsafe impl Traverse for Frame { fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { self.code.traverse(tracer_fn); @@ -300,7 +310,12 @@ unsafe impl Traverse for Frame { self.globals.traverse(tracer_fn); self.builtins.traverse(tracer_fn); self.trace.traverse(tracer_fn); - self.state.traverse(tracer_fn); + // SAFETY: GC traversal does not run concurrently with frame execution + // on the same frame. + unsafe { + (*self.stack.get()).traverse(tracer_fn); + (*self.cells_frees.get()).traverse(tracer_fn); + } self.temporary_refs.traverse(tracer_fn); // generator is a borrowed reference, not traversed } @@ -343,12 +358,7 @@ impl Frame { fastlocals_vec[nlocals + i] = Some(cell.clone().into()); } - let state = FrameState { - stack: BoxVec::new(code.max_stackdepth as usize), - cells_frees, - prev_line: 0, - }; - + let max_stackdepth = code.max_stackdepth as usize; Self { fastlocals: FastLocals::new(fastlocals_vec.into_boxed_slice()), locals: match scope.locals { @@ -363,7 +373,9 @@ impl Frame { code, func_obj, lasti: Radium::new(0), - state: PyMutex::new(state), + stack: FrameUnsafeCell::new(BoxVec::new(max_stackdepth)), + cells_frees: FrameUnsafeCell::new(cells_frees), + prev_line: FrameUnsafeCell::new(0), trace: PyMutex::new(vm.ctx.none()), trace_lines: PyMutex::new(true), trace_opcodes: PyMutex::new(false), @@ -380,9 +392,11 @@ impl Frame { /// Clear evaluation stack and state-owned cell/free references. /// For full local/cell cleanup, call `clear_locals_and_stack()`. 
pub(crate) fn clear_stack_and_cells(&self) { - let mut state = self.state.lock(); - state.stack.clear(); - let _old = core::mem::take(&mut state.cells_frees); + // SAFETY: Called when frame is not executing (generator closed). + unsafe { + (*self.stack.get()).clear(); + let _old = core::mem::take(&mut *self.cells_frees.get()); + } } /// Clear locals and stack after generator/coroutine close. @@ -410,7 +424,8 @@ impl Frame { /// Set cell contents by cell index. Only safe to call before frame execution starts. pub(crate) fn set_cell_contents(&self, cell_idx: usize, value: Option) { - self.state.lock().cells_frees[cell_idx].set(value); + // SAFETY: Called before frame execution starts. + unsafe { (*self.cells_frees.get())[cell_idx].set(value) }; } /// Store a borrowed back-reference to the owning generator/coroutine. @@ -486,10 +501,9 @@ impl Frame { } pub fn locals(&self, vm: &VirtualMachine) -> PyResult { - // Acquire the state mutex to synchronize with frame execution. - // If try_lock fails, the frame is executing on this thread (e.g. - // trace callback accessing f_locals), so fastlocals access is safe. - let _guard = self.state.try_lock(); + // SAFETY: Either the frame is not executing (caller checked owner), + // or we're in a trace callback on the same thread that's executing. + // Same safety argument as FastLocals. let locals = &self.locals; let code = &**self.code; let map = &code.varnames; @@ -534,7 +548,9 @@ impl Frame { impl Py { #[inline(always)] fn with_exec(&self, vm: &VirtualMachine, f: impl FnOnce(ExecutingFrame<'_>) -> R) -> R { - let mut state = self.state.lock(); + // SAFETY: Frame execution is single-threaded. Only one thread at a time + // executes a given frame (enforced by the owner field and generator + // running flag). Same safety argument as FastLocals (UnsafeCell). 
let exec = ExecutingFrame { code: &self.code, fastlocals: &self.fastlocals, @@ -551,7 +567,9 @@ impl Py { }, lasti: &self.lasti, object: self, - state: &mut state, + stack: unsafe { &mut *self.stack.get() }, + cells_frees: unsafe { &mut *self.cells_frees.get() }, + prev_line: unsafe { &mut *self.prev_line.get() }, monitoring_mask: 0, }; f(exec) @@ -586,9 +604,13 @@ impl Py { } pub fn yield_from_target(&self) -> Option { - // Use try_lock to avoid deadlock when the frame is currently executing. - // A running coroutine has no yield-from target. - let mut state = self.state.try_lock()?; + // If the frame is currently executing (owned by thread), it has no + // yield-from target to report. + let owner = FrameOwner::from_i8(self.owner.load(atomic::Ordering::Acquire)); + if owner == FrameOwner::Thread { + return None; + } + // SAFETY: Frame is not executing, so UnsafeCell access is safe. let exec = ExecutingFrame { code: &self.code, fastlocals: &self.fastlocals, @@ -598,7 +620,9 @@ impl Py { builtins_dict: None, lasti: &self.lasti, object: self, - state: &mut state, + stack: unsafe { &mut *self.stack.get() }, + cells_frees: unsafe { &mut *self.cells_frees.get() }, + prev_line: unsafe { &mut *self.prev_line.get() }, monitoring_mask: 0, }; exec.yield_from_target().map(PyObject::to_owned) @@ -638,7 +662,9 @@ struct ExecutingFrame<'a> { builtins_dict: Option<&'a PyExact>, object: &'a Py, lasti: &'a PyAtomic, - state: &'a mut FrameState, + stack: &'a mut BoxVec>, + cells_frees: &'a mut Box<[PyCellRef]>, + prev_line: &'a mut u32, /// Cached monitoring events mask. 
Reloaded at Resume instruction only, monitoring_mask: u32, } @@ -647,8 +673,7 @@ impl fmt::Debug for ExecutingFrame<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("ExecutingFrame") .field("code", self.code) - // .field("scope", self.scope) - .field("state", self.state) + .field("stack_len", &self.stack.len()) .finish() } } @@ -733,9 +758,9 @@ impl ExecutingFrame<'_> { Instruction::Resume { .. } | Instruction::InstrumentedResume ) && let Some((loc, _)) = self.code.locations.get(idx) - && loc.line.get() as u32 != self.state.prev_line + && loc.line.get() as u32 != *self.prev_line { - self.state.prev_line = loc.line.get() as u32; + *self.prev_line = loc.line.get() as u32; vm.trace_event(crate::protocol::TraceEvent::Line, None)?; // Trace callback may have changed lasti via set_f_lineno. // Re-read and restart the loop from the new position. @@ -767,7 +792,7 @@ impl ExecutingFrame<'_> { | Instruction::InstrumentedLine ) && let Some((loc, _)) = self.code.locations.get(idx) { - self.state.prev_line = loc.line.get() as u32; + *self.prev_line = loc.line.get() as u32; } // Fire 'opcode' trace event for sys.settrace when f_trace_opcodes @@ -987,7 +1012,7 @@ impl ExecutingFrame<'_> { // 3. Stack top is the delegate (receiver) // // First check if stack is empty - if so, we can't be in yield-from - if self.state.stack.is_empty() { + if self.stack.is_empty() { return None; } let lasti = self.lasti() as usize; @@ -1032,7 +1057,7 @@ impl ExecutingFrame<'_> { // In CPython, _Py_call_instrumentation_line has a special case // for RESUME: it fires LINE even when prev_line == current_line. // Since gen_throw bypasses RESUME, we reset prev_line instead. - self.state.prev_line = 0; + *self.prev_line = 0; if let Some(jen) = self.yield_from_target() { // Check if the exception is GeneratorExit (type or instance). // For GeneratorExit, close the sub-iterator instead of throwing. 
@@ -1453,9 +1478,9 @@ impl ExecutingFrame<'_> { // CopyItem { index: 2 } copies second from top // This is 1-indexed to match CPython let idx = index.get(arg) as usize; - let stack_len = self.state.stack.len(); + let stack_len = self.stack.len(); debug_assert!(stack_len >= idx, "CopyItem: stack underflow"); - let value = self.state.stack[stack_len - idx].clone(); + let value = self.stack[stack_len - idx].clone(); self.push_stackref_opt(value); Ok(None) } @@ -1465,7 +1490,7 @@ impl ExecutingFrame<'_> { } Instruction::DeleteAttr { namei: idx } => self.delete_attr(vm, idx.get(arg)), Instruction::DeleteDeref { i } => { - self.state.cells_frees[i.get(arg) as usize].set(None); + self.cells_frees[i.get(arg) as usize].set(None); Ok(None) } Instruction::DeleteFast { var_num: idx } => { @@ -1645,7 +1670,7 @@ impl ExecutingFrame<'_> { } Instruction::GetANext => { #[cfg(debug_assertions)] // remove when GetANext is fully implemented - let orig_stack_len = self.state.stack.len(); + let orig_stack_len = self.stack.len(); let aiter = self.top_value(); let awaitable = if aiter.class().is(vm.ctx.types.async_generator) { @@ -1685,7 +1710,7 @@ impl ExecutingFrame<'_> { }; self.push_value(awaitable); #[cfg(debug_assertions)] - debug_assert_eq!(orig_stack_len + 1, self.state.stack.len()); + debug_assert_eq!(orig_stack_len + 1, self.stack.len()); Ok(None) } Instruction::GetAwaitable { r#where: oparg } => { @@ -1900,7 +1925,7 @@ impl ExecutingFrame<'_> { }; self.push_value(match value { Some(v) => v, - None => self.state.cells_frees[i] + None => self.cells_frees[i] .get() .ok_or_else(|| self.unbound_cell_exception(i, vm))?, }); @@ -1957,7 +1982,7 @@ impl ExecutingFrame<'_> { } Instruction::LoadDeref { i } => { let idx = i.get(arg) as usize; - let x = self.state.cells_frees[idx] + let x = self.cells_frees[idx] .get() .ok_or_else(|| self.unbound_cell_exception(idx, vm))?; self.push_value(x); @@ -2574,7 +2599,7 @@ impl ExecutingFrame<'_> { } Instruction::StoreDeref { i } => { let value = 
self.pop_value(); - self.state.cells_frees[i.get(arg) as usize].set(Some(value)); + self.cells_frees[i.get(arg) as usize].set(Some(value)); Ok(None) } Instruction::StoreFast { var_num: idx } => { @@ -2640,7 +2665,7 @@ impl ExecutingFrame<'_> { self.execute_store_subscript(vm) } Instruction::Swap { i: index } => { - let len = self.state.stack.len(); + let len = self.stack.len(); debug_assert!(len > 0, "stack underflow in SWAP"); let i = len - 1; // TOS index let index_val = index.get(arg) as usize; @@ -2653,7 +2678,7 @@ impl ExecutingFrame<'_> { len ); let j = len - index_val; - self.state.stack.swap(i, j); + self.stack.swap(i, j); Ok(None) } Instruction::ToBool => { @@ -2677,9 +2702,9 @@ impl ExecutingFrame<'_> { // __exit__ is at TOS-3 (below lasti, prev_exc, and exc) let exc = vm.current_exception(); - let stack_len = self.state.stack.len(); + let stack_len = self.stack.len(); let exit = expect_unchecked( - self.state.stack[stack_len - 4].clone(), + self.stack[stack_len - 4].clone(), "WithExceptStart: __exit__ is NULL", ); @@ -2696,8 +2721,7 @@ impl ExecutingFrame<'_> { } Instruction::YieldValue { arg: oparg } => { debug_assert!( - self.state - .stack + self.stack .iter() .flatten() .all(|sr| !sr.is_borrowed()), @@ -3820,7 +3844,7 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); if nargs == 0 { // Stack: [callable, self_or_null] — peek to get func ptr - let stack = &self.state.stack; + let stack = &self.stack; let stack_len = stack.len(); let self_or_null_is_some = stack[stack_len - 1].is_some(); let callable = self.nth_value(1); @@ -3856,7 +3880,7 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); if nargs == 1 { // Stack: [callable, self_or_null, arg1] - let stack = &self.state.stack; + let stack = &self.stack; let stack_len = stack.len(); let self_or_null_is_some = stack[stack_len - 2].is_some(); let callable = self.nth_value(2); @@ -3893,7 +3917,7 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 
1); let callable_tag = callable as *const PyObject as u32; - let stack = &self.state.stack; + let stack = &self.stack; let stack_len = stack.len(); let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); let func = if cached_tag == callable_tag && self_or_null_is_some { @@ -4006,7 +4030,7 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 1); let callable_tag = callable as *const PyObject as u32; - let stack = &self.state.stack; + let stack = &self.stack; let stack_len = stack.len(); let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); let func = if cached_tag == callable_tag && self_or_null_is_some { @@ -4833,8 +4857,7 @@ impl ExecutingFrame<'_> { } Instruction::InstrumentedYieldValue => { debug_assert!( - self.state - .stack + self.stack .iter() .flatten() .all(|sr| !sr.is_borrowed()), @@ -5099,8 +5122,8 @@ impl ExecutingFrame<'_> { // Fire LINE event only if line changed if let Some((loc, _)) = self.code.locations.get(idx) { let line = loc.line.get() as u32; - if line != self.state.prev_line && line > 0 { - self.state.prev_line = line; + if line != *self.prev_line && line > 0 { + *self.prev_line = line; monitoring::fire_line(vm, self.code, offset, line)?; } } @@ -5394,8 +5417,8 @@ impl ExecutingFrame<'_> { } // 1. Pop stack to entry.depth - while self.state.stack.len() > entry.depth as usize { - self.state.stack.pop(); + while self.stack.len() > entry.depth as usize { + self.stack.pop(); } // 2. 
If push_lasti=true (SETUP_CLEANUP), push lasti before exception @@ -5599,7 +5622,7 @@ impl ExecutingFrame<'_> { #[inline] fn execute_call_vectorcall(&mut self, nargs: u32, vm: &VirtualMachine) -> FrameResult { let nargs_usize = nargs as usize; - let stack_len = self.state.stack.len(); + let stack_len = self.stack.len(); debug_assert!( stack_len >= nargs_usize + 2, "CALL stack underflow: need callable + self_or_null + {nargs_usize} args, have {stack_len}" @@ -5609,7 +5632,7 @@ impl ExecutingFrame<'_> { let args_start = stack_len - nargs_usize; // Build args: [self?, arg1, ..., argN] - let self_or_null = self.state.stack[self_or_null_idx] + let self_or_null = self.stack[self_or_null_idx] .take() .map(|sr| sr.to_pyobj()); let has_self = self_or_null.is_some(); @@ -5624,12 +5647,12 @@ impl ExecutingFrame<'_> { args_vec.push(self_val); } for stack_idx in args_start..stack_len { - let val = self.state.stack[stack_idx].take().unwrap().to_pyobj(); + let val = self.stack[stack_idx].take().unwrap().to_pyobj(); args_vec.push(val); } - let callable_obj = self.state.stack[callable_idx].take().unwrap().to_pyobj(); - self.state.stack.truncate(callable_idx); + let callable_obj = self.stack[callable_idx].take().unwrap().to_pyobj(); + self.stack.truncate(callable_idx); // invoke_vectorcall falls back to FuncArgs if no vectorcall slot let result = callable_obj.vectorcall(args_vec, effective_nargs, None, vm)?; @@ -5650,7 +5673,7 @@ impl ExecutingFrame<'_> { let kw_count = kwarg_names_tuple.len(); debug_assert!(kw_count <= nargs_usize, "CALL_KW kw_count exceeds nargs"); - let stack_len = self.state.stack.len(); + let stack_len = self.stack.len(); debug_assert!( stack_len >= nargs_usize + 2, "CALL_KW stack underflow: need callable + self_or_null + {nargs_usize} args, have {stack_len}" @@ -5659,8 +5682,9 @@ impl ExecutingFrame<'_> { let self_or_null_idx = stack_len - nargs_usize - 1; let args_start = stack_len - nargs_usize; + // Build args: [self?, pos_arg1, ..., pos_argM, kw_val1, ..., 
kw_valK] - let self_or_null = self.state.stack[self_or_null_idx] + let self_or_null = self.stack[self_or_null_idx] .take() .map(|sr| sr.to_pyobj()); let has_self = self_or_null.is_some(); @@ -5677,12 +5701,12 @@ impl ExecutingFrame<'_> { args_vec.push(self_val); } for stack_idx in args_start..stack_len { - let val = self.state.stack[stack_idx].take().unwrap().to_pyobj(); + let val = self.stack[stack_idx].take().unwrap().to_pyobj(); args_vec.push(val); } - let callable_obj = self.state.stack[callable_idx].take().unwrap().to_pyobj(); - self.state.stack.truncate(callable_idx); + let callable_obj = self.stack[callable_idx].take().unwrap().to_pyobj(); + self.stack.truncate(callable_idx); // invoke_vectorcall falls back to FuncArgs if no vectorcall slot let kwnames = kwarg_names_tuple.as_slice(); @@ -5870,7 +5894,7 @@ impl ExecutingFrame<'_> { let mut elements = elements; // Elements on stack from right-to-left: - self.state.stack.extend( + self.stack.extend( elements .drain(before + middle..) 
.rev() @@ -5882,7 +5906,7 @@ impl ExecutingFrame<'_> { self.push_value(t.into()); // Lastly the first reversed values: - self.state.stack.extend( + self.stack.extend( elements .into_iter() .rev() @@ -6214,7 +6238,7 @@ impl ExecutingFrame<'_> { Err(vm.new_value_error(msg)) } PyIterReturn::StopIteration(_) => { - self.state.stack.extend( + self.stack.extend( elements .into_iter() .rev() @@ -6863,7 +6887,7 @@ impl ExecutingFrame<'_> { // Stack: [callable, self_or_null, arg1, ..., argN] // callable is at position nargs + 1 from top // self_or_null is at position nargs from top - let stack = &self.state.stack; + let stack = &self.stack; let stack_len = stack.len(); let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); let callable = self.nth_value(nargs + 1); @@ -7035,7 +7059,7 @@ impl ExecutingFrame<'_> { } // Stack: [callable, self_or_null, arg1, ..., argN, kwarg_names] // callable is at position nargs + 2 from top - let stack = &self.state.stack; + let stack = &self.stack; let stack_len = stack.len(); let self_or_null_is_some = stack[stack_len - nargs as usize - 2].is_some(); let callable = self.nth_value(nargs + 2); @@ -7566,7 +7590,7 @@ impl ExecutingFrame<'_> { #[inline] #[track_caller] fn push_stackref_opt(&mut self, obj: Option) { - match self.state.stack.try_push(obj) { + match self.stack.try_push(obj) { Ok(()) => {} Err(_e) => self.fatal("tried to push value onto stack but overflowed max_stackdepth"), } @@ -7604,7 +7628,7 @@ impl ExecutingFrame<'_> { /// Pop a raw stackref from the stack, returning None if the stack slot is NULL. #[inline] fn pop_stackref_opt(&mut self) -> Option { - match self.state.stack.pop() { + match self.stack.pop() { Some(slot) => slot, None => self.fatal("tried to pop from empty stack"), } @@ -7784,7 +7808,7 @@ impl ExecutingFrame<'_> { /// Pop multiple values from the stack. Panics if any slot is NULL. 
fn pop_multiple(&mut self, count: usize) -> impl ExactSizeIterator + '_ { - let stack_len = self.state.stack.len(); + let stack_len = self.stack.len(); if count > stack_len { let instr = self.code.instructions.get(self.lasti() as usize); let op_name = instr @@ -7800,7 +7824,7 @@ impl ExecutingFrame<'_> { self.code.source_path() ); } - self.state.stack.drain(stack_len - count..).map(|obj| { + self.stack.drain(stack_len - count..).map(|obj| { expect_unchecked(obj, "pop_multiple but null found. This is a compiler bug.").to_pyobj() }) } @@ -7808,7 +7832,7 @@ impl ExecutingFrame<'_> { #[inline] fn replace_top(&mut self, top: Option) -> Option { let mut slot = top.map(PyStackRef::new_owned); - let last = self.state.stack.last_mut().unwrap(); + let last = self.stack.last_mut().unwrap(); core::mem::swap(last, &mut slot); slot.map(|sr| sr.to_pyobj()) } @@ -7816,7 +7840,7 @@ impl ExecutingFrame<'_> { #[inline] #[track_caller] fn top_value(&self) -> &PyObject { - match &*self.state.stack { + match &**self.stack { [.., Some(last)] => last.as_object(), [.., None] => self.fatal("tried to get top of stack but got NULL"), [] => self.fatal("tried to get top of stack but stack is empty"), @@ -7826,7 +7850,7 @@ impl ExecutingFrame<'_> { #[inline] #[track_caller] fn nth_value(&self, depth: u32) -> &PyObject { - let stack = &self.state.stack; + let stack = &self.stack; match &stack[stack.len() - depth as usize - 1] { Some(obj) => obj.as_object(), None => unsafe { core::hint::unreachable_unchecked() }, @@ -7844,8 +7868,10 @@ impl ExecutingFrame<'_> { impl fmt::Debug for Frame { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let state = self.state.lock(); - let stack_str = state.stack.iter().fold(String::new(), |mut s, slot| { + // SAFETY: Debug is best-effort; concurrent mutation is unlikely + // and would only affect debug output. 
+ let stack = unsafe { &*self.stack.get() }; + let stack_str = stack.iter().fold(String::new(), |mut s, slot| { match slot { Some(elem) if elem.downcastable::() => { s.push_str("\n > {frame}"); From 85df41e708fbcf514324849dca3e0e712f4ceaab Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Tue, 3 Mar 2026 21:17:49 +0900 Subject: [PATCH 02/14] Add thread-local DataStack for bump-allocating frame data Introduce DataStack with linked chunks (16KB initial, doubling) and push/pop bump allocation. Add datastack field to VirtualMachine. Not yet wired to frame creation. --- crates/vm/src/datastack.rs | 247 +++++++++++++++++++++++++++++++++++++ crates/vm/src/lib.rs | 1 + crates/vm/src/vm/mod.rs | 4 + crates/vm/src/vm/thread.rs | 1 + 4 files changed, 253 insertions(+) create mode 100644 crates/vm/src/datastack.rs diff --git a/crates/vm/src/datastack.rs b/crates/vm/src/datastack.rs new file mode 100644 index 00000000000..2b2ceed9c23 --- /dev/null +++ b/crates/vm/src/datastack.rs @@ -0,0 +1,247 @@ +/// Thread data stack for interpreter frames. +/// +/// A linked list of chunks providing bump allocation for frame-local data +/// (localsplus arrays). Mirrors `_PyStackChunk` / `tstate->datastack_*` +/// from CPython. +/// +/// Normal function calls allocate from the data stack via `push()` (pointer +/// bump). Generators and coroutines use heap-allocated storage instead. +use alloc::alloc::{Layout, alloc, dealloc}; +use core::ptr; + +/// Minimum chunk size in bytes (16 KB, matching CPython `_PY_DATA_STACK_CHUNK_SIZE`). +const MIN_CHUNK_SIZE: usize = 16 * 1024; + +/// Extra headroom (in bytes) to avoid allocating a new chunk for the next +/// frame right after growing. +const MINIMUM_OVERHEAD: usize = 1000 * core::mem::size_of::(); + +/// Alignment for all data stack allocations. +const ALIGN: usize = 16; + +/// Header for a data stack chunk. The usable data region starts right after +/// this header (aligned to `ALIGN`). 
+#[repr(C)] +struct DataStackChunk { + /// Previous chunk in the linked list (NULL for the root chunk). + previous: *mut DataStackChunk, + /// Total allocation size in bytes (including this header). + size: usize, + /// Saved `top` offset when a newer chunk was pushed. Used to restore + /// `DataStack::top` when popping back to this chunk. + saved_top: usize, +} + +impl DataStackChunk { + /// Pointer to the first usable byte after the header (aligned). + #[inline(always)] + fn data_start(&self) -> *mut u8 { + let header_end = (self as *const Self as usize) + core::mem::size_of::(); + let aligned = (header_end + ALIGN - 1) & !(ALIGN - 1); + aligned as *mut u8 + } + + /// Pointer past the last usable byte. + #[inline(always)] + fn data_limit(&self) -> *mut u8 { + unsafe { (self as *const Self as *mut u8).add(self.size) } + } +} + +/// Per-thread data stack for bump-allocating frame-local data. +pub struct DataStack { + /// Current chunk. + chunk: *mut DataStackChunk, + /// Current allocation position within the current chunk. + top: *mut u8, + /// End of usable space in the current chunk. + limit: *mut u8, +} + +impl DataStack { + /// Create a new data stack with an initial root chunk. + pub fn new() -> Self { + let chunk = Self::alloc_chunk(MIN_CHUNK_SIZE, ptr::null_mut()); + let top = unsafe { (*chunk).data_start() }; + let limit = unsafe { (*chunk).data_limit() }; + // Skip one ALIGN-sized slot in the root chunk so that `pop()` never + // frees it (same trick as CPython's `push_chunk`). + let top = unsafe { top.add(ALIGN) }; + Self { chunk, top, limit } + } + + /// Check if the current chunk has at least `size` bytes available. + #[inline(always)] + pub fn has_space(&self, size: usize) -> bool { + let aligned_size = (size + ALIGN - 1) & !(ALIGN - 1); + (self.limit as usize).saturating_sub(self.top as usize) >= aligned_size + } + + /// Allocate `size` bytes from the data stack. + /// + /// Returns a pointer to the allocated region (aligned to `ALIGN`). 
+ /// The caller must call `pop()` with the returned pointer when done + /// (LIFO order). + #[inline(always)] + pub fn push(&mut self, size: usize) -> *mut u8 { + let aligned_size = (size + ALIGN - 1) & !(ALIGN - 1); + unsafe { + if self.top.add(aligned_size) <= self.limit { + let ptr = self.top; + self.top = self.top.add(aligned_size); + ptr + } else { + self.push_slow(aligned_size) + } + } + } + + /// Slow path: allocate a new chunk and push from it. + #[cold] + #[inline(never)] + fn push_slow(&mut self, aligned_size: usize) -> *mut u8 { + let mut chunk_size = MIN_CHUNK_SIZE; + let needed = aligned_size + MINIMUM_OVERHEAD + core::mem::size_of::() + ALIGN; + while chunk_size < needed { + chunk_size *= 2; + } + // Save current position in old chunk. + unsafe { + (*self.chunk).saved_top = self.top as usize - self.chunk as usize; + } + let new_chunk = Self::alloc_chunk(chunk_size, self.chunk); + self.chunk = new_chunk; + let start = unsafe { (*new_chunk).data_start() }; + self.limit = unsafe { (*new_chunk).data_limit() }; + self.top = unsafe { start.add(aligned_size) }; + start + } + + /// Pop a previous allocation. `base` must be the pointer returned by + /// `push()`. Calls must be in LIFO order. + /// + /// # Safety + /// `base` must be a valid pointer returned by `push()` on this data stack, + /// and all allocations made after it must already have been popped. + #[inline(always)] + pub unsafe fn pop(&mut self, base: *mut u8) { + debug_assert!(!base.is_null()); + let chunk_start = unsafe { (*self.chunk).data_start() }; + if base >= chunk_start { + // Common case: base is within the current chunk. + self.top = base; + } else { + // base is in a previous chunk — free the current chunk. + unsafe { self.pop_slow(base) }; + } + } + + /// Slow path: pop back to a previous chunk. 
+ #[cold] + #[inline(never)] + unsafe fn pop_slow(&mut self, base: *mut u8) { + let old_chunk = self.chunk; + let prev = unsafe { (*old_chunk).previous }; + debug_assert!( + !prev.is_null(), + "tried to pop past the root chunk" + ); + unsafe { Self::free_chunk(old_chunk) }; + self.chunk = prev; + self.top = base; + // If the base is not within this previous chunk either, we have + // a logic error (LIFO violation). + debug_assert!( + base >= unsafe { (*prev).data_start() } && base <= unsafe { (*prev).data_limit() }, + "pop base not in previous chunk (LIFO violation)" + ); + self.limit = unsafe { (*prev).data_limit() }; + } + + /// Allocate a new chunk. + fn alloc_chunk(size: usize, previous: *mut DataStackChunk) -> *mut DataStackChunk { + let layout = Layout::from_size_align(size, ALIGN).expect("invalid chunk layout"); + let ptr = unsafe { alloc(layout) }; + if ptr.is_null() { + alloc::alloc::handle_alloc_error(layout); + } + let chunk = ptr as *mut DataStackChunk; + unsafe { + (*chunk).previous = previous; + (*chunk).size = size; + (*chunk).saved_top = 0; + } + chunk + } + + /// Free a chunk. + unsafe fn free_chunk(chunk: *mut DataStackChunk) { + let size = unsafe { (*chunk).size }; + let layout = Layout::from_size_align(size, ALIGN).expect("invalid chunk layout"); + unsafe { dealloc(chunk as *mut u8, layout) }; + } +} + +// SAFETY: DataStack is per-thread and not shared. The raw pointers +// it contains point to memory exclusively owned by this DataStack. 
+unsafe impl Send for DataStack {} + +impl Default for DataStack { + fn default() -> Self { + Self::new() + } +} + +impl Drop for DataStack { + fn drop(&mut self) { + let mut chunk = self.chunk; + while !chunk.is_null() { + let prev = unsafe { (*chunk).previous }; + unsafe { Self::free_chunk(chunk) }; + chunk = prev; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn basic_push_pop() { + let mut ds = DataStack::new(); + let p1 = ds.push(64); + assert!(!p1.is_null()); + let p2 = ds.push(128); + assert!(!p2.is_null()); + assert!(p2 > p1); + unsafe { + ds.pop(p2); + ds.pop(p1); + } + } + + #[test] + fn cross_chunk_push_pop() { + let mut ds = DataStack::new(); + // Push enough to force a new chunk + let mut ptrs = Vec::new(); + for _ in 0..100 { + ptrs.push(ds.push(1024)); + } + // Pop all in reverse + for p in ptrs.into_iter().rev() { + unsafe { ds.pop(p) }; + } + } + + #[test] + fn alignment() { + let mut ds = DataStack::new(); + for size in [1, 7, 15, 16, 17, 31, 32, 33, 64, 100] { + let p = ds.push(size); + assert_eq!(p as usize % ALIGN, 0, "alignment violated for size {size}"); + unsafe { ds.pop(p) }; + } + } +} diff --git a/crates/vm/src/lib.rs b/crates/vm/src/lib.rs index df347124f43..21762466f13 100644 --- a/crates/vm/src/lib.rs +++ b/crates/vm/src/lib.rs @@ -51,6 +51,7 @@ mod codecs; pub mod compiler; pub mod convert; mod coroutine; +pub mod datastack; mod dict_inner; #[cfg(feature = "rustpython-compiler")] diff --git a/crates/vm/src/vm/mod.rs b/crates/vm/src/vm/mod.rs index d4e7f5b0536..4b3cd880db6 100644 --- a/crates/vm/src/vm/mod.rs +++ b/crates/vm/src/vm/mod.rs @@ -74,6 +74,9 @@ pub struct VirtualMachine { pub sys_module: PyRef, pub ctx: PyRc, pub frames: RefCell>, + /// Thread-local data stack for bump-allocating frame-local data + /// (localsplus arrays for non-generator frames). 
+ pub datastack: crate::datastack::DataStack, pub wasm_id: Option, exceptions: RefCell, pub import_func: PyObjectRef, @@ -215,6 +218,7 @@ impl VirtualMachine { sys_module, ctx, frames: RefCell::new(vec![]), + datastack: crate::datastack::DataStack::new(), wasm_id: None, exceptions: RefCell::default(), import_func, diff --git a/crates/vm/src/vm/thread.rs b/crates/vm/src/vm/thread.rs index c164883ddd9..1dee8bba285 100644 --- a/crates/vm/src/vm/thread.rs +++ b/crates/vm/src/vm/thread.rs @@ -309,6 +309,7 @@ impl VirtualMachine { sys_module: self.sys_module.clone(), ctx: self.ctx.clone(), frames: RefCell::new(vec![]), + datastack: crate::datastack::DataStack::new(), wasm_id: self.wasm_id.clone(), exceptions: RefCell::default(), import_func: self.import_func.clone(), From e286597ac2e99f7de68fb0b3cc09bd810c05ef09 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Tue, 3 Mar 2026 21:57:16 +0900 Subject: [PATCH 03/14] Unify FastLocals and BoxVec stack into LocalsPlus Replace separate FastLocals (Box<[Option]>) and BoxVec> with a single LocalsPlus struct that stores both in a contiguous Box<[usize]> array. The first nlocalsplus slots are fastlocals and the rest is the evaluation stack. Typed access is provided through transmute-based methods. Remove BoxVec import from frame.rs. --- crates/vm/src/builtins/frame.rs | 2 +- crates/vm/src/builtins/function.rs | 6 +- crates/vm/src/builtins/super.rs | 2 +- crates/vm/src/frame.rs | 452 +++++++++++++++++++++-------- 4 files changed, 338 insertions(+), 124 deletions(-) diff --git a/crates/vm/src/builtins/frame.rs b/crates/vm/src/builtins/frame.rs index 04abe4772d7..4a8549239e7 100644 --- a/crates/vm/src/builtins/frame.rs +++ b/crates/vm/src/builtins/frame.rs @@ -692,7 +692,7 @@ impl Py { // Clear fastlocals // SAFETY: Frame is not executing (detached or stopped). 
{ - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = unsafe { self.fastlocals_mut() }; for slot in fastlocals.iter_mut() { *slot = None; } diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs index 316f739c9c9..3fd7dd5418f 100644 --- a/crates/vm/src/builtins/function.rs +++ b/crates/vm/src/builtins/function.rs @@ -236,7 +236,7 @@ impl PyFunction { // https://github.com/python/cpython/blob/main/Python/ceval.c#L3681 // SAFETY: Frame was just created and not yet executing. - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; let mut args_iter = func_args.args.into_iter(); @@ -671,7 +671,7 @@ impl Py { // Move args directly into fastlocals (no clone/refcount needed) { - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..)) { *slot = Some(arg); } @@ -679,7 +679,7 @@ impl Py { // Handle cell2arg if let Some(cell2arg) = code.cell2arg.as_deref() { - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) { let x = fastlocals[*arg_idx as usize].take(); frame.set_cell_contents(cell_idx, x); diff --git a/crates/vm/src/builtins/super.rs b/crates/vm/src/builtins/super.rs index 4fb81b6e591..01bdfa6749e 100644 --- a/crates/vm/src/builtins/super.rs +++ b/crates/vm/src/builtins/super.rs @@ -86,7 +86,7 @@ impl Initializer for PySuper { return Err(vm.new_runtime_error("super(): no arguments")); } // SAFETY: Frame is current and not concurrently mutated. 
- let obj = unsafe { frame.fastlocals.borrow() }[0] + let obj = unsafe { frame.fastlocals() }[0] .clone() .or_else(|| { if let Some(cell2arg) = frame.code.cell2arg.as_deref() { diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index a548b0a7298..4eb554647f4 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -46,7 +46,6 @@ use malachite_bigint::BigInt; use num_traits::Zero; use rustpython_common::atomic::{PyAtomic, Radium}; use rustpython_common::{ - boxvec::BoxVec, lock::{OnceCell, PyMutex}, wtf8::{Wtf8, Wtf8Buf, wtf8_concat}, }; @@ -122,46 +121,256 @@ unsafe impl Send for FrameUnsafeCell {} #[cfg(feature = "threading")] unsafe impl Sync for FrameUnsafeCell {} -/// Lock-free storage for local variables (localsplus). -pub struct FastLocals { - inner: UnsafeCell]>>, +/// Unified storage for local variables and evaluation stack. +/// +/// Memory layout (each slot is `usize`-sized): +/// `[0..nlocalsplus)` — fastlocals (`Option`) +/// `[nlocalsplus..nlocalsplus+stack_top)` — active evaluation stack (`Option`) +/// `[nlocalsplus+stack_top..capacity)` — unused stack capacity +/// +/// Both `Option` and `Option` are `usize`-sized +/// (niche optimization on NonNull / NonZeroUsize). The raw storage is +/// `usize` to unify them; typed access is provided through methods. +pub struct LocalsPlus { + /// Raw storage. `0` represents `None` for both types. + data: Box<[usize]>, + /// Number of fastlocals slots (nlocals + ncells + nfrees). + nlocalsplus: u32, + /// Current evaluation stack depth. + stack_top: u32, } -// SAFETY: Frame execution is single-threaded. See FrameUnsafeCell doc. 
-#[cfg(feature = "threading")] -unsafe impl Send for FastLocals {} -#[cfg(feature = "threading")] -unsafe impl Sync for FastLocals {} +const _: () = { + assert!(core::mem::size_of::>() == core::mem::size_of::()); + // PyStackRef size is checked in object/core.rs +}; -impl FastLocals { - fn new(data: Box<[Option]>) -> Self { +impl LocalsPlus { + /// Create a new LocalsPlus with `nlocalsplus` fastlocals slots + /// and `stacksize` evaluation stack capacity. All slots start as None (0). + fn new(nlocalsplus: usize, stacksize: usize) -> Self { + let capacity = nlocalsplus + stacksize; Self { - inner: UnsafeCell::new(data), + data: vec![0usize; capacity].into_boxed_slice(), + nlocalsplus: nlocalsplus as u32, + stack_top: 0, } } - /// # Safety - /// Caller must ensure exclusive access (frame state locked or frame - /// not executing). + /// Total capacity (fastlocals + stack). #[inline(always)] - pub unsafe fn borrow(&self) -> &[Option] { - unsafe { &*self.inner.get() } + fn capacity(&self) -> usize { + self.data.len() } - /// # Safety - /// Caller must ensure exclusive mutable access. + /// Stack capacity (max stack depth). #[inline(always)] - #[allow(clippy::mut_from_ref)] - pub unsafe fn borrow_mut(&self) -> &mut [Option] { - unsafe { &mut *self.inner.get() } + fn stack_capacity(&self) -> usize { + self.capacity() - self.nlocalsplus as usize + } + + // -- Fastlocals access -- + + /// Immutable access to fastlocals as `Option` slice. + #[inline(always)] + fn fastlocals(&self) -> &[Option] { + let ptr = self.data.as_ptr() as *const Option; + unsafe { core::slice::from_raw_parts(ptr, self.nlocalsplus as usize) } + } + + /// Mutable access to fastlocals as `Option` slice. + #[inline(always)] + fn fastlocals_mut(&mut self) -> &mut [Option] { + let ptr = self.data.as_mut_ptr() as *mut Option; + unsafe { core::slice::from_raw_parts_mut(ptr, self.nlocalsplus as usize) } + } + + // -- Stack access -- + + /// Current stack depth. 
+ #[inline(always)] + fn stack_len(&self) -> usize { + self.stack_top as usize + } + + /// Whether the stack is empty. + #[inline(always)] + fn stack_is_empty(&self) -> bool { + self.stack_top == 0 + } + + /// Push a value onto the evaluation stack. + #[inline(always)] + fn stack_push(&mut self, val: Option) { + let idx = self.nlocalsplus as usize + self.stack_top as usize; + debug_assert!(idx < self.capacity(), "stack overflow: stack_top={}, capacity={}", self.stack_top, self.stack_capacity()); + // SAFETY: Option is usize-sized. We transfer ownership + // by writing the raw bits and forgetting the original value. + self.data[idx] = unsafe { core::mem::transmute::, usize>(val) }; + self.stack_top += 1; + } + + /// Try to push; returns Err if stack is full. + #[inline(always)] + fn stack_try_push(&mut self, val: Option) -> Result<(), Option> { + let idx = self.nlocalsplus as usize + self.stack_top as usize; + if idx >= self.capacity() { + return Err(val); + } + self.data[idx] = unsafe { core::mem::transmute::, usize>(val) }; + self.stack_top += 1; + Ok(()) + } + + /// Pop a value from the evaluation stack. + #[inline(always)] + fn stack_pop(&mut self) -> Option { + debug_assert!(self.stack_top > 0, "stack underflow"); + self.stack_top -= 1; + let idx = self.nlocalsplus as usize + self.stack_top as usize; + let raw = core::mem::replace(&mut self.data[idx], 0); + unsafe { core::mem::transmute::>(raw) } + } + + /// Immutable view of the active stack as `Option` slice. + #[inline(always)] + fn stack_as_slice(&self) -> &[Option] { + let base = self.nlocalsplus as usize; + let ptr = unsafe { (self.data.as_ptr().add(base)) as *const Option }; + unsafe { core::slice::from_raw_parts(ptr, self.stack_top as usize) } + } + + /// Get a reference to a stack slot by index from the bottom. 
+ #[inline(always)] + fn stack_index(&self, idx: usize) -> &Option { + debug_assert!(idx < self.stack_top as usize); + let raw_idx = self.nlocalsplus as usize + idx; + unsafe { &*(self.data.as_ptr().add(raw_idx) as *const Option) } + } + + /// Get a mutable reference to a stack slot by index from the bottom. + #[inline(always)] + fn stack_index_mut(&mut self, idx: usize) -> &mut Option { + debug_assert!(idx < self.stack_top as usize); + let raw_idx = self.nlocalsplus as usize + idx; + unsafe { &mut *(self.data.as_mut_ptr().add(raw_idx) as *mut Option) } + } + + /// Get the last stack element (top of stack). + #[inline(always)] + fn stack_last(&self) -> Option<&Option> { + if self.stack_top == 0 { + None + } else { + Some(self.stack_index(self.stack_top as usize - 1)) + } + } + + /// Get mutable reference to the last stack element. + #[inline(always)] + fn stack_last_mut(&mut self) -> Option<&mut Option> { + if self.stack_top == 0 { + None + } else { + let idx = self.stack_top as usize - 1; + Some(self.stack_index_mut(idx)) + } + } + + /// Swap two stack elements. + #[inline(always)] + fn stack_swap(&mut self, a: usize, b: usize) { + let base = self.nlocalsplus as usize; + self.data.swap(base + a, base + b); + } + + /// Clear the stack, dropping all values. + fn stack_clear(&mut self) { + while self.stack_top > 0 { + let _ = self.stack_pop(); + } + } + + /// Drain stack elements from `from` to the end, returning an iterator + /// that yields `Option` in forward order and shrinks the stack. + fn stack_drain(&mut self, from: usize) -> impl ExactSizeIterator> + '_ { + let end = self.stack_top as usize; + debug_assert!(from <= end); + // Reduce stack_top now; the drain iterator owns the elements. + self.stack_top = from as u32; + LocalsPlusStackDrain { + localsplus: self, + current: from, + end, + } + } + + /// Extend the stack with values from an iterator. 
+ fn stack_extend(&mut self, iter: impl Iterator>) { + for val in iter { + self.stack_push(val); + } + } +} + +/// Iterator for draining stack elements in forward order. +struct LocalsPlusStackDrain<'a> { + localsplus: &'a mut LocalsPlus, + /// Current read position (stack-relative index). + current: usize, + /// End position (exclusive, stack-relative index). + end: usize, +} + +impl Iterator for LocalsPlusStackDrain<'_> { + type Item = Option; + + fn next(&mut self) -> Option { + if self.current >= self.end { + return None; + } + let idx = self.localsplus.nlocalsplus as usize + self.current; + let raw = core::mem::replace(&mut self.localsplus.data[idx], 0); + self.current += 1; + Some(unsafe { core::mem::transmute::>(raw) }) + } + + fn size_hint(&self) -> (usize, Option) { + let remaining = self.end - self.current; + (remaining, Some(remaining)) + } +} + +impl ExactSizeIterator for LocalsPlusStackDrain<'_> {} + +impl Drop for LocalsPlusStackDrain<'_> { + fn drop(&mut self) { + // Drop any remaining elements that weren't consumed. + while self.current < self.end { + let idx = self.localsplus.nlocalsplus as usize + self.current; + let raw = core::mem::replace(&mut self.localsplus.data[idx], 0); + let _ = unsafe { core::mem::transmute::>(raw) }; + self.current += 1; + } + } +} + +impl Drop for LocalsPlus { + fn drop(&mut self) { + // Drop active stack entries as Option. + self.stack_clear(); + // Drop fastlocals entries as Option. + let fastlocals = self.fastlocals_mut(); + for slot in fastlocals.iter_mut() { + let _ = slot.take(); + } } } -unsafe impl Traverse for FastLocals { - fn traverse(&self, traverse_fn: &mut TraverseFn<'_>) { - // SAFETY: GC runs on the same thread; no concurrent mutation. 
- let data = unsafe { &*self.inner.get() }; - data.traverse(traverse_fn); +unsafe impl Traverse for LocalsPlus { + fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { + self.fastlocals().traverse(tracer_fn); + self.stack_as_slice().traverse(tracer_fn); } } @@ -250,7 +459,8 @@ pub struct Frame { pub code: PyRef, pub func_obj: Option, - pub fastlocals: FastLocals, + /// Unified storage for local variables and evaluation stack. + localsplus: FrameUnsafeCell, pub locals: FrameLocals, pub globals: PyDictRef, pub builtins: PyObjectRef, @@ -260,8 +470,6 @@ pub struct Frame { /// tracer function for this frame (usually is None) pub trace: PyMutex, - /// The main data frame of the stack machine. - stack: FrameUnsafeCell>>, /// Cell and free variable references (cellvars + freevars). cells_frees: FrameUnsafeCell>, /// Previous line number for LINE event suppression. @@ -305,19 +513,16 @@ unsafe impl Traverse for Frame { fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { self.code.traverse(tracer_fn); self.func_obj.traverse(tracer_fn); - self.fastlocals.traverse(tracer_fn); + // SAFETY: GC traversal does not run concurrently with frame execution. + unsafe { + (*self.localsplus.get()).traverse(tracer_fn); + (*self.cells_frees.get()).traverse(tracer_fn); + } self.locals.traverse(tracer_fn); self.globals.traverse(tracer_fn); self.builtins.traverse(tracer_fn); self.trace.traverse(tracer_fn); - // SAFETY: GC traversal does not run concurrently with frame execution - // on the same frame. 
- unsafe { - (*self.stack.get()).traverse(tracer_fn); - (*self.cells_frees.get()).traverse(tracer_fn); - } self.temporary_refs.traverse(tracer_fn); - // generator is a borrowed reference, not traversed } } @@ -349,18 +554,18 @@ impl Frame { .chain(closure.iter().cloned()) .collect(); - // Extend fastlocals to include varnames + cellvars + freevars (localsplus) - let total_locals = nlocals + num_cells + nfrees; - let mut fastlocals_vec: Vec> = vec![None; total_locals]; + // Create unified localsplus: varnames + cellvars + freevars + stack + let nlocalsplus = nlocals + num_cells + nfrees; + let max_stackdepth = code.max_stackdepth as usize; + let mut localsplus = LocalsPlus::new(nlocalsplus, max_stackdepth); // Store cell objects at cellvars and freevars positions for (i, cell) in cells_frees.iter().enumerate() { - fastlocals_vec[nlocals + i] = Some(cell.clone().into()); + localsplus.fastlocals_mut()[nlocals + i] = Some(cell.clone().into()); } - let max_stackdepth = code.max_stackdepth as usize; Self { - fastlocals: FastLocals::new(fastlocals_vec.into_boxed_slice()), + localsplus: FrameUnsafeCell::new(localsplus), locals: match scope.locals { Some(locals) => FrameLocals::with_locals(locals), None if code.flags.contains(bytecode::CodeFlags::NEWLOCALS) => FrameLocals::lazy(), @@ -373,7 +578,6 @@ impl Frame { code, func_obj, lasti: Radium::new(0), - stack: FrameUnsafeCell::new(BoxVec::new(max_stackdepth)), cells_frees: FrameUnsafeCell::new(cells_frees), prev_line: FrameUnsafeCell::new(0), trace: PyMutex::new(vm.ctx.none()), @@ -389,12 +593,32 @@ impl Frame { } } + /// Access fastlocals immutably. + /// + /// # Safety + /// Caller must ensure no concurrent mutable access (frame not executing, + /// or called from the same thread during trace callback). + #[inline(always)] + pub unsafe fn fastlocals(&self) -> &[Option] { + unsafe { (*self.localsplus.get()).fastlocals() } + } + + /// Access fastlocals mutably. 
+ /// + /// # Safety + /// Caller must ensure exclusive access (frame not executing). + #[inline(always)] + #[allow(clippy::mut_from_ref)] + pub unsafe fn fastlocals_mut(&self) -> &mut [Option] { + unsafe { (*self.localsplus.get()).fastlocals_mut() } + } + /// Clear evaluation stack and state-owned cell/free references. /// For full local/cell cleanup, call `clear_locals_and_stack()`. pub(crate) fn clear_stack_and_cells(&self) { // SAFETY: Called when frame is not executing (generator closed). unsafe { - (*self.stack.get()).clear(); + (*self.localsplus.get()).stack_clear(); let _old = core::mem::take(&mut *self.cells_frees.get()); } } @@ -404,7 +628,7 @@ impl Frame { pub(crate) fn clear_locals_and_stack(&self) { self.clear_stack_and_cells(); // SAFETY: Frame is not executing (generator closed). - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = unsafe { (*self.localsplus.get()).fastlocals_mut() }; for slot in fastlocals.iter_mut() { *slot = None; } @@ -414,7 +638,7 @@ impl Frame { pub(crate) fn get_cell_contents(&self, cell_idx: usize) -> Option { let nlocals = self.code.varnames.len(); // SAFETY: Frame not executing; no concurrent mutation. - let fastlocals = unsafe { self.fastlocals.borrow() }; + let fastlocals = unsafe { (*self.localsplus.get()).fastlocals() }; fastlocals .get(nlocals + cell_idx) .and_then(|slot| slot.as_ref()) @@ -484,7 +708,7 @@ impl Frame { } let code = &**self.code; // SAFETY: Called before generator resume; no concurrent access. - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = unsafe { (*self.localsplus.get()).fastlocals_mut() }; let locals_map = self.locals.mapping(vm); for (i, &varname) in code.varnames.iter().enumerate() { if i >= fastlocals.len() { @@ -503,16 +727,13 @@ impl Frame { pub fn locals(&self, vm: &VirtualMachine) -> PyResult { // SAFETY: Either the frame is not executing (caller checked owner), // or we're in a trace callback on the same thread that's executing. 
- // Same safety argument as FastLocals. let locals = &self.locals; let code = &**self.code; let map = &code.varnames; let j = core::cmp::min(map.len(), code.varnames.len()); let locals_map = locals.mapping(vm); if !code.varnames.is_empty() { - // SAFETY: Either _guard holds the state mutex (frame not executing), - // or we're in a trace callback on the same thread that holds it. - let fastlocals = unsafe { self.fastlocals.borrow() }; + let fastlocals = unsafe { (*self.localsplus.get()).fastlocals() }; for (&k, v) in zip(&map[..j], fastlocals) { match locals_map.ass_subscript(k, v.clone(), vm) { Ok(()) => {} @@ -553,7 +774,7 @@ impl Py { // running flag). Same safety argument as FastLocals (UnsafeCell). let exec = ExecutingFrame { code: &self.code, - fastlocals: &self.fastlocals, + localsplus: unsafe { &mut *self.localsplus.get() }, locals: &self.locals, globals: &self.globals, builtins: &self.builtins, @@ -567,7 +788,6 @@ impl Py { }, lasti: &self.lasti, object: self, - stack: unsafe { &mut *self.stack.get() }, cells_frees: unsafe { &mut *self.cells_frees.get() }, prev_line: unsafe { &mut *self.prev_line.get() }, monitoring_mask: 0, @@ -613,14 +833,13 @@ impl Py { // SAFETY: Frame is not executing, so UnsafeCell access is safe. 
let exec = ExecutingFrame { code: &self.code, - fastlocals: &self.fastlocals, + localsplus: unsafe { &mut *self.localsplus.get() }, locals: &self.locals, globals: &self.globals, builtins: &self.builtins, builtins_dict: None, lasti: &self.lasti, object: self, - stack: unsafe { &mut *self.stack.get() }, cells_frees: unsafe { &mut *self.cells_frees.get() }, prev_line: unsafe { &mut *self.prev_line.get() }, monitoring_mask: 0, @@ -651,7 +870,7 @@ impl Py { /// with the mutable data inside struct ExecutingFrame<'a> { code: &'a PyRef, - fastlocals: &'a FastLocals, + localsplus: &'a mut LocalsPlus, locals: &'a FrameLocals, globals: &'a PyDictRef, builtins: &'a PyObjectRef, @@ -662,7 +881,6 @@ struct ExecutingFrame<'a> { builtins_dict: Option<&'a PyExact>, object: &'a Py, lasti: &'a PyAtomic, - stack: &'a mut BoxVec>, cells_frees: &'a mut Box<[PyCellRef]>, prev_line: &'a mut u32, /// Cached monitoring events mask. Reloaded at Resume instruction only, @@ -673,7 +891,7 @@ impl fmt::Debug for ExecutingFrame<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("ExecutingFrame") .field("code", self.code) - .field("stack_len", &self.stack.len()) + .field("stack_len", &self.localsplus.stack_len()) .finish() } } @@ -1012,7 +1230,7 @@ impl ExecutingFrame<'_> { // 3. 
Stack top is the delegate (receiver) // // First check if stack is empty - if so, we can't be in yield-from - if self.stack.is_empty() { + if self.localsplus.stack_is_empty() { return None; } let lasti = self.lasti() as usize; @@ -1478,9 +1696,9 @@ impl ExecutingFrame<'_> { // CopyItem { index: 2 } copies second from top // This is 1-indexed to match CPython let idx = index.get(arg) as usize; - let stack_len = self.stack.len(); + let stack_len = self.localsplus.stack_len(); debug_assert!(stack_len >= idx, "CopyItem: stack underflow"); - let value = self.stack[stack_len - idx].clone(); + let value = self.localsplus.stack_index(stack_len - idx).clone(); self.push_stackref_opt(value); Ok(None) } @@ -1494,7 +1712,7 @@ impl ExecutingFrame<'_> { Ok(None) } Instruction::DeleteFast { var_num: idx } => { - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = self.localsplus.fastlocals_mut(); let idx = idx.get(arg) as usize; if fastlocals[idx].is_none() { return Err(vm.new_exception_msg( @@ -1670,7 +1888,7 @@ impl ExecutingFrame<'_> { } Instruction::GetANext => { #[cfg(debug_assertions)] // remove when GetANext is fully implemented - let orig_stack_len = self.stack.len(); + let orig_stack_len = self.localsplus.stack_len(); let aiter = self.top_value(); let awaitable = if aiter.class().is(vm.ctx.types.async_generator) { @@ -1710,7 +1928,7 @@ impl ExecutingFrame<'_> { }; self.push_value(awaitable); #[cfg(debug_assertions)] - debug_assert_eq!(orig_stack_len + 1, self.stack.len()); + debug_assert_eq!(orig_stack_len + 1, self.localsplus.stack_len()); Ok(None) } Instruction::GetAwaitable { r#where: oparg } => { @@ -2000,7 +2218,7 @@ impl ExecutingFrame<'_> { ) } let idx = idx.get(arg) as usize; - let x = unsafe { self.fastlocals.borrow() }[idx] + let x = self.localsplus.fastlocals()[idx] .clone() .ok_or_else(|| reference_error(self.code.varnames[idx], vm))?; self.push_value(x); @@ -2010,7 +2228,7 @@ impl ExecutingFrame<'_> { // Load value and clear the slot 
(for inlined comprehensions) // If slot is empty, push None (not an error - variable may not exist yet) let idx = idx.get(arg) as usize; - let x = unsafe { self.fastlocals.borrow_mut() }[idx] + let x = self.localsplus.fastlocals_mut()[idx] .take() .unwrap_or_else(|| vm.ctx.none()); self.push_value(x); @@ -2020,7 +2238,7 @@ impl ExecutingFrame<'_> { // Same as LoadFast but explicitly checks for unbound locals // (LoadFast in RustPython already does this check) let idx = idx.get(arg) as usize; - let x = unsafe { self.fastlocals.borrow() }[idx] + let x = self.localsplus.fastlocals()[idx] .clone() .ok_or_else(|| { vm.new_exception_msg( @@ -2041,7 +2259,7 @@ impl ExecutingFrame<'_> { let oparg = packed.get(arg); let idx1 = (oparg >> 4) as usize; let idx2 = (oparg & 15) as usize; - let fastlocals = unsafe { self.fastlocals.borrow() }; + let fastlocals = self.localsplus.fastlocals(); let x1 = fastlocals[idx1].clone().ok_or_else(|| { vm.new_exception_msg( vm.ctx.exceptions.unbound_local_error.to_owned(), @@ -2071,7 +2289,7 @@ impl ExecutingFrame<'_> { // lifetime issues at yield/exception points are resolved. 
Instruction::LoadFastBorrow { var_num: idx } => { let idx = idx.get(arg) as usize; - let x = unsafe { self.fastlocals.borrow() }[idx] + let x = self.localsplus.fastlocals()[idx] .clone() .ok_or_else(|| { vm.new_exception_msg( @@ -2090,7 +2308,7 @@ impl ExecutingFrame<'_> { let oparg = packed.get(arg); let idx1 = (oparg >> 4) as usize; let idx2 = (oparg & 15) as usize; - let fastlocals = unsafe { self.fastlocals.borrow() }; + let fastlocals = self.localsplus.fastlocals(); let x1 = fastlocals[idx1].clone().ok_or_else(|| { vm.new_exception_msg( vm.ctx.exceptions.unbound_local_error.to_owned(), @@ -2604,13 +2822,13 @@ impl ExecutingFrame<'_> { } Instruction::StoreFast { var_num: idx } => { let value = self.pop_value(); - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = self.localsplus.fastlocals_mut(); fastlocals[idx.get(arg) as usize] = Some(value); Ok(None) } Instruction::StoreFastLoadFast { var_nums } => { let value = self.pop_value(); - let locals = unsafe { self.fastlocals.borrow_mut() }; + let locals = self.localsplus.fastlocals_mut(); let oparg = var_nums.get(arg); locals[oparg.store_idx() as usize] = Some(value); let load_value = locals[oparg.load_idx() as usize] @@ -2625,7 +2843,7 @@ impl ExecutingFrame<'_> { let idx2 = (oparg & 15) as usize; let value1 = self.pop_value(); let value2 = self.pop_value(); - let fastlocals = unsafe { self.fastlocals.borrow_mut() }; + let fastlocals = self.localsplus.fastlocals_mut(); fastlocals[idx1] = Some(value1); fastlocals[idx2] = Some(value2); Ok(None) @@ -2665,7 +2883,7 @@ impl ExecutingFrame<'_> { self.execute_store_subscript(vm) } Instruction::Swap { i: index } => { - let len = self.stack.len(); + let len = self.localsplus.stack_len(); debug_assert!(len > 0, "stack underflow in SWAP"); let i = len - 1; // TOS index let index_val = index.get(arg) as usize; @@ -2678,7 +2896,7 @@ impl ExecutingFrame<'_> { len ); let j = len - index_val; - self.stack.swap(i, j); + self.localsplus.stack_swap(i, j); 
Ok(None) } Instruction::ToBool => { @@ -2702,9 +2920,9 @@ impl ExecutingFrame<'_> { // __exit__ is at TOS-3 (below lasti, prev_exc, and exc) let exc = vm.current_exception(); - let stack_len = self.stack.len(); + let stack_len = self.localsplus.stack_len(); let exit = expect_unchecked( - self.stack[stack_len - 4].clone(), + self.localsplus.stack_index(stack_len - 4).clone(), "WithExceptStart: __exit__ is NULL", ); @@ -2721,7 +2939,8 @@ impl ExecutingFrame<'_> { } Instruction::YieldValue { arg: oparg } => { debug_assert!( - self.stack + self.localsplus + .stack_as_slice() .iter() .flatten() .all(|sr| !sr.is_borrowed()), @@ -3844,9 +4063,8 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); if nargs == 0 { // Stack: [callable, self_or_null] — peek to get func ptr - let stack = &self.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - 1).is_some(); let callable = self.nth_value(1); let callable_tag = callable as *const PyObject as u32; let func = if cached_tag == callable_tag && self_or_null_is_some { @@ -3880,9 +4098,8 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); if nargs == 1 { // Stack: [callable, self_or_null, arg1] - let stack = &self.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - 2].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - 2).is_some(); let callable = self.nth_value(2); let callable_tag = callable as *const PyObject as u32; let func = if cached_tag == callable_tag && self_or_null_is_some { @@ -3917,9 +4134,8 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 1); let callable_tag = callable as *const PyObject as u32; - let stack = &self.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len 
- nargs as usize - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 1).is_some(); let func = if cached_tag == callable_tag && self_or_null_is_some { callable .downcast_ref::() @@ -4030,9 +4246,8 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 1); let callable_tag = callable as *const PyObject as u32; - let stack = &self.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 1).is_some(); let func = if cached_tag == callable_tag && self_or_null_is_some { callable .downcast_ref::() @@ -4857,7 +5072,8 @@ impl ExecutingFrame<'_> { } Instruction::InstrumentedYieldValue => { debug_assert!( - self.stack + self.localsplus + .stack_as_slice() .iter() .flatten() .all(|sr| !sr.is_borrowed()), @@ -5417,8 +5633,8 @@ impl ExecutingFrame<'_> { } // 1. Pop stack to entry.depth - while self.stack.len() > entry.depth as usize { - self.stack.pop(); + while self.localsplus.stack_len() > entry.depth as usize { + let _ = self.localsplus.stack_pop(); } // 2. If push_lasti=true (SETUP_CLEANUP), push lasti before exception @@ -5894,7 +6110,7 @@ impl ExecutingFrame<'_> { let mut elements = elements; // Elements on stack from right-to-left: - self.stack.extend( + self.localsplus.stack_extend( elements .drain(before + middle..) 
.rev() @@ -5906,7 +6122,7 @@ impl ExecutingFrame<'_> { self.push_value(t.into()); // Lastly the first reversed values: - self.stack.extend( + self.localsplus.stack_extend( elements .into_iter() .rev() @@ -6238,7 +6454,7 @@ impl ExecutingFrame<'_> { Err(vm.new_value_error(msg)) } PyIterReturn::StopIteration(_) => { - self.stack.extend( + self.localsplus.stack_extend( elements .into_iter() .rev() @@ -6887,9 +7103,8 @@ impl ExecutingFrame<'_> { // Stack: [callable, self_or_null, arg1, ..., argN] // callable is at position nargs + 1 from top // self_or_null is at position nargs from top - let stack = &self.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 1).is_some(); let callable = self.nth_value(nargs + 1); if let Some(func) = callable.downcast_ref::() { @@ -7059,9 +7274,8 @@ impl ExecutingFrame<'_> { } // Stack: [callable, self_or_null, arg1, ..., argN, kwarg_names] // callable is at position nargs + 2 from top - let stack = &self.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 2].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 2).is_some(); let callable = self.nth_value(nargs + 2); if let Some(func) = callable.downcast_ref::() { @@ -7590,7 +7804,7 @@ impl ExecutingFrame<'_> { #[inline] #[track_caller] fn push_stackref_opt(&mut self, obj: Option) { - match self.stack.try_push(obj) { + match self.localsplus.stack_try_push(obj) { Ok(()) => {} Err(_e) => self.fatal("tried to push value onto stack but overflowed max_stackdepth"), } @@ -7628,10 +7842,10 @@ impl ExecutingFrame<'_> { /// Pop a raw stackref from the stack, returning None if the stack slot is NULL. 
#[inline] fn pop_stackref_opt(&mut self) -> Option { - match self.stack.pop() { - Some(slot) => slot, - None => self.fatal("tried to pop from empty stack"), + if self.localsplus.stack_is_empty() { + self.fatal("tried to pop from empty stack"); } + self.localsplus.stack_pop() } /// Pop a raw stackref from the stack. Panics if NULL. @@ -7808,7 +8022,7 @@ impl ExecutingFrame<'_> { /// Pop multiple values from the stack. Panics if any slot is NULL. fn pop_multiple(&mut self, count: usize) -> impl ExactSizeIterator + '_ { - let stack_len = self.stack.len(); + let stack_len = self.localsplus.stack_len(); if count > stack_len { let instr = self.code.instructions.get(self.lasti() as usize); let op_name = instr @@ -7824,7 +8038,7 @@ impl ExecutingFrame<'_> { self.code.source_path() ); } - self.stack.drain(stack_len - count..).map(|obj| { + self.localsplus.stack_drain(stack_len - count).map(|obj| { expect_unchecked(obj, "pop_multiple but null found. This is a compiler bug.").to_pyobj() }) } @@ -7832,7 +8046,7 @@ impl ExecutingFrame<'_> { #[inline] fn replace_top(&mut self, top: Option) -> Option { let mut slot = top.map(PyStackRef::new_owned); - let last = self.stack.last_mut().unwrap(); + let last = self.localsplus.stack_last_mut().unwrap(); core::mem::swap(last, &mut slot); slot.map(|sr| sr.to_pyobj()) } @@ -7840,18 +8054,18 @@ impl ExecutingFrame<'_> { #[inline] #[track_caller] fn top_value(&self) -> &PyObject { - match &**self.stack { - [.., Some(last)] => last.as_object(), - [.., None] => self.fatal("tried to get top of stack but got NULL"), - [] => self.fatal("tried to get top of stack but stack is empty"), + match self.localsplus.stack_last() { + Some(Some(last)) => last.as_object(), + Some(None) => self.fatal("tried to get top of stack but got NULL"), + None => self.fatal("tried to get top of stack but stack is empty"), } } #[inline] #[track_caller] fn nth_value(&self, depth: u32) -> &PyObject { - let stack = &self.stack; - match &stack[stack.len() - depth as usize - 
1] { + let idx = self.localsplus.stack_len() - depth as usize - 1; + match self.localsplus.stack_index(idx) { Some(obj) => obj.as_object(), None => unsafe { core::hint::unreachable_unchecked() }, } @@ -7870,8 +8084,8 @@ impl fmt::Debug for Frame { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // SAFETY: Debug is best-effort; concurrent mutation is unlikely // and would only affect debug output. - let stack = unsafe { &*self.stack.get() }; - let stack_str = stack.iter().fold(String::new(), |mut s, slot| { + let localsplus = unsafe { &*self.localsplus.get() }; + let stack_str = localsplus.stack_as_slice().iter().fold(String::new(), |mut s, slot| { match slot { Some(elem) if elem.downcastable::() => { s.push_str("\n > {frame}"); From 9dbe95d7ea80f8fa9e39299c9bdd3d4473ac1971 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Tue, 3 Mar 2026 22:26:56 +0900 Subject: [PATCH 04/14] Use DataStack for LocalsPlus in non-generator function calls Normal function calls now bump-allocate LocalsPlus data from the per-thread DataStack instead of a separate heap allocation. Generator/coroutine frames continue using heap allocation since they outlive the call. On frame exit, data is copied to the heap (materialize_to_heap) to preserve locals for tracebacks, then the DataStack is popped. VirtualMachine.datastack is wrapped in UnsafeCell for interior mutability (safe because frame allocation is single-threaded LIFO). 
--- crates/vm/src/builtins/function.rs | 22 ++- crates/vm/src/datastack.rs | 8 +- crates/vm/src/frame.rs | 224 +++++++++++++++++++++++------ crates/vm/src/vm/mod.rs | 25 +++- crates/vm/src/vm/thread.rs | 2 +- 5 files changed, 223 insertions(+), 58 deletions(-) diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs index 3fd7dd5418f..870e1a1e569 100644 --- a/crates/vm/src/builtins/function.rs +++ b/crates/vm/src/builtins/function.rs @@ -562,6 +562,7 @@ impl Py { let is_gen = code.flags.contains(bytecode::CodeFlags::GENERATOR); let is_coro = code.flags.contains(bytecode::CodeFlags::COROUTINE); + let use_datastack = !(is_gen || is_coro); // Construct frame: let frame = Frame::new( @@ -570,6 +571,7 @@ impl Py { self.builtins.clone(), self.closure.as_ref().map_or(&[], |c| c.as_slice()), Some(self.to_owned().into()), + use_datastack, vm, ) .into_ref(&vm.ctx); @@ -594,7 +596,16 @@ impl Py { frame.set_generator(&obj); Ok(obj) } - (false, false) => vm.run_frame(frame), + (false, false) => { + let result = vm.run_frame(frame.clone()); + // Release data stack memory after frame execution completes. 
+ unsafe { + if let Some(base) = frame.materialize_localsplus() { + vm.datastack_pop(base); + } + } + result + } } } @@ -665,6 +676,7 @@ impl Py { self.builtins.clone(), self.closure.as_ref().map_or(&[], |c| c.as_slice()), Some(self.to_owned().into()), + true, // Always use datastack (invoke_exact_args is never gen/coro) vm, ) .into_ref(&vm.ctx); @@ -686,7 +698,13 @@ impl Py { } } - vm.run_frame(frame) + let result = vm.run_frame(frame.clone()); + unsafe { + if let Some(base) = frame.materialize_localsplus() { + vm.datastack_pop(base); + } + } + result } } diff --git a/crates/vm/src/datastack.rs b/crates/vm/src/datastack.rs index 2b2ceed9c23..f492673e6d8 100644 --- a/crates/vm/src/datastack.rs +++ b/crates/vm/src/datastack.rs @@ -101,7 +101,8 @@ impl DataStack { #[inline(never)] fn push_slow(&mut self, aligned_size: usize) -> *mut u8 { let mut chunk_size = MIN_CHUNK_SIZE; - let needed = aligned_size + MINIMUM_OVERHEAD + core::mem::size_of::() + ALIGN; + let needed = + aligned_size + MINIMUM_OVERHEAD + core::mem::size_of::() + ALIGN; while chunk_size < needed { chunk_size *= 2; } @@ -142,10 +143,7 @@ impl DataStack { unsafe fn pop_slow(&mut self, base: *mut u8) { let old_chunk = self.chunk; let prev = unsafe { (*old_chunk).previous }; - debug_assert!( - !prev.is_null(), - "tried to pop past the root chunk" - ); + debug_assert!(!prev.is_null(), "tried to pop past the root chunk"); unsafe { Self::free_chunk(old_chunk) }; self.chunk = prev; self.top = base; diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 4eb554647f4..698e0c4ca90 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -132,35 +132,118 @@ unsafe impl Sync for FrameUnsafeCell {} /// (niche optimization on NonNull / NonZeroUsize). The raw storage is /// `usize` to unify them; typed access is provided through methods. pub struct LocalsPlus { - /// Raw storage. `0` represents `None` for both types. - data: Box<[usize]>, + /// Backing storage. 
+ data: LocalsPlusData, /// Number of fastlocals slots (nlocals + ncells + nfrees). nlocalsplus: u32, /// Current evaluation stack depth. stack_top: u32, } +enum LocalsPlusData { + /// Heap-allocated storage (generators, coroutines, exec/eval frames). + Heap(Box<[usize]>), + /// Data stack allocated storage (normal function calls). + /// The pointer is valid while the enclosing data stack frame is alive. + DataStack { ptr: *mut usize, capacity: usize }, +} + +// SAFETY: DataStack variant points to thread-local DataStack memory. +// Frame execution is single-threaded (enforced by owner field). +#[cfg(feature = "threading")] +unsafe impl Send for LocalsPlusData {} +#[cfg(feature = "threading")] +unsafe impl Sync for LocalsPlusData {} + const _: () = { assert!(core::mem::size_of::>() == core::mem::size_of::()); // PyStackRef size is checked in object/core.rs }; impl LocalsPlus { - /// Create a new LocalsPlus with `nlocalsplus` fastlocals slots - /// and `stacksize` evaluation stack capacity. All slots start as None (0). + /// Create a new heap-backed LocalsPlus. All slots start as None (0). fn new(nlocalsplus: usize, stacksize: usize) -> Self { let capacity = nlocalsplus + stacksize; Self { - data: vec![0usize; capacity].into_boxed_slice(), + data: LocalsPlusData::Heap(vec![0usize; capacity].into_boxed_slice()), + nlocalsplus: nlocalsplus as u32, + stack_top: 0, + } + } + + /// Create a new LocalsPlus backed by the thread data stack. + /// All slots are zero-initialized. + /// + /// The caller must call `release_datastack()` when the frame finishes + /// to drop values and return the base pointer for `DataStack::pop()`. + fn new_on_datastack(nlocalsplus: usize, stacksize: usize, vm: &VirtualMachine) -> Self { + let capacity = nlocalsplus + stacksize; + let byte_size = capacity * core::mem::size_of::(); + let ptr = vm.datastack_push(byte_size) as *mut usize; + // Zero-initialize all slots (0 = None for both PyObjectRef and PyStackRef). 
+ unsafe { core::ptr::write_bytes(ptr, 0, capacity) }; + Self { + data: LocalsPlusData::DataStack { ptr, capacity }, nlocalsplus: nlocalsplus as u32, stack_top: 0, } } + /// Migrate data-stack-backed storage to the heap, preserving all values. + /// Returns the data stack base pointer for `DataStack::pop()`. + /// Returns `None` if already heap-backed. + fn materialize_to_heap(&mut self) -> Option<*mut u8> { + if let LocalsPlusData::DataStack { ptr, capacity } = &self.data { + let base = *ptr as *mut u8; + // Copy all data to a heap allocation (preserves locals for tracebacks). + let heap_data = unsafe { core::slice::from_raw_parts(*ptr, *capacity) } + .to_vec() + .into_boxed_slice(); + self.data = LocalsPlusData::Heap(heap_data); + Some(base) + } else { + None + } + } + + /// Drop all contained values without freeing the backing storage. + fn drop_values(&mut self) { + self.stack_clear(); + let fastlocals = self.fastlocals_mut(); + for slot in fastlocals.iter_mut() { + let _ = slot.take(); + } + } + + // -- Data access helpers -- + + #[inline(always)] + fn data_as_slice(&self) -> &[usize] { + match &self.data { + LocalsPlusData::Heap(b) => b, + LocalsPlusData::DataStack { ptr, capacity } => unsafe { + core::slice::from_raw_parts(*ptr, *capacity) + }, + } + } + + #[inline(always)] + fn data_as_mut_slice(&mut self) -> &mut [usize] { + match &mut self.data { + LocalsPlusData::Heap(b) => b, + LocalsPlusData::DataStack { ptr, capacity } => unsafe { + core::slice::from_raw_parts_mut(*ptr, *capacity) + }, + } + } + /// Total capacity (fastlocals + stack). #[inline(always)] fn capacity(&self) -> usize { - self.data.len() + match &self.data { + LocalsPlusData::Heap(b) => b.len(), + LocalsPlusData::DataStack { capacity, .. } => *capacity, + } } /// Stack capacity (max stack depth). @@ -174,15 +257,18 @@ impl LocalsPlus { /// Immutable access to fastlocals as `Option` slice. 
#[inline(always)] fn fastlocals(&self) -> &[Option] { - let ptr = self.data.as_ptr() as *const Option; + let data = self.data_as_slice(); + let ptr = data.as_ptr() as *const Option; unsafe { core::slice::from_raw_parts(ptr, self.nlocalsplus as usize) } } /// Mutable access to fastlocals as `Option` slice. #[inline(always)] fn fastlocals_mut(&mut self) -> &mut [Option] { - let ptr = self.data.as_mut_ptr() as *mut Option; - unsafe { core::slice::from_raw_parts_mut(ptr, self.nlocalsplus as usize) } + let nlocalsplus = self.nlocalsplus as usize; + let data = self.data_as_mut_slice(); + let ptr = data.as_mut_ptr() as *mut Option; + unsafe { core::slice::from_raw_parts_mut(ptr, nlocalsplus) } } // -- Stack access -- @@ -203,10 +289,14 @@ impl LocalsPlus { #[inline(always)] fn stack_push(&mut self, val: Option) { let idx = self.nlocalsplus as usize + self.stack_top as usize; - debug_assert!(idx < self.capacity(), "stack overflow: stack_top={}, capacity={}", self.stack_top, self.stack_capacity()); - // SAFETY: Option is usize-sized. We transfer ownership - // by writing the raw bits and forgetting the original value. 
- self.data[idx] = unsafe { core::mem::transmute::, usize>(val) }; + debug_assert!( + idx < self.capacity(), + "stack overflow: stack_top={}, capacity={}", + self.stack_top, + self.stack_capacity() + ); + let data = self.data_as_mut_slice(); + data[idx] = unsafe { core::mem::transmute::, usize>(val) }; self.stack_top += 1; } @@ -217,7 +307,8 @@ impl LocalsPlus { if idx >= self.capacity() { return Err(val); } - self.data[idx] = unsafe { core::mem::transmute::, usize>(val) }; + let data = self.data_as_mut_slice(); + data[idx] = unsafe { core::mem::transmute::, usize>(val) }; self.stack_top += 1; Ok(()) } @@ -228,15 +319,17 @@ impl LocalsPlus { debug_assert!(self.stack_top > 0, "stack underflow"); self.stack_top -= 1; let idx = self.nlocalsplus as usize + self.stack_top as usize; - let raw = core::mem::replace(&mut self.data[idx], 0); + let data = self.data_as_mut_slice(); + let raw = core::mem::replace(&mut data[idx], 0); unsafe { core::mem::transmute::>(raw) } } /// Immutable view of the active stack as `Option` slice. #[inline(always)] fn stack_as_slice(&self) -> &[Option] { + let data = self.data_as_slice(); let base = self.nlocalsplus as usize; - let ptr = unsafe { (self.data.as_ptr().add(base)) as *const Option }; + let ptr = unsafe { (data.as_ptr().add(base)) as *const Option }; unsafe { core::slice::from_raw_parts(ptr, self.stack_top as usize) } } @@ -244,8 +337,9 @@ impl LocalsPlus { #[inline(always)] fn stack_index(&self, idx: usize) -> &Option { debug_assert!(idx < self.stack_top as usize); + let data = self.data_as_slice(); let raw_idx = self.nlocalsplus as usize + idx; - unsafe { &*(self.data.as_ptr().add(raw_idx) as *const Option) } + unsafe { &*(data.as_ptr().add(raw_idx) as *const Option) } } /// Get a mutable reference to a stack slot by index from the bottom. 
@@ -253,7 +347,8 @@ impl LocalsPlus { fn stack_index_mut(&mut self, idx: usize) -> &mut Option { debug_assert!(idx < self.stack_top as usize); let raw_idx = self.nlocalsplus as usize + idx; - unsafe { &mut *(self.data.as_mut_ptr().add(raw_idx) as *mut Option) } + let data = self.data_as_mut_slice(); + unsafe { &mut *(data.as_mut_ptr().add(raw_idx) as *mut Option) } } /// Get the last stack element (top of stack). @@ -281,7 +376,8 @@ impl LocalsPlus { #[inline(always)] fn stack_swap(&mut self, a: usize, b: usize) { let base = self.nlocalsplus as usize; - self.data.swap(base + a, base + b); + let data = self.data_as_mut_slice(); + data.swap(base + a, base + b); } /// Clear the stack, dropping all values. @@ -293,7 +389,10 @@ impl LocalsPlus { /// Drain stack elements from `from` to the end, returning an iterator /// that yields `Option` in forward order and shrinks the stack. - fn stack_drain(&mut self, from: usize) -> impl ExactSizeIterator> + '_ { + fn stack_drain( + &mut self, + from: usize, + ) -> impl ExactSizeIterator> + '_ { let end = self.stack_top as usize; debug_assert!(from <= end); // Reduce stack_top now; the drain iterator owns the elements. @@ -330,7 +429,8 @@ impl Iterator for LocalsPlusStackDrain<'_> { return None; } let idx = self.localsplus.nlocalsplus as usize + self.current; - let raw = core::mem::replace(&mut self.localsplus.data[idx], 0); + let data = self.localsplus.data_as_mut_slice(); + let raw = core::mem::replace(&mut data[idx], 0); self.current += 1; Some(unsafe { core::mem::transmute::>(raw) }) } @@ -348,7 +448,8 @@ impl Drop for LocalsPlusStackDrain<'_> { // Drop any remaining elements that weren't consumed. 
while self.current < self.end { let idx = self.localsplus.nlocalsplus as usize + self.current; - let raw = core::mem::replace(&mut self.localsplus.data[idx], 0); + let data = self.localsplus.data_as_mut_slice(); + let raw = core::mem::replace(&mut data[idx], 0); let _ = unsafe { core::mem::transmute::>(raw) }; self.current += 1; } @@ -357,13 +458,11 @@ impl Drop for LocalsPlusStackDrain<'_> { impl Drop for LocalsPlus { fn drop(&mut self) { - // Drop active stack entries as Option. - self.stack_clear(); - // Drop fastlocals entries as Option. - let fastlocals = self.fastlocals_mut(); - for slot in fastlocals.iter_mut() { - let _ = slot.take(); - } + // drop_values handles both stack and fastlocals. + // For DataStack-backed storage, the caller should have called + // release_datastack() before drop. If not (e.g. panic), the + // DataStack memory is leaked but values are still dropped safely. + self.drop_values(); } } @@ -542,6 +641,7 @@ impl Frame { builtins: PyObjectRef, closure: &[PyCellRef], func_obj: Option, + use_datastack: bool, vm: &VirtualMachine, ) -> Self { let nlocals = code.varnames.len(); @@ -557,7 +657,11 @@ impl Frame { // Create unified localsplus: varnames + cellvars + freevars + stack let nlocalsplus = nlocals + num_cells + nfrees; let max_stackdepth = code.max_stackdepth as usize; - let mut localsplus = LocalsPlus::new(nlocalsplus, max_stackdepth); + let mut localsplus = if use_datastack { + LocalsPlus::new_on_datastack(nlocalsplus, max_stackdepth, vm) + } else { + LocalsPlus::new(nlocalsplus, max_stackdepth) + }; // Store cell objects at cellvars and freevars positions for (i, cell) in cells_frees.iter().enumerate() { @@ -613,6 +717,17 @@ impl Frame { unsafe { (*self.localsplus.get()).fastlocals_mut() } } + /// Migrate data-stack-backed storage to the heap, preserving all values, + /// and return the data stack base pointer for `DataStack::pop()`. + /// Returns `None` if already heap-backed. 
+ /// + /// # Safety + /// Caller must ensure the frame is not executing and the returned + /// pointer is passed to `VirtualMachine::datastack_pop()`. + pub(crate) unsafe fn materialize_localsplus(&self) -> Option<*mut u8> { + unsafe { (*self.localsplus.get()).materialize_to_heap() } + } + /// Clear evaluation stack and state-owned cell/free references. /// For full local/cell cleanup, call `clear_locals_and_stack()`. pub(crate) fn clear_stack_and_cells(&self) { @@ -4135,7 +4250,10 @@ impl ExecutingFrame<'_> { let callable = self.nth_value(nargs + 1); let callable_tag = callable as *const PyObject as u32; let stack_len = self.localsplus.stack_len(); - let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 1).is_some(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 1) + .is_some(); let func = if cached_tag == callable_tag && self_or_null_is_some { callable .downcast_ref::() @@ -4247,7 +4365,10 @@ impl ExecutingFrame<'_> { let callable = self.nth_value(nargs + 1); let callable_tag = callable as *const PyObject as u32; let stack_len = self.localsplus.stack_len(); - let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 1).is_some(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 1) + .is_some(); let func = if cached_tag == callable_tag && self_or_null_is_some { callable .downcast_ref::() @@ -7104,7 +7225,10 @@ impl ExecutingFrame<'_> { // callable is at position nargs + 1 from top // self_or_null is at position nargs from top let stack_len = self.localsplus.stack_len(); - let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 1).is_some(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 1) + .is_some(); let callable = self.nth_value(nargs + 1); if let Some(func) = callable.downcast_ref::() { @@ -7275,7 +7399,10 @@ impl ExecutingFrame<'_> { // Stack: 
[callable, self_or_null, arg1, ..., argN, kwarg_names] // callable is at position nargs + 2 from top let stack_len = self.localsplus.stack_len(); - let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 2).is_some(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 2) + .is_some(); let callable = self.nth_value(nargs + 2); if let Some(func) = callable.downcast_ref::() { @@ -8085,20 +8212,23 @@ impl fmt::Debug for Frame { // SAFETY: Debug is best-effort; concurrent mutation is unlikely // and would only affect debug output. let localsplus = unsafe { &*self.localsplus.get() }; - let stack_str = localsplus.stack_as_slice().iter().fold(String::new(), |mut s, slot| { - match slot { - Some(elem) if elem.downcastable::() => { - s.push_str("\n > {frame}"); - } - Some(elem) => { - core::fmt::write(&mut s, format_args!("\n > {elem:?}")).unwrap(); - } - None => { - s.push_str("\n > NULL"); + let stack_str = localsplus + .stack_as_slice() + .iter() + .fold(String::new(), |mut s, slot| { + match slot { + Some(elem) if elem.downcastable::() => { + s.push_str("\n > {frame}"); + } + Some(elem) => { + core::fmt::write(&mut s, format_args!("\n > {elem:?}")).unwrap(); + } + None => { + s.push_str("\n > NULL"); + } } - } - s - }); + s + }); // TODO: fix this up write!( f, diff --git a/crates/vm/src/vm/mod.rs b/crates/vm/src/vm/mod.rs index 4b3cd880db6..ef65bf59d65 100644 --- a/crates/vm/src/vm/mod.rs +++ b/crates/vm/src/vm/mod.rs @@ -76,7 +76,7 @@ pub struct VirtualMachine { pub frames: RefCell>, /// Thread-local data stack for bump-allocating frame-local data /// (localsplus arrays for non-generator frames). - pub datastack: crate::datastack::DataStack, + datastack: core::cell::UnsafeCell, pub wasm_id: Option, exceptions: RefCell, pub import_func: PyObjectRef, @@ -175,6 +175,25 @@ pub fn process_hash_secret_seed() -> u32 { } impl VirtualMachine { + /// Bump-allocate `size` bytes from the thread data stack. 
+ /// + /// # Safety + /// The returned pointer must be freed by calling `datastack_pop` in LIFO order. + #[inline(always)] + pub(crate) fn datastack_push(&self, size: usize) -> *mut u8 { + unsafe { (*self.datastack.get()).push(size) } + } + + /// Pop a previous data stack allocation. + /// + /// # Safety + /// `base` must be a pointer returned by `datastack_push` on this VM, + /// and all allocations made after it must already have been popped. + #[inline(always)] + pub(crate) unsafe fn datastack_pop(&self, base: *mut u8) { + unsafe { (*self.datastack.get()).pop(base) } + } + /// Check whether the current thread is the main thread. /// Mirrors `_Py_ThreadCanHandleSignals`. #[allow(dead_code)] @@ -218,7 +237,7 @@ impl VirtualMachine { sys_module, ctx, frames: RefCell::new(vec![]), - datastack: crate::datastack::DataStack::new(), + datastack: core::cell::UnsafeCell::new(crate::datastack::DataStack::new()), wasm_id: None, exceptions: RefCell::default(), import_func, @@ -595,7 +614,7 @@ impl VirtualMachine { }; let frame = - Frame::new(code, scope, builtins, &[], Some(func_obj), self).into_ref(&self.ctx); + Frame::new(code, scope, builtins, &[], Some(func_obj), false, self).into_ref(&self.ctx); self.run_frame(frame) } diff --git a/crates/vm/src/vm/thread.rs b/crates/vm/src/vm/thread.rs index 1dee8bba285..8dd8e0312ee 100644 --- a/crates/vm/src/vm/thread.rs +++ b/crates/vm/src/vm/thread.rs @@ -309,7 +309,7 @@ impl VirtualMachine { sys_module: self.sys_module.clone(), ctx: self.ctx.clone(), frames: RefCell::new(vec![]), - datastack: crate::datastack::DataStack::new(), + datastack: core::cell::UnsafeCell::new(crate::datastack::DataStack::new()), wasm_id: self.wasm_id.clone(), exceptions: RefCell::default(), import_func: self.import_func.clone(), From a5981f0b226ae0921765646f8224a4ce81ab1c1d Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Tue, 3 Mar 2026 23:42:50 +0900 Subject: [PATCH 05/14] Fix clippy: import Layout from core::alloc instead of alloc::alloc --- 
crates/vm/src/datastack.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/vm/src/datastack.rs b/crates/vm/src/datastack.rs index f492673e6d8..7ede718057f 100644 --- a/crates/vm/src/datastack.rs +++ b/crates/vm/src/datastack.rs @@ -6,7 +6,8 @@ /// /// Normal function calls allocate from the data stack via `push()` (pointer /// bump). Generators and coroutines use heap-allocated storage instead. -use alloc::alloc::{Layout, alloc, dealloc}; +use alloc::alloc::{alloc, dealloc}; +use core::alloc::Layout; use core::ptr; /// Minimum chunk size in bytes (16 KB, matching CPython `_PY_DATA_STACK_CHUNK_SIZE`). From 938fdf72e0c35ea74f0ea716b8296f2e6bc6cd0c Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 00:17:52 +0900 Subject: [PATCH 06/14] Fix vectorcall compatibility with LocalsPlus API Update vectorcall dispatch functions to use localsplus stack accessors instead of direct stack field access. Add stack_truncate method to LocalsPlus. Update vectorcall fast path in function.rs to use datastack and fastlocals_mut(). 
--- crates/vm/src/builtins/function.rs | 13 +++++++++--- crates/vm/src/frame.rs | 33 ++++++++++++++++++++---------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs index 870e1a1e569..1316dd7b725 100644 --- a/crates/vm/src/builtins/function.rs +++ b/crates/vm/src/builtins/function.rs @@ -1309,26 +1309,33 @@ pub(crate) fn vectorcall_function( zelf.builtins.clone(), zelf.closure.as_ref().map_or(&[], |c| c.as_slice()), Some(zelf.to_owned().into()), + true, // Always use datastack (is_simple excludes gen/coro) vm, ) .into_ref(&vm.ctx); { - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..nargs)) { *slot = Some(arg); } } if let Some(cell2arg) = code.cell2arg.as_deref() { - let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + let fastlocals = unsafe { frame.fastlocals_mut() }; for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) { let x = fastlocals[*arg_idx as usize].take(); frame.set_cell_contents(cell_idx, x); } } - return vm.run_frame(frame); + let result = vm.run_frame(frame.clone()); + unsafe { + if let Some(base) = frame.materialize_localsplus() { + vm.datastack_pop(base); + } + } + return result; } // SLOW PATH: construct FuncArgs from owned Vec and delegate to invoke() diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 698e0c4ca90..3845a56b488 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -380,6 +380,14 @@ impl LocalsPlus { data.swap(base + a, base + b); } + /// Truncate the stack to `new_len` elements, dropping excess values. + fn stack_truncate(&mut self, new_len: usize) { + debug_assert!(new_len <= self.stack_top as usize); + while self.stack_top as usize > new_len { + let _ = self.stack_pop(); + } + } + /// Clear the stack, dropping all values. 
fn stack_clear(&mut self) { while self.stack_top > 0 { @@ -5959,7 +5967,7 @@ impl ExecutingFrame<'_> { #[inline] fn execute_call_vectorcall(&mut self, nargs: u32, vm: &VirtualMachine) -> FrameResult { let nargs_usize = nargs as usize; - let stack_len = self.stack.len(); + let stack_len = self.localsplus.stack_len(); debug_assert!( stack_len >= nargs_usize + 2, "CALL stack underflow: need callable + self_or_null + {nargs_usize} args, have {stack_len}" @@ -5969,7 +5977,9 @@ impl ExecutingFrame<'_> { let args_start = stack_len - nargs_usize; // Build args: [self?, arg1, ..., argN] - let self_or_null = self.stack[self_or_null_idx] + let self_or_null = self + .localsplus + .stack_index_mut(self_or_null_idx) .take() .map(|sr| sr.to_pyobj()); let has_self = self_or_null.is_some(); @@ -5984,12 +5994,12 @@ impl ExecutingFrame<'_> { args_vec.push(self_val); } for stack_idx in args_start..stack_len { - let val = self.stack[stack_idx].take().unwrap().to_pyobj(); + let val = self.localsplus.stack_index_mut(stack_idx).take().unwrap().to_pyobj(); args_vec.push(val); } - let callable_obj = self.stack[callable_idx].take().unwrap().to_pyobj(); - self.stack.truncate(callable_idx); + let callable_obj = self.localsplus.stack_index_mut(callable_idx).take().unwrap().to_pyobj(); + self.localsplus.stack_truncate(callable_idx); // invoke_vectorcall falls back to FuncArgs if no vectorcall slot let result = callable_obj.vectorcall(args_vec, effective_nargs, None, vm)?; @@ -6010,7 +6020,7 @@ impl ExecutingFrame<'_> { let kw_count = kwarg_names_tuple.len(); debug_assert!(kw_count <= nargs_usize, "CALL_KW kw_count exceeds nargs"); - let stack_len = self.stack.len(); + let stack_len = self.localsplus.stack_len(); debug_assert!( stack_len >= nargs_usize + 2, "CALL_KW stack underflow: need callable + self_or_null + {nargs_usize} args, have {stack_len}" @@ -6019,9 +6029,10 @@ impl ExecutingFrame<'_> { let self_or_null_idx = stack_len - nargs_usize - 1; let args_start = stack_len - nargs_usize; - // 
Build args: [self?, pos_arg1, ..., pos_argM, kw_val1, ..., kw_valK] - let self_or_null = self.stack[self_or_null_idx] + let self_or_null = self + .localsplus + .stack_index_mut(self_or_null_idx) .take() .map(|sr| sr.to_pyobj()); let has_self = self_or_null.is_some(); @@ -6038,12 +6049,12 @@ impl ExecutingFrame<'_> { args_vec.push(self_val); } for stack_idx in args_start..stack_len { - let val = self.stack[stack_idx].take().unwrap().to_pyobj(); + let val = self.localsplus.stack_index_mut(stack_idx).take().unwrap().to_pyobj(); args_vec.push(val); } - let callable_obj = self.stack[callable_idx].take().unwrap().to_pyobj(); - self.stack.truncate(callable_idx); + let callable_obj = self.localsplus.stack_index_mut(callable_idx).take().unwrap().to_pyobj(); + self.localsplus.stack_truncate(callable_idx); // invoke_vectorcall falls back to FuncArgs if no vectorcall slot let kwnames = kwarg_names_tuple.as_slice(); From fa49f5daec75bda10337a64f527309e0f9e4641c Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 04:30:33 +0900 Subject: [PATCH 07/14] Add datastack, nlocalsplus, ncells, tstate to cspell dictionary --- .cspell.dict/cpython.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.cspell.dict/cpython.txt b/.cspell.dict/cpython.txt index a35c7c8834e..b8081e25a9b 100644 --- a/.cspell.dict/cpython.txt +++ b/.cspell.dict/cpython.txt @@ -43,6 +43,7 @@ CONVFUNC convparam copyslot cpucount +datastack defaultdict denom deopt @@ -118,6 +119,7 @@ mult multibytecodec nameobj nameop +ncells nconsts newargs newfree @@ -125,6 +127,7 @@ NEWLOCALS newsemlockobject nfrees nkwargs +nlocalsplus nkwelts Nondescriptor noninteger @@ -192,6 +195,7 @@ testconsole ticketer tmptype tok_oldval +tstate tvars typeobject typeparam From 6a6e90da7e74010857e92e50d3bc2d7bd508ac3c Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 13:52:29 +0900 Subject: [PATCH 08/14] Fix DataStack pop() for non-monotonic allocation addresses Check both bounds of the current chunk 
when determining if a pop base is in the current chunk. The previous check (base >= chunk_start) fails on Windows where newer chunks may be allocated at lower addresses than older ones. --- crates/vm/src/datastack.rs | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/crates/vm/src/datastack.rs b/crates/vm/src/datastack.rs index 7ede718057f..bc05b6b89bd 100644 --- a/crates/vm/src/datastack.rs +++ b/crates/vm/src/datastack.rs @@ -128,8 +128,7 @@ impl DataStack { #[inline(always)] pub unsafe fn pop(&mut self, base: *mut u8) { debug_assert!(!base.is_null()); - let chunk_start = unsafe { (*self.chunk).data_start() }; - if base >= chunk_start { + if self.is_in_current_chunk(base) { // Common case: base is within the current chunk. self.top = base; } else { @@ -138,23 +137,31 @@ impl DataStack { } } + /// Check if `ptr` falls within the current chunk's data area. + /// Both bounds are checked to handle non-monotonic allocation addresses + /// (e.g. on Windows where newer chunks may be at lower addresses). + #[inline(always)] + fn is_in_current_chunk(&self, ptr: *mut u8) -> bool { + let chunk_start = unsafe { (*self.chunk).data_start() }; + ptr >= chunk_start && ptr <= self.limit + } + /// Slow path: pop back to a previous chunk. #[cold] #[inline(never)] unsafe fn pop_slow(&mut self, base: *mut u8) { - let old_chunk = self.chunk; - let prev = unsafe { (*old_chunk).previous }; - debug_assert!(!prev.is_null(), "tried to pop past the root chunk"); - unsafe { Self::free_chunk(old_chunk) }; - self.chunk = prev; - self.top = base; - // If the base is not within this previous chunk either, we have - // a logic error (LIFO violation). 
- debug_assert!( - base >= unsafe { (*prev).data_start() } && base <= unsafe { (*prev).data_limit() }, - "pop base not in previous chunk (LIFO violation)" - ); - self.limit = unsafe { (*prev).data_limit() }; + loop { + let old_chunk = self.chunk; + let prev = unsafe { (*old_chunk).previous }; + debug_assert!(!prev.is_null(), "tried to pop past the root chunk"); + unsafe { Self::free_chunk(old_chunk) }; + self.chunk = prev; + self.limit = unsafe { (*prev).data_limit() }; + if self.is_in_current_chunk(base) { + self.top = base; + return; + } + } } /// Allocate a new chunk. From 2cc6b583ad598fec4d2d783e299e9447b9cd1ddb Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 13:57:36 +0900 Subject: [PATCH 09/14] Fix stale comments: release_datastack -> materialize_localsplus --- crates/vm/src/frame.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 3845a56b488..a4adb95acb2 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -67,7 +67,7 @@ enum UnwindReason { } // FrameState fields are stored directly on Frame as UnsafeCell. -// See Frame.stack, Frame.cells_frees, Frame.prev_line. +// See Frame.localsplus, Frame.cells_frees, Frame.prev_line. /// Tracks who owns a frame. // = `_PyFrameOwner` @@ -174,8 +174,8 @@ impl LocalsPlus { /// Create a new LocalsPlus backed by the thread data stack. /// All slots are zero-initialized. /// - /// The caller must call `release_datastack()` when the frame finishes - /// to drop values and return the base pointer for `DataStack::pop()`. + /// The caller must call `materialize_localsplus()` when the frame finishes + /// to migrate data to the heap, then `datastack_pop()` to free the memory. 
fn new_on_datastack(nlocalsplus: usize, stacksize: usize, vm: &VirtualMachine) -> Self { let capacity = nlocalsplus + stacksize; let byte_size = capacity * core::mem::size_of::<usize>(); @@ -468,8 +468,9 @@ impl Drop for LocalsPlus { fn drop(&mut self) { // drop_values handles both stack and fastlocals. // For DataStack-backed storage, the caller should have called - // release_datastack() before drop. If not (e.g. panic), the - // DataStack memory is leaked but values are still dropped safely. + // materialize_localsplus() + datastack_pop() before drop. + // If not (e.g. panic), the DataStack memory is leaked but + // values are still dropped safely. self.drop_values(); } } From f300ff313083635786a24aeac84cd736eca4290f Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 14:53:44 +0900 Subject: [PATCH 10/14] Fix non-threading mode for parallel test execution Two fixes for Cell-based types used in static items under non-threading mode, which cause data races when Rust test runner uses parallel threads: 1. LazyLock: use std::sync::LazyLock when std is available instead of wrapping core::cell::LazyCell with a false `unsafe impl Sync`. The LazyCell wrapper is kept only for no-std (truly single-threaded). 2. gc_state: use static_cell! (thread-local in non-threading mode) instead of OnceLock, so each thread gets its own GcState with Cell-based PyRwLock/PyMutex that are not accessed concurrently. --- crates/common/src/lock.rs | 16 ++++++++++++---- crates/vm/src/gc_state.rs | 22 ++++++++++++---------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/crates/common/src/lock.rs b/crates/common/src/lock.rs index f1d00627e90..49cfb0393c3 100644 --- a/crates/common/src/lock.rs +++ b/crates/common/src/lock.rs @@ -10,18 +10,26 @@ cfg_if::cfg_if!
{ if #[cfg(feature = "threading")] { pub use parking_lot::{RawMutex, RawRwLock, RawThreadId}; - pub use std::sync::{LazyLock, OnceLock as OnceCell}; + pub use std::sync::OnceLock as OnceCell; pub use core::cell::LazyCell; } else { mod cell_lock; pub use cell_lock::{RawCellMutex as RawMutex, RawCellRwLock as RawRwLock, SingleThreadId as RawThreadId}; pub use core::cell::{LazyCell, OnceCell}; + } +} - /// `core::cell::LazyCell` with `Sync` for use in `static` items. - /// SAFETY: Without threading, there can be no concurrent access. +// LazyLock: thread-safe lazy initialization for `static` items. +// In non-threading mode with std, use std::sync::LazyLock for safety +// (Rust test runner uses parallel threads even without the threading feature). +// Without std, use a LazyCell wrapper (truly single-threaded environments only). +cfg_if::cfg_if! { + if #[cfg(any(feature = "threading", feature = "std"))] { + pub use std::sync::LazyLock; + } else { pub struct LazyLock<T, F = fn() -> T>(core::cell::LazyCell<T, F>); - // SAFETY: Without threading, there can be no concurrent access. + // SAFETY: Without std, there can be no threads. unsafe impl<T, F> Sync for LazyLock<T, F> {} impl<T, F: FnOnce() -> T> LazyLock<T, F> { diff --git a/crates/vm/src/gc_state.rs b/crates/vm/src/gc_state.rs index 28cf09f5ff2..2111abca162 100644 --- a/crates/vm/src/gc_state.rs +++ b/crates/vm/src/gc_state.rs @@ -134,10 +134,11 @@ pub struct GcState { finalized_objects: PyRwLock>, } -// SAFETY: All fields are either inherently Send/Sync (atomics, RwLock, Mutex) or protected by PyMutex. -// PyMutex> is safe to share/send across threads because access is synchronized. -// PyObjectRef itself is Send, and interior mutability is guarded by the mutex. +// SAFETY: GcObjectPtr wraps NonNull which is !Send/!Sync, but we only use it as an opaque +// hash key. PyObjectRef is Send. In threading mode, PyRwLock/PyMutex are parking_lot based
// and genuinely Sync. In non-threading mode, gc_state() is thread-local so Sync is not needed.
unsafe impl Send for GcState {} +#[cfg(feature = "threading")] unsafe impl Sync for GcState {} impl Default for GcState { @@ -827,14 +828,15 @@ impl GcState { } } -use std::sync::OnceLock; - -/// Global GC state instance -/// Using a static because GC needs to be accessible from object allocation/deallocation -static GC_STATE: OnceLock = OnceLock::new(); - -/// Get a reference to the global GC state +/// Get a reference to the GC state. +/// +/// In threading mode this is a true global (OnceLock). +/// In non-threading mode this is thread-local, because PyRwLock/PyMutex +/// use Cell-based locks that are not Sync. pub fn gc_state() -> &'static GcState { + rustpython_common::static_cell! { + static GC_STATE: GcState; + } GC_STATE.get_or_init(GcState::new) } From 927269c0c15bb48eca32cf4d213803ba801f94b6 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 15:50:12 +0900 Subject: [PATCH 11/14] Fix CallAllocAndEnterInit to use LocalsPlus stack API --- crates/vm/src/frame.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index a4adb95acb2..d728fa0e892 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -4311,9 +4311,8 @@ impl ExecutingFrame<'_> { let cached_version = self.code.instructions.read_cache_u32(cache_base + 1); let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 1); - let stack = &self.state.stack; - let stack_len = stack.len(); - let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let stack_len = self.localsplus.stack_len(); + let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 1).is_some(); if !self_or_null_is_some && cached_version != 0 && let Some(cls) = callable.downcast_ref::() From e8d94bb74e58c2bab2d47f023374202cbb019b7c Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 16:21:22 +0900 Subject: [PATCH 12/14] Use checked arithmetic in LocalsPlus and DataStack 
allocators --- crates/common/src/lock.rs | 5 ++- crates/vm/src/datastack.rs | 11 +++-- crates/vm/src/frame.rs | 91 ++++++++++++++++++++++++-------------- 3 files changed, 71 insertions(+), 36 deletions(-) diff --git a/crates/common/src/lock.rs b/crates/common/src/lock.rs index 49cfb0393c3..dc8c5856626 100644 --- a/crates/common/src/lock.rs +++ b/crates/common/src/lock.rs @@ -29,7 +29,10 @@ cfg_if::cfg_if! { pub use std::sync::LazyLock; } else { pub struct LazyLock T>(core::cell::LazyCell); - // SAFETY: Without std, there can be no threads. + // SAFETY: This branch is only active when both "std" and "threading" + // features are absent — i.e., truly single-threaded no_std environments + // (e.g., embedded or bare-metal WASM). Without std, the Rust runtime + // cannot spawn threads, so Sync is trivially satisfied. unsafe impl Sync for LazyLock {} impl T> LazyLock { diff --git a/crates/vm/src/datastack.rs b/crates/vm/src/datastack.rs index bc05b6b89bd..194c274e3d2 100644 --- a/crates/vm/src/datastack.rs +++ b/crates/vm/src/datastack.rs @@ -102,10 +102,15 @@ impl DataStack { #[inline(never)] fn push_slow(&mut self, aligned_size: usize) -> *mut u8 { let mut chunk_size = MIN_CHUNK_SIZE; - let needed = - aligned_size + MINIMUM_OVERHEAD + core::mem::size_of::() + ALIGN; + let needed = aligned_size + .checked_add(MINIMUM_OVERHEAD) + .and_then(|v| v.checked_add(core::mem::size_of::())) + .and_then(|v| v.checked_add(ALIGN)) + .expect("DataStack chunk size overflow"); while chunk_size < needed { - chunk_size *= 2; + chunk_size = chunk_size + .checked_mul(2) + .expect("DataStack chunk size overflow"); } // Save current position in old chunk. unsafe { diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index d728fa0e892..228575d0e95 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -163,10 +163,13 @@ const _: () = { impl LocalsPlus { /// Create a new heap-backed LocalsPlus. All slots start as None (0). 
fn new(nlocalsplus: usize, stacksize: usize) -> Self { - let capacity = nlocalsplus + stacksize; + let capacity = nlocalsplus + .checked_add(stacksize) + .expect("LocalsPlus capacity overflow"); + let nlocalsplus_u32 = u32::try_from(nlocalsplus).expect("nlocalsplus exceeds u32"); Self { data: LocalsPlusData::Heap(vec![0usize; capacity].into_boxed_slice()), - nlocalsplus: nlocalsplus as u32, + nlocalsplus: nlocalsplus_u32, stack_top: 0, } } @@ -177,14 +180,19 @@ impl LocalsPlus { /// The caller must call `materialize_localsplus()` when the frame finishes /// to migrate data to the heap, then `datastack_pop()` to free the memory. fn new_on_datastack(nlocalsplus: usize, stacksize: usize, vm: &VirtualMachine) -> Self { - let capacity = nlocalsplus + stacksize; - let byte_size = capacity * core::mem::size_of::(); + let capacity = nlocalsplus + .checked_add(stacksize) + .expect("LocalsPlus capacity overflow"); + let byte_size = capacity + .checked_mul(core::mem::size_of::()) + .expect("LocalsPlus byte size overflow"); + let nlocalsplus_u32 = u32::try_from(nlocalsplus).expect("nlocalsplus exceeds u32"); let ptr = vm.datastack_push(byte_size) as *mut usize; // Zero-initialize all slots (0 = None for both PyObjectRef and PyStackRef). 
unsafe { core::ptr::write_bytes(ptr, 0, capacity) }; Self { data: LocalsPlusData::DataStack { ptr, capacity }, - nlocalsplus: nlocalsplus as u32, + nlocalsplus: nlocalsplus_u32, stack_top: 0, } } @@ -2362,18 +2370,16 @@ impl ExecutingFrame<'_> { // Same as LoadFast but explicitly checks for unbound locals // (LoadFast in RustPython already does this check) let idx = idx.get(arg) as usize; - let x = self.localsplus.fastlocals()[idx] - .clone() - .ok_or_else(|| { - vm.new_exception_msg( - vm.ctx.exceptions.unbound_local_error.to_owned(), - format!( - "local variable '{}' referenced before assignment", - self.code.varnames[idx] - ) - .into(), + let x = self.localsplus.fastlocals()[idx].clone().ok_or_else(|| { + vm.new_exception_msg( + vm.ctx.exceptions.unbound_local_error.to_owned(), + format!( + "local variable '{}' referenced before assignment", + self.code.varnames[idx] ) - })?; + .into(), + ) + })?; self.push_value(x); Ok(None) } @@ -2413,18 +2419,16 @@ impl ExecutingFrame<'_> { // lifetime issues at yield/exception points are resolved. 
Instruction::LoadFastBorrow { var_num: idx } => { let idx = idx.get(arg) as usize; - let x = self.localsplus.fastlocals()[idx] - .clone() - .ok_or_else(|| { - vm.new_exception_msg( - vm.ctx.exceptions.unbound_local_error.to_owned(), - format!( - "local variable '{}' referenced before assignment", - self.code.varnames[idx] - ) - .into(), + let x = self.localsplus.fastlocals()[idx].clone().ok_or_else(|| { + vm.new_exception_msg( + vm.ctx.exceptions.unbound_local_error.to_owned(), + format!( + "local variable '{}' referenced before assignment", + self.code.varnames[idx] ) - })?; + .into(), + ) + })?; self.push_value(x); Ok(None) } @@ -4312,7 +4316,10 @@ impl ExecutingFrame<'_> { let nargs: u32 = arg.into(); let callable = self.nth_value(nargs + 1); let stack_len = self.localsplus.stack_len(); - let self_or_null_is_some = self.localsplus.stack_index(stack_len - nargs as usize - 1).is_some(); + let self_or_null_is_some = self + .localsplus + .stack_index(stack_len - nargs as usize - 1) + .is_some(); if !self_or_null_is_some && cached_version != 0 && let Some(cls) = callable.downcast_ref::() @@ -5994,11 +6001,21 @@ impl ExecutingFrame<'_> { args_vec.push(self_val); } for stack_idx in args_start..stack_len { - let val = self.localsplus.stack_index_mut(stack_idx).take().unwrap().to_pyobj(); + let val = self + .localsplus + .stack_index_mut(stack_idx) + .take() + .unwrap() + .to_pyobj(); args_vec.push(val); } - let callable_obj = self.localsplus.stack_index_mut(callable_idx).take().unwrap().to_pyobj(); + let callable_obj = self + .localsplus + .stack_index_mut(callable_idx) + .take() + .unwrap() + .to_pyobj(); self.localsplus.stack_truncate(callable_idx); // invoke_vectorcall falls back to FuncArgs if no vectorcall slot @@ -6049,11 +6066,21 @@ impl ExecutingFrame<'_> { args_vec.push(self_val); } for stack_idx in args_start..stack_len { - let val = self.localsplus.stack_index_mut(stack_idx).take().unwrap().to_pyobj(); + let val = self + .localsplus + 
.stack_index_mut(stack_idx) + .take() + .unwrap() + .to_pyobj(); args_vec.push(val); } - let callable_obj = self.localsplus.stack_index_mut(callable_idx).take().unwrap().to_pyobj(); + let callable_obj = self + .localsplus + .stack_index_mut(callable_idx) + .take() + .unwrap() + .to_pyobj(); self.localsplus.stack_truncate(callable_idx); // invoke_vectorcall falls back to FuncArgs if no vectorcall slot From 16778fc435163126c06f287a70bb645f94ffb8f1 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 17:23:47 +0900 Subject: [PATCH 13/14] Address code review: checked arithmetic, threading feature deps, Send gate - Use checked arithmetic for nlocalsplus in Frame::new - Add "std" to threading feature dependencies in rustpython-common - Gate GcState Send impl with #[cfg(feature = "threading")] --- crates/common/Cargo.toml | 2 +- crates/vm/src/frame.rs | 5 ++++- crates/vm/src/gc_state.rs | 4 +++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index 054e52ae81a..555336f059a 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -11,7 +11,7 @@ license.workspace = true [features] default = ["std"] std = [] -threading = ["parking_lot"] +threading = ["parking_lot", "std"] wasm_js = ["getrandom/wasm_js"] [dependencies] diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 228575d0e95..9f40352e4c7 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -672,7 +672,10 @@ impl Frame { .collect(); // Create unified localsplus: varnames + cellvars + freevars + stack - let nlocalsplus = nlocals + num_cells + nfrees; + let nlocalsplus = nlocals + .checked_add(num_cells) + .and_then(|v| v.checked_add(nfrees)) + .expect("Frame::new: nlocalsplus overflow"); let max_stackdepth = code.max_stackdepth as usize; let mut localsplus = if use_datastack { LocalsPlus::new_on_datastack(nlocalsplus, max_stackdepth, vm) diff --git a/crates/vm/src/gc_state.rs 
b/crates/vm/src/gc_state.rs index 2111abca162..a25c8d6c42a 100644 --- a/crates/vm/src/gc_state.rs +++ b/crates/vm/src/gc_state.rs @@ -136,7 +136,9 @@ pub struct GcState { // SAFETY: GcObjectPtr wraps NonNull which is !Send/!Sync, but we only use it as an opaque // hash key. PyObjectRef is Send. In threading mode, PyRwLock/PyMutex are parking_lot based -// and genuinely Sync. In non-threading mode, gc_state() is thread-local so Sync is not needed. +// and genuinely Sync. In non-threading mode, gc_state() is thread-local so neither Send +// nor Sync is needed. +#[cfg(feature = "threading")] unsafe impl Send for GcState {} #[cfg(feature = "threading")] unsafe impl Sync for GcState {} From 22df0c60a0849bd332c921c1be574aa0811eb07b Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Wed, 4 Mar 2026 21:07:36 +0900 Subject: [PATCH 14/14] Clean up comments: remove redundant/stale remarks, fix CPython references --- crates/common/src/lock.rs | 7 +++---- crates/vm/src/datastack.rs | 14 ++++++-------- crates/vm/src/frame.rs | 10 ++-------- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/crates/common/src/lock.rs b/crates/common/src/lock.rs index dc8c5856626..af680010821 100644 --- a/crates/common/src/lock.rs +++ b/crates/common/src/lock.rs @@ -20,10 +20,9 @@ cfg_if::cfg_if! { } } -// LazyLock: thread-safe lazy initialization for `static` items. -// In non-threading mode with std, use std::sync::LazyLock for safety -// (Rust test runner uses parallel threads even without the threading feature). -// Without std, use a LazyCell wrapper (truly single-threaded environments only). +// LazyLock: uses std::sync::LazyLock when std is available (even without +// threading, because Rust test runner uses parallel threads). +// Without std, uses a LazyCell wrapper (truly single-threaded only). cfg_if::cfg_if! 
{ if #[cfg(any(feature = "threading", feature = "std"))] { pub use std::sync::LazyLock; diff --git a/crates/vm/src/datastack.rs b/crates/vm/src/datastack.rs index 194c274e3d2..b00f1b6dc19 100644 --- a/crates/vm/src/datastack.rs +++ b/crates/vm/src/datastack.rs @@ -1,16 +1,14 @@ -/// Thread data stack for interpreter frames. +/// Thread data stack for interpreter frames (`_PyStackChunk` / +/// `tstate->datastack_*`). /// /// A linked list of chunks providing bump allocation for frame-local data -/// (localsplus arrays). Mirrors `_PyStackChunk` / `tstate->datastack_*` -/// from CPython. -/// -/// Normal function calls allocate from the data stack via `push()` (pointer -/// bump). Generators and coroutines use heap-allocated storage instead. +/// (localsplus arrays). Normal function calls allocate via `push()` +/// (pointer bump). Generators and coroutines use heap-allocated storage. use alloc::alloc::{alloc, dealloc}; use core::alloc::Layout; use core::ptr; -/// Minimum chunk size in bytes (16 KB, matching CPython `_PY_DATA_STACK_CHUNK_SIZE`). +/// Minimum chunk size in bytes (`_PY_DATA_STACK_CHUNK_SIZE`). const MIN_CHUNK_SIZE: usize = 16 * 1024; /// Extra headroom (in bytes) to avoid allocating a new chunk for the next @@ -66,7 +64,7 @@ impl DataStack { let top = unsafe { (*chunk).data_start() }; let limit = unsafe { (*chunk).data_limit() }; // Skip one ALIGN-sized slot in the root chunk so that `pop()` never - // frees it (same trick as CPython's `push_chunk`). + // frees it (`push_chunk` convention). let top = unsafe { top.add(ALIGN) }; Self { chunk, top, limit } } diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 9f40352e4c7..29626d104da 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -66,9 +66,6 @@ enum UnwindReason { Raising { exception: PyBaseExceptionRef }, } -// FrameState fields are stored directly on Frame as UnsafeCell. -// See Frame.localsplus, Frame.cells_frees, Frame.prev_line. - /// Tracks who owns a frame. 
// = `_PyFrameOwner` #[repr(i8)] @@ -203,7 +200,6 @@ impl LocalsPlus { fn materialize_to_heap(&mut self) -> Option<*mut u8> { if let LocalsPlusData::DataStack { ptr, capacity } = &self.data { let base = *ptr as *mut u8; - // Copy all data to a heap allocation (preserves locals for tracebacks). let heap_data = unsafe { core::slice::from_raw_parts(*ptr, *capacity) } .to_vec() .into_boxed_slice(); @@ -461,7 +457,6 @@ impl ExactSizeIterator for LocalsPlusStackDrain<'_> {} impl Drop for LocalsPlusStackDrain<'_> { fn drop(&mut self) { - // Drop any remaining elements that weren't consumed. while self.current < self.end { let idx = self.localsplus.nlocalsplus as usize + self.current; let data = self.localsplus.data_as_mut_slice(); @@ -671,7 +666,6 @@ impl Frame { .chain(closure.iter().cloned()) .collect(); - // Create unified localsplus: varnames + cellvars + freevars + stack let nlocalsplus = nlocals .checked_add(num_cells) .and_then(|v| v.checked_add(nfrees)) @@ -1001,8 +995,8 @@ impl Py { } } -/// An executing frame; essentially just a struct to combine the immutable data outside the mutex -/// with the mutable data inside +/// An executing frame; borrows mutable frame-internal data for the duration +/// of bytecode execution. struct ExecutingFrame<'a> { code: &'a PyRef, localsplus: &'a mut LocalsPlus,