diff --git a/Lib/test/test_tstring.py b/Lib/test/test_tstring.py index e91bf3f8b4e..a1f686c8f56 100644 --- a/Lib/test/test_tstring.py +++ b/Lib/test/test_tstring.py @@ -4,7 +4,6 @@ class TestTString(unittest.TestCase, TStringBaseCase): - @unittest.expectedFailure # TODO: RUSTPYTHON; + Template(strings=('Hello',), interpolations=()) def test_string_representation(self): # Test __repr__ t = t"Hello" diff --git a/crates/common/src/str.rs b/crates/common/src/str.rs index 79c407909ff..38e73a683f2 100644 --- a/crates/common/src/str.rs +++ b/crates/common/src/str.rs @@ -258,6 +258,7 @@ impl StrData { &self.data } + // TODO: rename to to_str #[inline] pub fn as_str(&self) -> Option<&str> { self.kind @@ -429,13 +430,13 @@ pub fn zfill(bytes: &[u8], width: usize) -> Vec { /// Convert a string to ascii compatible, escaping unicode-s into escape /// sequences. -pub fn to_ascii(value: &str) -> AsciiString { +pub fn to_ascii(value: &Wtf8) -> AsciiString { let mut ascii = Vec::new(); - for c in value.chars() { - if c.is_ascii() { - ascii.push(c as u8); + for cp in value.code_points() { + if cp.is_ascii() { + ascii.push(cp.to_u32() as u8); } else { - let c = c as i64; + let c = cp.to_u32(); let hex = if c < 0x100 { format!("\\x{c:02x}") } else if c < 0x10000 { diff --git a/crates/stdlib/src/_asyncio.rs b/crates/stdlib/src/_asyncio.rs index d7b5dad3c9d..2733e801251 100644 --- a/crates/stdlib/src/_asyncio.rs +++ b/crates/stdlib/src/_asyncio.rs @@ -6,6 +6,7 @@ pub(crate) use _asyncio::module_def; #[pymodule] pub(crate) mod _asyncio { + use crate::common::wtf8::{Wtf8Buf, wtf8_concat}; use crate::{ common::lock::PyRwLock, vm::{ @@ -859,7 +860,7 @@ pub(crate) mod _asyncio { } } - fn get_future_repr_info(future: &PyObject, vm: &VirtualMachine) -> PyResult { + fn get_future_repr_info(future: &PyObject, vm: &VirtualMachine) -> PyResult { // Try to use asyncio.base_futures._future_repr_info // Import from sys.modules if available, otherwise try regular import let sys_modules = vm.sys_module.get_attr("modules", vm)?; @@ -892,29 +893,34 @@ pub(crate) mod _asyncio { Err(_) => return get_future_repr_info_fallback(future, vm), }; - let parts: Vec = list - .borrow_vec() - .iter() - .filter_map(|x: &PyObjectRef| x.str(vm).ok().map(|s| s.as_str().to_string())) - .collect(); - Ok(parts.join(" ")) + let mut result = Wtf8Buf::new(); + let parts = list.borrow_vec(); + for (i, x) in parts.iter().enumerate() { + if i > 0 { + result.push_str(" "); + } + if let Ok(s) = x.str(vm) { + result.push_wtf8(s.as_wtf8()); + } + } + Ok(result) } - fn get_future_repr_info_fallback(future: &PyObject, vm: &VirtualMachine) -> PyResult { + fn get_future_repr_info_fallback(future: &PyObject, vm: &VirtualMachine) -> PyResult { // Fallback: build repr from properties directly if let Ok(Some(state)) = vm.get_attribute_opt(future.to_owned(), vm.ctx.intern_str("_state")) { - let state_str = state + let s = state .str(vm) - .map(|s| s.as_str().to_lowercase()) - .unwrap_or_else(|_| "unknown".to_string()); - return Ok(state_str); + .map(|s| s.as_wtf8().to_lowercase()) + .unwrap_or_else(|_| Wtf8Buf::from("unknown")); + return Ok(s); } - Ok("state=unknown".to_string()) + Ok(Wtf8Buf::from("state=unknown")) } - fn get_task_repr_info(task: &PyObject, vm: &VirtualMachine) -> PyResult { + fn get_task_repr_info(task: &PyObject, vm: &VirtualMachine) -> PyResult { // vm.import returns the top-level module, get base_tasks submodule match vm .import("asyncio.base_tasks", 0) @@ -927,12 +933,15 @@ pub(crate) mod _asyncio { let list: PyListRef = info.downcast().map_err(|_| { vm.new_type_error("_task_repr_info should return a list") })?; - let parts: Vec = list - .borrow_vec() - .iter() - .map(|x: &PyObjectRef| x.str(vm).map(|s| s.as_str().to_string())) - .collect::>>()?; - Ok(parts.join(" ")) + let mut result = Wtf8Buf::new(); + let parts = list.borrow_vec(); + for (i, x) in parts.iter().enumerate() { + if i > 0 { + result.push_str(" "); + } + result.push_wtf8(x.str(vm)?.as_wtf8()); + } + Ok(result) } _ => get_future_repr_info(task, vm), } @@ -1928,40 +1937,28 @@ pub(crate) mod _asyncio { } impl Representable for PyTask { - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { let class_name = zelf.class().name().to_string(); if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { // Try to use _task_repr_info if available if let Ok(info) = get_task_repr_info(zelf.as_object(), vm) - && info != "state=unknown" + && info.as_bytes() != b"state=unknown" { - return Ok(format!("<{} {}>", class_name, info)); + return Ok(wtf8_concat!("<", class_name, " ", info, ">")); } // Fallback: build repr from task properties directly let state = zelf.base.fut_state.load().as_str().to_lowercase(); - let name = zelf - .task_name - .read() - .as_ref() - .and_then(|n| n.str(vm).ok()) - .map(|s| s.as_str().to_string()) - .unwrap_or_else(|| "?".to_string()); - let coro_repr = zelf - .task_coro - .read() - .as_ref() - .and_then(|c| c.repr(vm).ok()) - .map(|s| s.as_str().to_string()) - .unwrap_or_else(|| "?".to_string()); - - Ok(format!( - "<{} {} name='{}' coro={}>", - class_name, state, name, coro_repr + let name = zelf.task_name.read().as_ref().and_then(|n| n.str(vm).ok()); + let coro_repr = zelf.task_coro.read().as_ref().and_then(|c| c.repr(vm).ok()); + let name = name.as_ref().map_or("?".as_ref(), |s| s.as_wtf8()); + let coro_repr = coro_repr.as_ref().map_or("?".as_ref(), |s| s.as_wtf8()); + Ok(wtf8_concat!( + "<", class_name, " ", state, " name='", name, "' coro=", coro_repr, ">" )) } else { - Ok(format!("<{} ...>", class_name)) + Ok(Wtf8Buf::from(format!("<{class_name} ...>"))) } } } @@ -2151,10 +2148,8 @@ pub(crate) mod _asyncio { // Check if task awaits on itself let task_obj: PyObjectRef = task.clone().into(); if result.is(&task_obj) { - let msg = format!( - "Task cannot await on itself: {}", - task_obj.repr(vm)?.as_str() - ); + let task_repr = task_obj.repr(vm)?; + let msg = format!("Task cannot await on itself: {}", task_repr.as_wtf8()); task.base.fut_state.store(FutureState::Finished); *task.base.fut_exception.write() = Some(vm.new_runtime_error(msg).into()); PyTask::schedule_callbacks(task, vm)?; @@ -2254,7 +2249,8 @@ pub(crate) mod _asyncio { vm.call_method(&loop_obj, "call_soon", (step_wrapper,))?; } } else { - let msg = format!("Task got bad yield: {}", result.repr(vm)?.as_str()); + let result_repr = result.repr(vm)?; + let msg = format!("Task got bad yield: {}", result_repr.as_wtf8()); task.base.fut_state.store(FutureState::Finished); *task.base.fut_exception.write() = Some(vm.new_runtime_error(msg).into()); PyTask::schedule_callbacks(task, vm)?; diff --git a/crates/stdlib/src/_sqlite3.rs b/crates/stdlib/src/_sqlite3.rs index f7ae445fe81..dbb13577514 100644 --- a/crates/stdlib/src/_sqlite3.rs +++ b/crates/stdlib/src/_sqlite3.rs @@ -89,7 +89,7 @@ mod _sqlite3 { $( #[allow(dead_code)] fn [](vm: &VirtualMachine, msg: String) -> PyBaseExceptionRef { - vm.new_exception_msg([<$x:snake _type>]().to_owned(), msg) + vm.new_exception_msg([<$x:snake _type>]().to_owned(), msg.into()) } fn [<$x:snake _type>]() -> &'static Py { [<$x:snake:upper>].get().expect("exception type not initialize") @@ -723,7 +723,7 @@ mod _sqlite3 { converter: ArgCallable, vm: &VirtualMachine, ) -> PyResult<()> { - let name = typename.as_str().to_uppercase(); + let name = typename.expect_str().to_uppercase(); converters().set_item(&name, converter.into(), vm) } @@ -2194,8 +2194,8 @@ mod _sqlite3 { let Some(obj) = obj.downcast_ref::() else { break; }; - let a_iter = name.as_str().chars().flat_map(|x| x.to_uppercase()); - let b_iter = obj.as_str().chars().flat_map(|x| x.to_uppercase()); + let a_iter = name.expect_str().chars().flat_map(|x| x.to_uppercase()); + let b_iter = obj.expect_str().chars().flat_map(|x| x.to_uppercase()); if a_iter.eq(b_iter) { return self.data.getitem_by_index(vm, i); @@ -2918,7 +2918,7 @@ mod _sqlite3 { }; let mut s = Vec::with_capacity(16); s.extend(b"BEGIN "); - s.extend(isolation_level.as_str().bytes()); + s.extend(isolation_level.expect_str().bytes()); s.push(b'\0'); self._exec(&s, vm) } @@ -3469,7 +3469,7 @@ mod _sqlite3 { return e; } - vm.new_exception_msg_dict(typ, msg, dict) + vm.new_exception_msg_dict(typ, msg.into(), dict) } static BEGIN_STATEMENTS: &[&[u8]] = &[ diff --git a/crates/stdlib/src/array.rs b/crates/stdlib/src/array.rs index 9877d0dcc4f..656b5028623 100644 --- a/crates/stdlib/src/array.rs +++ b/crates/stdlib/src/array.rs @@ -19,7 +19,7 @@ mod array { builtins::{ PositionIterInternal, PyByteArray, PyBytes, PyBytesRef, PyDictRef, PyFloat, PyGenericAlias, PyInt, PyList, PyListRef, PyStr, PyStrRef, PyTupleRef, PyType, - PyTypeRef, builtins_iter, + PyTypeRef, PyUtf8StrRef, builtins_iter, }, class_or_notimplemented, convert::{ToPyObject, ToPyResult, TryFromBorrowedObject, TryFromObject}, @@ -559,7 +559,7 @@ mod array { impl ArrayElement for WideChar { fn try_into_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult { - PyStrRef::try_from_object(vm, obj)? + PyUtf8StrRef::try_from_object(vm, obj)? .as_str() .chars() .exactly_one() @@ -625,7 +625,7 @@ mod array { #[derive(FromArgs)] pub struct ArrayNewArgs { #[pyarg(positional)] - spec: PyStrRef, + spec: PyUtf8StrRef, #[pyarg(positional, optional)] init: OptionalArg, } @@ -884,7 +884,7 @@ mod array { if not_enough_bytes { Err(vm.new_exception_msg( vm.ctx.exceptions.eof_error.to_owned(), - "read() didn't return enough bytes".to_owned(), + "read() didn't return enough bytes".into(), )) } else { Ok(()) @@ -1425,7 +1425,7 @@ mod array { #[pyarg(positional)] arraytype: PyTypeRef, #[pyarg(positional)] - typecode: PyStrRef, + typecode: PyUtf8StrRef, #[pyarg(positional)] mformat_code: MachineFormatCode, #[pyarg(positional)] @@ -1568,7 +1568,7 @@ mod array { Ok(typ) } - fn check_type_code(spec: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn check_type_code(spec: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { let spec = spec.as_str().chars().exactly_one().map_err(|_| { vm.new_type_error( "_array_reconstructor() argument 2 must be a unicode character, not str", diff --git a/crates/stdlib/src/binascii.rs b/crates/stdlib/src/binascii.rs index ee55d482e4c..9af56d4eea7 100644 --- a/crates/stdlib/src/binascii.rs +++ b/crates/stdlib/src/binascii.rs @@ -849,7 +849,7 @@ mod decl { struct Base64DecodeError(base64::DecodeError); fn new_binascii_error(msg: String, vm: &VirtualMachine) -> PyBaseExceptionRef { - vm.new_exception_msg(decl::error_type(vm), msg) + vm.new_exception_msg(decl::error_type(vm), msg.into()) } impl ToPyException for Base64DecodeError { diff --git a/crates/stdlib/src/contextvars.rs b/crates/stdlib/src/contextvars.rs index 883ef6f6820..1fc98edb5a7 100644 --- a/crates/stdlib/src/contextvars.rs +++ b/crates/stdlib/src/contextvars.rs @@ -15,7 +15,7 @@ mod _contextvars { AsObject, Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, atomic_func, builtins::{PyGenericAlias, PyList, PyStrRef, PyType, PyTypeRef}, class::StaticType, - common::hash::PyHash, + common::{hash::PyHash, wtf8::Wtf8Buf}, function::{ArgCallable, FuncArgs, OptionalArg}, protocol::{PyMappingMethods, PySequenceMethods}, types::{AsMapping, AsSequence, Constructor, Hashable, Iterable, Representable}, @@ -333,7 +333,7 @@ mod _contextvars { if vars.swap_remove(zelf).is_none() { // TODO: // PyErr_SetObject(PyExc_LookupError, (PyObject *)var); - let msg = zelf.as_object().repr(vm)?.as_str().to_owned(); + let msg = zelf.as_object().repr(vm)?.as_wtf8().to_owned(); return Err(vm.new_lookup_error(msg)); } @@ -409,7 +409,7 @@ mod _contextvars { default.clone() } else { let msg = zelf.as_object().repr(vm)?; - return Err(vm.new_lookup_error(msg.as_str().to_owned())); + return Err(vm.new_lookup_error(msg.as_wtf8().to_owned())); }; Ok(Some(value)) } @@ -611,11 +611,14 @@ mod _contextvars { impl Representable for ContextToken { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { let used = if zelf.used.get() { " used" } else { "" }; - let var = Representable::repr_str(&zelf.var, vm)?; + let var = Representable::repr_wtf8(&zelf.var, vm)?; let ptr = zelf.as_object().get_id() as *const u8; - Ok(format!("")) + let mut result = Wtf8Buf::from(format!("")); + Ok(result) } } diff --git a/crates/stdlib/src/csv.rs b/crates/stdlib/src/csv.rs index b898dc8c106..a1147fd1cbb 100644 --- a/crates/stdlib/src/csv.rs +++ b/crates/stdlib/src/csv.rs @@ -6,7 +6,7 @@ mod _csv { use crate::vm::{ AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, VirtualMachine, - builtins::{PyBaseExceptionRef, PyInt, PyNone, PyStr, PyType, PyTypeRef}, + builtins::{PyBaseExceptionRef, PyInt, PyNone, PyStr, PyType, PyTypeRef, PyUtf8StrRef}, function::{ArgIterable, ArgumentError, FromArgs, FuncArgs, OptionalArg}, protocol::{PyIter, PyIterReturn}, raise_if_stop, @@ -16,7 +16,7 @@ mod _csv { use csv_core::Terminator; use itertools::{self, Itertools}; use parking_lot::Mutex; - use rustpython_common::lock::LazyLock; + use rustpython_common::{lock::LazyLock, wtf8::Wtf8Buf}; use rustpython_vm::{match_class, sliceable::SliceableSequenceOp}; use std::collections::HashMap; @@ -50,8 +50,8 @@ mod _csv { }); static GLOBAL_FIELD_LIMIT: LazyLock> = LazyLock::new(|| Mutex::new(131072)); - fn new_csv_error(vm: &VirtualMachine, msg: String) -> PyBaseExceptionRef { - vm.new_exception_msg(super::_csv::error(vm), msg) + fn new_csv_error(vm: &VirtualMachine, msg: impl Into) -> PyBaseExceptionRef { + vm.new_exception_msg(super::_csv::error(vm), msg.into()) } #[pyattr] @@ -138,7 +138,7 @@ mod _csv { } else { match_class!(match obj.to_owned() { s @ PyStr => { - Ok(s.as_str().bytes().exactly_one().map_err(|_| { + Ok(s.as_bytes().iter().copied().exactly_one().map_err(|_| { vm.new_type_error(format!( r#""delimiter" must be a unicode character, not a string of length {}"#, s.len() @@ -159,19 +159,16 @@ mod _csv { fn parse_quotechar_from_obj(vm: &VirtualMachine, obj: &PyObject) -> PyResult> { match_class!(match obj.get_attr("quotechar", vm)? { s @ PyStr => { - Ok(Some(s.as_str().bytes().exactly_one().map_err(|_| { - vm.new_exception_msg( - super::_csv::error(vm), - format!(r#""quotechar" must be a unicode character or None, not a string of length {}"#, s.len()), - ) + Ok(Some(s.as_bytes().iter().copied().exactly_one().map_err(|_| { + new_csv_error(vm, format!(r#""quotechar" must be a unicode character or None, not a string of length {}"#, s.len())) })?)) } _n @ PyNone => { Ok(None) } attr => { - Err(vm.new_exception_msg( - super::_csv::error(vm), + Err(new_csv_error( + vm, format!( r#""quotechar" must be a unicode character or None, not {}"#, attr.class() @@ -183,9 +180,9 @@ mod _csv { fn parse_escapechar_from_obj(vm: &VirtualMachine, obj: &PyObject) -> PyResult> { match_class!(match obj.get_attr("escapechar", vm)? { s @ PyStr => { - Ok(Some(s.as_str().bytes().exactly_one().map_err(|_| { - vm.new_exception_msg( - super::_csv::error(vm), + Ok(Some(s.as_bytes().iter().copied().exactly_one().map_err(|_| { + new_csv_error( + vm, format!(r#""escapechar" must be a unicode character or None, not a string of length {}"#, s.len()), ) })?)) @@ -213,10 +210,7 @@ mod _csv { // only capture the first character csv_core::Terminator::Any(*t) } else { - return Err(vm.new_exception_msg( - super::_csv::error(vm), - r#""lineterminator" must be a string"#.to_owned(), - )); + return Err(new_csv_error(vm, r#""lineterminator" must be a string"#)); }) } attr => { @@ -278,9 +272,10 @@ mod _csv { mut _rest: FuncArgs, vm: &VirtualMachine, ) -> PyResult<()> { - let Some(name) = name.downcast_ref::() else { - return Err(vm.new_type_error("argument 0 must be a string")); - }; + let name = name + .downcast::() + .map_err(|_| vm.new_type_error("argument 0 must be a string"))?; + let name: PyUtf8StrRef = name.try_into_utf8(vm)?; let dialect = match dialect { OptionalArg::Present(d) => PyDialect::try_from_object(vm, d) .map_err(|_| vm.new_type_error("argument 1 must be a dialect object"))?, @@ -299,17 +294,18 @@ mod _csv { mut _rest: FuncArgs, vm: &VirtualMachine, ) -> PyResult { - let Some(name) = name.downcast_ref::() else { - return Err(vm.new_exception_msg( - super::_csv::error(vm), - format!("argument 0 must be a string, not '{}'", name.class()), - )); - }; + let name = name.downcast::().map_err(|obj| { + new_csv_error( + vm, + format!("argument 0 must be a string, not '{}'", obj.class()), + ) + })?; + let name: PyUtf8StrRef = name.try_into_utf8(vm)?; let g = GLOBAL_HASHMAP.lock(); if let Some(dialect) = g.get(name.as_str()) { return Ok(*dialect); } - Err(vm.new_exception_msg(super::_csv::error(vm), "unknown dialect".to_string())) + Err(new_csv_error(vm, "unknown dialect")) } #[pyfunction] @@ -318,17 +314,18 @@ mod _csv { mut _rest: FuncArgs, vm: &VirtualMachine, ) -> PyResult<()> { - let Some(name) = name.downcast_ref::() else { - return Err(vm.new_exception_msg( - super::_csv::error(vm), - format!("argument 0 must be a string, not '{}'", name.class()), - )); - }; + let name = name.downcast::().map_err(|obj| { + new_csv_error( + vm, + format!("argument 0 must be a string, not '{}'", obj.class()), + ) + })?; + let name: PyUtf8StrRef = name.try_into_utf8(vm)?; let mut g = GLOBAL_HASHMAP.lock(); if let Some(_removed) = g.remove(name.as_str()) { return Ok(()); } - Err(vm.new_exception_msg(super::_csv::error(vm), "unknown dialect".to_string())) + Err(new_csv_error(vm, "unknown dialect")) } #[pyfunction] @@ -537,7 +534,8 @@ mod _csv { ) -> Result { match_class!(match obj { s @ PyStr => { - Ok(DialectItem::Str(s.as_str().to_string())) + let s = s.try_into_utf8(vm).map_err(ArgumentError::Exception)?; + Ok(DialectItem::Str(s.as_str().to_owned())) } PyNone => { Err(ArgumentError::InvalidKeywordArgument("dialect".to_string())) @@ -581,10 +579,11 @@ mod _csv { if let Some(escapechar) = args.kwargs.swap_remove("escapechar") { res.escapechar = match_class!(match escapechar { - s @ PyStr => Some(s.as_str().bytes().exactly_one().map_err(|_| { - let msg = r#""escapechar" must be a 1-character string"#; - vm.new_type_error(msg.to_owned()) - })?), + s @ PyStr => + Some(s.as_bytes().iter().copied().exactly_one().map_err(|_| { + let msg = r#""escapechar" must be a 1-character string"#; + vm.new_type_error(msg.to_owned()) + })?), _ => None, }) }; @@ -626,10 +625,12 @@ mod _csv { }; if let Some(quotechar) = args.kwargs.swap_remove("quotechar") { res.quotechar = match_class!(match quotechar { - s @ PyStr => Some(Some(s.as_str().bytes().exactly_one().map_err(|_| { - let msg = r#""quotechar" must be a 1-character string"#; - vm.new_type_error(msg.to_owned()) - })?)), + s @ PyStr => Some(Some(s.as_bytes().iter().copied().exactly_one().map_err( + |_| { + let msg = r#""quotechar" must be a 1-character string"#; + vm.new_type_error(msg.to_owned()) + } + )?)), PyNone => { if let Some(QuoteStyle::All) = res.quoting { let msg = "quotechar must be set if quoting enabled"; diff --git a/crates/stdlib/src/grp.rs b/crates/stdlib/src/grp.rs index c1a52eee62e..eec901b0e57 100644 --- a/crates/stdlib/src/grp.rs +++ b/crates/stdlib/src/grp.rs @@ -5,7 +5,7 @@ pub(crate) use grp::module_def; mod grp { use crate::vm::{ PyObjectRef, PyResult, VirtualMachine, - builtins::{PyIntRef, PyListRef, PyStrRef}, + builtins::{PyIntRef, PyListRef, PyUtf8StrRef}, convert::{IntoPyException, ToPyObject}, exceptions, types::PyStructSequence, @@ -67,7 +67,7 @@ mod grp { } #[pyfunction] - fn getgrnam(name: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn getgrnam(name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { let gr_name = name.as_str(); if gr_name.contains('\0') { return Err(exceptions::cstring_error(vm)); diff --git a/crates/stdlib/src/hashlib.rs b/crates/stdlib/src/hashlib.rs index 5097d804a79..441b8f44815 100644 --- a/crates/stdlib/src/hashlib.rs +++ b/crates/stdlib/src/hashlib.rs @@ -11,7 +11,7 @@ pub mod _hashlib { use crate::vm::{ Py, PyObjectRef, PyPayload, PyResult, VirtualMachine, builtins::{ - PyBaseExceptionRef, PyBytes, PyFrozenSet, PyStr, PyStrRef, PyTypeRef, PyValueError, + PyBaseExceptionRef, PyBytes, PyFrozenSet, PyStr, PyTypeRef, PyUtf8StrRef, PyValueError, }, class::StaticType, convert::ToPyObject, @@ -79,7 +79,7 @@ pub mod _hashlib { #[allow(unused)] struct NewHashArgs { #[pyarg(positional)] - name: PyStrRef, + name: PyUtf8StrRef, #[pyarg(any, optional)] data: OptionalArg, #[pyarg(named, default = true)] @@ -188,7 +188,7 @@ pub mod _hashlib { #[allow(unused)] struct Pbkdf2HmacArgs { #[pyarg(any)] - hash_name: PyStrRef, + hash_name: PyUtf8StrRef, #[pyarg(any)] password: ArgBytesLike, #[pyarg(any)] @@ -218,18 +218,21 @@ pub mod _hashlib { } fn resolve_digestmod(digestmod: &PyObjectRef, vm: &VirtualMachine) -> PyResult { - if let Some(name) = digestmod.downcast_ref::() { - return Ok(name.as_str().to_lowercase()); + if let Some(name) = digestmod.downcast_ref::() + && let Some(name_str) = name.to_str() + { + return Ok(name_str.to_lowercase()); } if let Ok(name_obj) = digestmod.get_attr("__name__", vm) && let Some(name) = name_obj.downcast_ref::() - && let Some(algo) = name.as_str().strip_prefix("openssl_") + && let Some(name_str) = name.to_str() + && let Some(algo) = name_str.strip_prefix("openssl_") { return Ok(algo.to_owned()); } Err(vm.new_exception_msg( UnsupportedDigestmodError::static_type().to_owned(), - "unsupported digestmod".to_owned(), + "unsupported digestmod".into(), )) } @@ -254,7 +257,7 @@ pub mod _hashlib { fn unsupported_hash(name: &str, vm: &VirtualMachine) -> PyBaseExceptionRef { vm.new_exception_msg( UnsupportedDigestmodError::static_type().to_owned(), - format!("unsupported hash type {name}"), + format!("unsupported hash type {name}").into(), ) } diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index 3baeba629c8..41c3dead090 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -6,7 +6,7 @@ mod _json { use super::machinery; use crate::vm::{ AsObject, Py, PyObjectRef, PyPayload, PyResult, VirtualMachine, - builtins::{PyBaseExceptionRef, PyStrRef, PyType}, + builtins::{PyBaseExceptionRef, PyStrRef, PyType, PyUtf8StrRef}, convert::ToPyResult, function::{IntoFuncArgs, OptionalArg}, protocol::PyIterReturn, @@ -91,14 +91,14 @@ mod _json { impl JsonScanner { fn parse( &self, - pystr: PyStrRef, + pystr: PyUtf8StrRef, char_idx: usize, byte_idx: usize, scan_once: PyObjectRef, vm: &VirtualMachine, ) -> PyResult { flame_guard!("JsonScanner::parse"); - let bytes = pystr.as_str().as_bytes(); + let bytes = pystr.as_bytes(); let wtf8 = pystr.as_wtf8(); let first_byte = match bytes.get(byte_idx) { @@ -115,7 +115,7 @@ mod _json { // Parse string - pass slice starting after the quote let (wtf8_result, chars_consumed, _bytes_consumed) = machinery::scanstring(&wtf8[byte_idx + 1..], char_idx + 1, self.strict) - .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + .map_err(|e| py_decode_error(e, pystr.clone().into_wtf8(), vm))?; let end_char_idx = char_idx + 1 + chars_consumed; return Ok(PyIterReturn::Return( vm.new_tuple((wtf8_result, end_char_idx)).into(), @@ -228,7 +228,7 @@ mod _json { /// Returns (parsed_object, end_char_index, end_byte_index). fn parse_object( &self, - pystr: PyStrRef, + pystr: PyUtf8StrRef, start_char_idx: usize, start_byte_idx: usize, scan_once: &PyObjectRef, @@ -237,7 +237,7 @@ mod _json { ) -> PyResult<(PyObjectRef, usize, usize)> { flame_guard!("JsonScanner::parse_object"); - let bytes = pystr.as_str().as_bytes(); + let bytes = pystr.as_bytes(); let wtf8 = pystr.as_wtf8(); let mut char_idx = start_char_idx; let mut byte_idx = start_byte_idx; @@ -275,7 +275,7 @@ mod _json { // Parse key string using scanstring with byte slice let (key_wtf8, chars_consumed, bytes_consumed) = machinery::scanstring(&wtf8[byte_idx..], char_idx, self.strict) - .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + .map_err(|e| py_decode_error(e, pystr.clone().into_wtf8(), vm))?; char_idx += chars_consumed; byte_idx += bytes_consumed; @@ -389,7 +389,7 @@ mod _json { /// Returns (parsed_array, end_char_index, end_byte_index). fn parse_array( &self, - pystr: PyStrRef, + pystr: PyUtf8StrRef, start_char_idx: usize, start_byte_idx: usize, scan_once: &PyObjectRef, @@ -398,7 +398,7 @@ mod _json { ) -> PyResult<(PyObjectRef, usize, usize)> { flame_guard!("JsonScanner::parse_array"); - let bytes = pystr.as_str().as_bytes(); + let bytes = pystr.as_bytes(); let mut char_idx = start_char_idx; let mut byte_idx = start_byte_idx; @@ -507,15 +507,15 @@ mod _json { fn call_scan_once( &self, scan_once: &PyObjectRef, - pystr: PyStrRef, + pystr: PyUtf8StrRef, char_idx: usize, byte_idx: usize, memo: &mut HashMap, vm: &VirtualMachine, ) -> PyResult<(PyObjectRef, usize, usize)> { - let s = pystr.as_str(); - let bytes = s.as_bytes(); + let bytes = pystr.as_bytes(); let wtf8 = pystr.as_wtf8(); + let s = pystr.as_str(); let first_byte = match bytes.get(byte_idx) { Some(&b) => b, @@ -527,7 +527,7 @@ mod _json { // String - pass slice starting after the quote let (wtf8_result, chars_consumed, bytes_consumed) = machinery::scanstring(&wtf8[byte_idx + 1..], char_idx + 1, self.strict) - .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + .map_err(|e| py_decode_error(e, pystr.clone().into_wtf8(), vm))?; let py_str = vm.ctx.new_str(wtf8_result.to_string()); Ok(( py_str.into(), @@ -620,12 +620,12 @@ mod _json { fn make_decode_error( &self, msg: &str, - s: PyStrRef, + s: PyUtf8StrRef, pos: usize, vm: &VirtualMachine, ) -> PyBaseExceptionRef { let err = machinery::DecodeError::new(msg, pos); - py_decode_error(err, s, vm) + py_decode_error(err, s.into_wtf8(), vm) } } @@ -636,6 +636,7 @@ mod _json { return Err(vm.new_value_error("idx cannot be negative")); } let char_idx = char_idx as usize; + let pystr = pystr.try_into_utf8(vm)?; let s = pystr.as_str(); // Calculate byte index from char index (O(char_idx) but only at entry point) @@ -652,14 +653,8 @@ mod _json { } }; - zelf.parse( - pystr.clone(), - char_idx, - byte_idx, - zelf.to_owned().into(), - vm, - ) - .and_then(|x| x.to_pyresult(vm)) + zelf.parse(pystr, char_idx, byte_idx, zelf.to_owned().into(), vm) + .and_then(|x| x.to_pyresult(vm)) } } @@ -674,12 +669,12 @@ mod _json { } #[pyfunction] - fn encode_basestring(s: PyStrRef) -> String { + fn encode_basestring(s: PyUtf8StrRef) -> String { encode_string(s.as_str(), false) } #[pyfunction] - fn encode_basestring_ascii(s: PyStrRef) -> String { + fn encode_basestring_ascii(s: PyUtf8StrRef) -> String { encode_string(s.as_str(), true) } diff --git a/crates/stdlib/src/locale.rs b/crates/stdlib/src/locale.rs index 496325b5038..a22c6afe57e 100644 --- a/crates/stdlib/src/locale.rs +++ b/crates/stdlib/src/locale.rs @@ -45,7 +45,7 @@ mod _locale { use core::{ffi::CStr, ptr}; use rustpython_vm::{ PyObjectRef, PyResult, VirtualMachine, - builtins::{PyDictRef, PyIntRef, PyListRef, PyStrRef, PyTypeRef}, + builtins::{PyDictRef, PyIntRef, PyListRef, PyTypeRef, PyUtf8StrRef}, convert::ToPyException, function::OptionalArg, }; @@ -149,14 +149,14 @@ mod _locale { } #[pyfunction] - fn strcoll(string1: PyStrRef, string2: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn strcoll(string1: PyUtf8StrRef, string2: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { let cstr1 = CString::new(string1.as_str()).map_err(|e| e.to_pyexception(vm))?; let cstr2 = CString::new(string2.as_str()).map_err(|e| e.to_pyexception(vm))?; Ok(vm.new_pyobj(unsafe { libc::strcoll(cstr1.as_ptr(), cstr2.as_ptr()) })) } #[pyfunction] - fn strxfrm(string: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn strxfrm(string: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { // https://github.com/python/cpython/blob/eaae563b6878aa050b4ad406b67728b6b066220e/Modules/_localemodule.c#L390-L442 let n1 = string.byte_len() + 1; let mut buff = vec![0u8; n1]; @@ -227,7 +227,7 @@ mod _locale { #[pyarg(any)] category: i32, #[pyarg(any, optional)] - locale: OptionalArg>, + locale: OptionalArg>, } /// Maximum code page encoding name length on Windows @@ -262,34 +262,34 @@ mod _locale { fn setlocale(args: LocaleArgs, vm: &VirtualMachine) -> PyResult { let error = error(vm); if cfg!(windows) && (args.category < LC_ALL || args.category > LC_TIME) { - return Err(vm.new_exception_msg(error, String::from("unsupported locale setting"))); + return Err(vm.new_exception_msg(error, "unsupported locale setting".into())); } unsafe { let result = match args.locale.flatten() { None => libc::setlocale(args.category, ptr::null()), Some(locale) => { + let locale_str = locale.as_str(); // On Windows, validate encoding name length #[cfg(windows)] { let valid = if args.category == LC_ALL { - check_locale_name_all(locale.as_str()) + check_locale_name_all(locale_str) } else { - check_locale_name(locale.as_str()) + check_locale_name(locale_str) }; if !valid { - return Err(vm.new_exception_msg( - error, - String::from("unsupported locale setting"), - )); + return Err( + vm.new_exception_msg(error, "unsupported locale setting".into()) + ); } } let c_locale: CString = - CString::new(locale.as_str()).map_err(|e| e.to_pyexception(vm))?; + CString::new(locale_str).map_err(|e| e.to_pyexception(vm))?; libc::setlocale(args.category, c_locale.as_ptr()) } }; if result.is_null() { - return Err(vm.new_exception_msg(error, String::from("unsupported locale setting"))); + return Err(vm.new_exception_msg(error, "unsupported locale setting".into())); } pystr_from_raw_cstr(vm, result) } diff --git a/crates/stdlib/src/lzma.rs b/crates/stdlib/src/lzma.rs index 80e4ce80755..d9f19f0f24e 100644 --- a/crates/stdlib/src/lzma.rs +++ b/crates/stdlib/src/lzma.rs @@ -82,7 +82,8 @@ mod _lzma { } fn new_lzma_error(message: impl Into, vm: &VirtualMachine) -> PyBaseExceptionRef { - vm.new_exception_msg(vm.class("lzma", "LZMAError"), message.into()) + let msg: String = message.into(); + vm.new_exception_msg(vm.class("lzma", "LZMAError"), msg.into()) } #[pyfunction] diff --git a/crates/stdlib/src/multiprocessing.rs b/crates/stdlib/src/multiprocessing.rs index cad20972ac1..fe52cbd19fc 100644 --- a/crates/stdlib/src/multiprocessing.rs +++ b/crates/stdlib/src/multiprocessing.rs @@ -224,7 +224,7 @@ mod _multiprocessing { if !ismine!(self) { return Err(vm.new_exception_msg( vm.ctx.exceptions.assertion_error.to_owned(), - "attempt to release recursive lock not owned by thread".to_owned(), + "attempt to release recursive lock not owned by thread".into(), )); } if self.count.load(Ordering::Acquire) > 1 { @@ -796,7 +796,7 @@ mod _multiprocessing { if !ismine!(self) { return Err(vm.new_exception_msg( vm.ctx.exceptions.assertion_error.to_owned(), - "attempt to release recursive lock not owned by thread".to_owned(), + "attempt to release recursive lock not owned by thread".into(), )); } // if (self->count > 1) { --self->count; Py_RETURN_NONE; } diff --git a/crates/stdlib/src/overlapped.rs b/crates/stdlib/src/overlapped.rs index 779fe31efe5..1243a1297ea 100644 --- a/crates/stdlib/src/overlapped.rs +++ b/crates/stdlib/src/overlapped.rs @@ -309,7 +309,7 @@ mod _overlapped { let mut addr: SOCKADDR_IN = unsafe { core::mem::zeroed() }; addr.sin_family = AF_INET; - let host_wide: Vec = host.as_str().encode_utf16().chain([0]).collect(); + let host_wide: Vec = host.as_wtf8().encode_wide().chain([0]).collect(); let mut addr_len = core::mem::size_of::() as i32; let ret = unsafe { @@ -348,7 +348,7 @@ mod _overlapped { let mut addr: SOCKADDR_IN6 = unsafe { core::mem::zeroed() }; addr.sin6_family = AF_INET6; - let host_wide: Vec = host.as_str().encode_utf16().chain([0]).collect(); + let host_wide: Vec = host.as_wtf8().encode_wide().chain([0]).collect(); let mut addr_len = core::mem::size_of::() as i32; let ret = unsafe { diff --git a/crates/stdlib/src/posixshmem.rs b/crates/stdlib/src/posixshmem.rs index 2a142d8b6f3..f5481619bba 100644 --- a/crates/stdlib/src/posixshmem.rs +++ b/crates/stdlib/src/posixshmem.rs @@ -8,13 +8,15 @@ mod _posixshmem { use crate::{ common::os::errno_io_error, - vm::{FromArgs, PyResult, VirtualMachine, builtins::PyStrRef, convert::IntoPyException}, + vm::{ + FromArgs, PyResult, VirtualMachine, builtins::PyUtf8StrRef, convert::IntoPyException, + }, }; #[derive(FromArgs)] struct ShmOpenArgs { #[pyarg(any)] - name: PyStrRef, + name: PyUtf8StrRef, #[pyarg(any)] flags: libc::c_int, #[pyarg(any, default = 0o600)] @@ -37,7 +39,7 @@ mod _posixshmem { } #[pyfunction] - fn shm_unlink(name: PyStrRef, vm: &VirtualMachine) -> PyResult<()> { + fn shm_unlink(name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult<()> { let name = CString::new(name.as_str()).map_err(|e| e.into_pyexception(vm))?; // SAFETY: `name` is a valid NUL-terminated string and `shm_unlink` only reads it. let ret = unsafe { libc::shm_unlink(name.as_ptr()) }; diff --git a/crates/stdlib/src/pyexpat.rs b/crates/stdlib/src/pyexpat.rs index 7d603c72ed5..40418f54c30 100644 --- a/crates/stdlib/src/pyexpat.rs +++ b/crates/stdlib/src/pyexpat.rs @@ -41,7 +41,7 @@ macro_rules! create_bool_property { mod _pyexpat { use crate::vm::{ Context, Py, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, VirtualMachine, - builtins::{PyBytesRef, PyException, PyModule, PyStr, PyStrRef, PyType}, + builtins::{PyBytesRef, PyException, PyModule, PyStr, PyStrRef, PyType, PyUtf8StrRef}, extend_module, function::{ArgBytesLike, Either, IntoFuncArgs, OptionalArg}, types::Constructor, @@ -390,7 +390,7 @@ mod _pyexpat { #[pyarg(any, optional)] encoding: Option, #[pyarg(any, optional)] - namespace_separator: Option, + namespace_separator: Option, #[pyarg(any, optional)] intern: Option, } @@ -403,11 +403,9 @@ mod _pyexpat { // Validate namespace_separator: must be at most one character let ns_sep = match args.namespace_separator { Some(ref s) => { - let chars: Vec = s.as_str().chars().collect(); - if chars.len() > 1 { + if s.as_str().chars().count() > 1 { return Err(vm.new_value_error( - "namespace_separator must be at most one character, omitted, or None" - .to_owned(), + "namespace_separator must be at most one character, omitted, or None", )); } Some(s.as_str().to_owned()) diff --git a/crates/stdlib/src/pystruct.rs b/crates/stdlib/src/pystruct.rs index d3be417edb3..9c5d67f396c 100644 --- a/crates/stdlib/src/pystruct.rs +++ b/crates/stdlib/src/pystruct.rs @@ -19,6 +19,7 @@ pub(crate) mod _struct { types::{Constructor, IterNext, Iterable, Representable, SelfIter}, }; use crossbeam_utils::atomic::AtomicCell; + use rustpython_common::wtf8::{Wtf8Buf, wtf8_concat}; #[derive(Traverse)] struct IntoStructFormatBytes(PyStrRef); @@ -305,8 +306,8 @@ pub(crate) mod _struct { impl Representable for PyStruct { #[inline] - fn repr_str(zelf: &Py, _vm: &VirtualMachine) -> PyResult { - Ok(format!("Struct('{}')", zelf.format.as_str())) + fn repr_wtf8(zelf: &Py, _vm: &VirtualMachine) -> PyResult { + Ok(wtf8_concat!("Struct('", zelf.format.as_wtf8(), "')")) } } diff --git a/crates/stdlib/src/socket.rs b/crates/stdlib/src/socket.rs index 8c307eb54ec..dce1f27d1ce 100644 --- a/crates/stdlib/src/socket.rs +++ b/crates/stdlib/src/socket.rs @@ -12,6 +12,7 @@ mod _socket { AsObject, Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, builtins::{ PyBaseExceptionRef, PyListRef, PyModule, PyOSError, PyStrRef, PyTupleRef, PyTypeRef, + PyUtf8StrRef, }, common::os::ErrorExt, convert::{IntoPyException, ToPyObject, TryFromBorrowedObject, TryFromObject}, @@ -1224,6 +1225,7 @@ mod _socket { obj.class().name() )) })?; + let interface = interface.try_into_utf8(vm).map_err(IoOrPyException::from)?; let ifname = interface.as_str(); // Get interface index @@ -1291,6 +1293,8 @@ mod _socket { )) })?; + let alg_type = alg_type.try_into_utf8(vm).map_err(IoOrPyException::from)?; + let alg_name = alg_name.try_into_utf8(vm).map_err(IoOrPyException::from)?; let type_str = alg_type.as_str(); let name_str = alg_name.as_str(); @@ -2134,7 +2138,7 @@ mod _socket { && let Ok(addr) = sock.local_addr() && let Ok(repr) = get_addr_tuple(&addr, vm).repr(vm) { - format!(", laddr={}", repr.as_str()) + format!(", laddr={}", repr.as_wtf8()) } else { String::new() }; @@ -2484,7 +2488,7 @@ mod _socket { } struct Address { - host: PyStrRef, + host: PyUtf8StrRef, port: u16, } @@ -2509,6 +2513,7 @@ mod _socket { impl Address { fn from_tuple(tuple: &[PyObjectRef], vm: &VirtualMachine) -> PyResult { let host = PyStrRef::try_from_object(vm, tuple[0].clone())?; + let host = host.try_into_utf8(vm)?; let port = i32::try_from_borrowed_object(vm, &tuple[1])?; let port = port .to_u16() @@ -2625,12 +2630,12 @@ mod _socket { #[cfg(all(unix, not(target_os = "redox")))] #[pyfunction] - fn sethostname(hostname: PyStrRef) -> nix::Result<()> { + fn sethostname(hostname: PyUtf8StrRef) -> nix::Result<()> { nix::unistd::sethostname(hostname.as_str()) } #[pyfunction] - fn inet_aton(ip_string: PyStrRef, vm: &VirtualMachine) -> PyResult> { + fn inet_aton(ip_string: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult> { ip_string .as_str() .parse::() @@ -2904,7 +2909,7 @@ mod _socket { #[pyfunction] fn gethostbyaddr( - addr: PyStrRef, + addr: PyUtf8StrRef, vm: &VirtualMachine, ) -> Result<(String, PyListRef, PyListRef), IoOrPyException> { let addr = get_addr(vm, addr, c::AF_UNSPEC)?; @@ -2919,7 +2924,7 @@ mod _socket { } #[pyfunction] - fn gethostbyname(name: PyStrRef, vm: &VirtualMachine) -> Result { + fn gethostbyname(name: PyUtf8StrRef, vm: &VirtualMachine) -> Result { let addr = get_addr(vm, name, c::AF_INET)?; match addr { SocketAddr::V4(ip) => Ok(ip.ip().to_string()), @@ -2929,7 +2934,7 @@ mod _socket { #[pyfunction] fn gethostbyname_ex( - name: PyStrRef, + name: PyUtf8StrRef, vm: &VirtualMachine, ) -> Result<(String, PyListRef, PyListRef), IoOrPyException> { let addr = get_addr(vm, name, c::AF_INET)?; @@ -2944,7 +2949,7 @@ mod _socket { } #[pyfunction] - fn inet_pton(af_inet: i32, ip_string: PyStrRef, vm: &VirtualMachine) -> PyResult> { + fn inet_pton(af_inet: i32, ip_string: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult> { static ERROR_MSG: &str = "illegal IP address string passed to inet_pton"; let ip_addr = match af_inet { c::AF_INET => ip_string @@ -3015,10 +3020,10 @@ mod _socket { protocol: 0, }; let service = addr.port.to_string(); - let mut res = - dns_lookup::getaddrinfo(Some(addr.host.as_str()), Some(&service), Some(hints)) - .map_err(|e| convert_socket_error(vm, e, SocketError::GaiError))? - .filter_map(Result::ok); + let host_str = addr.host.as_str(); + let mut res = dns_lookup::getaddrinfo(Some(host_str), Some(&service), Some(hints)) + .map_err(|e| convert_socket_error(vm, e, SocketError::GaiError))? + .filter_map(Result::ok); let mut ainfo = res.next().unwrap(); if res.next().is_some() { return Err(vm @@ -3183,7 +3188,7 @@ mod _socket { fn get_addr( vm: &VirtualMachine, - pyname: PyStrRef, + pyname: PyUtf8StrRef, af: i32, ) -> Result { let name = pyname.as_str(); @@ -3236,7 +3241,7 @@ mod _socket { let name = vm .state .codec_registry - .encode_text(pyname, "idna", None, vm)?; + .encode_text(pyname.into_wtf8(), "idna", None, vm)?; let name = core::str::from_utf8(name.as_bytes()) .map_err(|_| vm.new_runtime_error("idna output is not utf8"))?; let mut res = dns_lookup::getaddrinfo(Some(name), None, Some(hints)) diff --git a/crates/stdlib/src/ssl.rs b/crates/stdlib/src/ssl.rs index a1b089078a8..fefca04443f 100644 --- a/crates/stdlib/src/ssl.rs +++ b/crates/stdlib/src/ssl.rs @@ -40,7 +40,10 @@ mod _ssl { vm::{ AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, VirtualMachine, - builtins::{PyBaseExceptionRef, PyBytesRef, PyListRef, PyStrRef, PyType, PyTypeRef}, + builtins::{ + PyBaseExceptionRef, PyBytesRef, PyListRef, PyStrRef, PyType, PyTypeRef, + PyUtf8StrRef, + }, convert::IntoPyException, function::{ ArgBytesLike, ArgMemoryBuffer, Either, FuncArgs, OptionalArg, PyComparisonValue, @@ -57,10 +60,16 @@ mod _ssl { }; use alloc::sync::Arc; use core::{ + hash::{Hash, Hasher}, sync::atomic::{AtomicUsize, Ordering}, time::Duration, }; - use std::{collections::HashMap, time::SystemTime}; + use rustls::crypto::aws_lc_rs::ALL_CIPHER_SUITES; + use std::{ + collections::{HashMap, hash_map::DefaultHasher}, + io::BufRead, + time::SystemTime, + }; // Rustls imports use parking_lot::{Mutex as ParkingMutex, RwLock as ParkingRwLock}; @@ -597,8 +606,6 @@ mod _ssl { /// - "ALL" or "DEFAULT" → all available /// - "!MD5" → exclusion (ignored, rustls doesn't support weak ciphers anyway) fn parse_cipher_string(cipher_str: &str) -> Result, String> { - use rustls::crypto::aws_lc_rs::ALL_CIPHER_SUITES; - if cipher_str.is_empty() { return Err("No cipher can be selected".to_string()); } @@ -801,7 +808,7 @@ mod _ssl { sock: PyObjectRef, server_side: bool, #[pyarg(positional, optional)] - server_hostname: OptionalArg>, + server_hostname: OptionalArg>, #[pyarg(named, optional)] owner: OptionalArg, #[pyarg(named, optional)] @@ -815,7 +822,7 @@ mod _ssl { #[pyarg(named, optional)] server_side: OptionalArg, #[pyarg(named, optional)] - server_hostname: OptionalArg>, + server_hostname: OptionalArg>, #[pyarg(named, optional)] owner: OptionalArg, #[pyarg(named, optional)] @@ -1080,7 +1087,7 @@ mod _ssl { // Convert callable result to string let password_from_callable = if let Ok(pwd_str) = - PyStrRef::try_from_object(vm, pwd_result.clone()) + PyUtf8StrRef::try_from_object(vm, pwd_result.clone()) { pwd_str.as_str().to_owned() } else if let Ok(pwd_bytes_like) = ArgBytesLike::try_from_object(vm, pwd_result) { @@ -1501,7 +1508,7 @@ mod _ssl { } #[pymethod] - fn set_ciphers(&self, ciphers: PyStrRef, vm: &VirtualMachine) -> PyResult<()> { + fn set_ciphers(&self, ciphers: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult<()> { let cipher_str = ciphers.as_str(); // Parse cipher string and store selected ciphers @@ -1520,7 +1527,6 @@ mod _ssl { fn get_ciphers(&self, vm: &VirtualMachine) -> PyResult { // Dynamically generate cipher list from rustls ALL_CIPHER_SUITES // This automatically includes all cipher suites supported by the current rustls version - use rustls::crypto::aws_lc_rs::ALL_CIPHER_SUITES; let cipher_list = ALL_CIPHER_SUITES .iter() @@ -1741,7 +1747,7 @@ mod _ssl { } // Validate filepath is str or bytes - let path_str = if let Ok(s) = PyStrRef::try_from_object(vm, filepath.clone()) { + let path_str = if let Ok(s) = PyUtf8StrRef::try_from_object(vm, filepath.clone()) { s.as_str().to_owned() } else if let Ok(b) = ArgBytesLike::try_from_object(vm, filepath) { String::from_utf8(b.borrow_buf().to_vec()) @@ -1796,7 +1802,7 @@ mod _ssl { } // Validate name is str or bytes - let curve_name = if let Ok(s) = PyStrRef::try_from_object(vm, name.clone()) { + let curve_name = if let Ok(s) = PyUtf8StrRef::try_from_object(vm, name.clone()) { s.as_str().to_owned() } else if let Ok(b) = ArgBytesLike::try_from_object(vm, name) { String::from_utf8(b.borrow_buf().to_vec()) @@ -1995,7 +2001,7 @@ mod _ssl { vm: &VirtualMachine, ) -> PyResult { match arg { - Either::A(s) => Ok(s.as_str().to_owned()), + Either::A(s) => Ok(s.clone().try_into_utf8(vm)?.as_str().to_owned()), Either::B(b) => String::from_utf8(b.borrow_buf().to_vec()) .map_err(|_| vm.new_value_error("path contains invalid UTF-8".to_owned())), } @@ -2013,7 +2019,7 @@ mod _ssl { match password { OptionalArg::Present(p) => { // Try string first - if let Ok(pwd_str) = PyStrRef::try_from_object(vm, p.clone()) { + if let Ok(pwd_str) = PyUtf8StrRef::try_from_object(vm, p.clone()) { Ok((Some(pwd_str.as_str().to_owned()), None)) } // Try bytes-like @@ -2171,10 +2177,10 @@ mod _ssl { fn parse_cadata_arg( &self, arg: &Either, - _vm: &VirtualMachine, + vm: &VirtualMachine, ) -> PyResult> { match arg { - Either::A(s) => Ok(s.as_str().as_bytes().to_vec()), + Either::A(s) => Ok(s.clone().try_into_utf8(vm)?.as_str().as_bytes().to_vec()), Either::B(b) => Ok(b.borrow_buf().to_vec()), } } @@ -3577,7 +3583,7 @@ mod _ssl { Some(conn) => conn, None => return Err(create_ssl_eof_error(vm).upcast()), }; - use std::io::BufRead; + let mut reader = conn.reader(); reader.fill_buf().map(|buf| buf.len()).unwrap_or(0) }; @@ -3606,7 +3612,7 @@ mod _ssl { Some(conn) => conn, None => return Err(create_ssl_zero_return_error(vm).upcast()), }; - use std::io::BufRead; + let mut reader = conn.reader(); reader.fill_buf().map(|buf| buf.len()).unwrap_or(0) }; @@ -3674,7 +3680,6 @@ mod _ssl { // Use rustls Reader's fill_buf() to check buffered plaintext // fill_buf() returns a reference to buffered data without consuming it // This matches OpenSSL's SSL_pending() behavior - use std::io::BufRead; let mut reader = conn.reader(); match reader.fill_buf() { Ok(buf) => Ok(buf.len()), @@ -3894,7 +3899,7 @@ mod _ssl { #[pygetset(setter)] fn set_server_hostname( &self, - value: Option, + value: Option, vm: &VirtualMachine, ) -> PyResult<()> { // Check if handshake is already done @@ -3905,11 +3910,14 @@ mod _ssl { } // Validate hostname - if let Some(hostname_str) = &value { - validate_hostname(hostname_str.as_str(), vm)?; - } + let hostname_string = value + .map(|s| { + validate_hostname(s.as_str(), vm)?; + Ok::(s.as_str().to_owned()) + }) + .transpose()?; - *self.server_hostname.write() = value.map(|s| s.as_str().to_string()); + *self.server_hostname.write() = hostname_string; Ok(()) } @@ -4183,7 +4191,7 @@ mod _ssl { // Timeout reached - raise TimeoutError return Err(vm.new_exception_msg( vm.ctx.exceptions.timeout_error.to_owned(), - "The read operation timed out".to_owned(), + "The read operation timed out".into(), )); } Some(dl - now) @@ -4203,7 +4211,7 @@ mod _ssl { // Raise TimeoutError return Err(vm.new_exception_msg( vm.ctx.exceptions.timeout_error.to_owned(), - "The read operation timed out".to_owned(), + "The read operation timed out".into(), )); } @@ -4529,7 +4537,7 @@ mod _ssl { #[pymethod] fn get_channel_binding( &self, - cb_type: OptionalArg, + cb_type: OptionalArg, vm: &VirtualMachine, ) -> PyResult> { let cb_type_str = cb_type.as_ref().map_or("tls-unique", |s| s.as_str()); @@ -4628,7 +4636,7 @@ mod _ssl { if !is_contiguous { return Err(vm.new_exception_msg( vm.ctx.exceptions.buffer_error.to_owned(), - "non-contiguous buffer is not supported".to_owned(), + "non-contiguous buffer is not supported".into(), )); } } @@ -4724,8 +4732,6 @@ mod _ssl { #[pygetset] fn id(&self, vm: &VirtualMachine) -> PyBytesRef { // Return session ID (hash of session data for uniqueness) - use core::hash::{Hash, Hasher}; - use std::collections::hash_map::DefaultHasher; let mut hasher = DefaultHasher::new(); self.session_data.hash(&mut hasher); @@ -4755,7 +4761,7 @@ mod _ssl { #[derive(FromArgs)] struct Txt2ObjArgs { - txt: PyStrRef, + txt: PyUtf8StrRef, #[pyarg(named, optional)] name: OptionalArg, } @@ -4901,13 +4907,13 @@ mod _ssl { /// This is a simplified wrapper around cert_der_to_dict_helper that handles /// file reading and PEM/DER auto-detection. Used by test suite. #[pyfunction] - fn _test_decode_cert(path: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn _test_decode_cert(path: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { // Read certificate file - let cert_data = std::fs::read(path.as_str()).map_err(|e| { + let path_str = path.as_str(); + let cert_data = std::fs::read(path_str).map_err(|e| { vm.new_os_error(format!( "Failed to read certificate file {}: {}", - path.as_str(), - e + path_str, e )) })?; @@ -4944,11 +4950,9 @@ mod _ssl { } #[pyfunction] - fn PEM_cert_to_DER_cert(pem_cert: PyStrRef, vm: &VirtualMachine) -> PyResult { - let pem_str = pem_cert.as_str(); - + fn PEM_cert_to_DER_cert(pem_cert: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { // Parse PEM format - let mut cursor = std::io::Cursor::new(pem_str.as_bytes()); + let mut cursor = std::io::Cursor::new(pem_cert.as_bytes()); let mut certs = rustls_pemfile::certs(&mut cursor); if let Some(Ok(cert)) = certs.next() { @@ -4961,22 +4965,27 @@ mod _ssl { // Windows-specific certificate store enumeration functions #[cfg(windows)] #[pyfunction] - fn enum_certificates(store_name: PyStrRef, vm: &VirtualMachine) -> PyResult> { + fn enum_certificates( + store_name: PyUtf8StrRef, + vm: &VirtualMachine, + ) -> PyResult> { use schannel::{RawPointer, cert_context::ValidUses, cert_store::CertStore}; use windows_sys::Win32::Security::Cryptography; + let store_name_str = store_name.as_str(); + // Try both Current User and Local Machine stores let open_fns = [CertStore::open_current_user, CertStore::open_local_machine]; let stores = open_fns .iter() - .filter_map(|open| open(store_name.as_str()).ok()) + .filter_map(|open| open(store_name_str).ok()) .collect::>(); // If no stores could be opened, raise OSError if stores.is_empty() { return Err(vm.new_os_error(format!( "failed to open certificate store {:?}", - store_name.as_str() + store_name_str ))); } @@ -5011,14 +5020,14 @@ mod _ssl { #[cfg(windows)] #[pyfunction] - fn enum_crls(store_name: PyStrRef, vm: &VirtualMachine) -> PyResult> { + fn enum_crls(store_name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult> { use windows_sys::Win32::Security::Cryptography::{ CRL_CONTEXT, CertCloseStore, CertEnumCRLsInStore, CertOpenSystemStoreW, X509_ASN_ENCODING, }; - let store_name_wide: Vec = store_name - .as_str() + let store_name_str = store_name.as_str(); + let store_name_wide: Vec = store_name_str .encode_utf16() .chain(core::iter::once(0)) .collect(); @@ -5029,7 +5038,7 @@ mod _ssl { if store.is_null() { return Err(vm.new_os_error(format!( "failed to open certificate store {:?}", - store_name.as_str() + store_name_str ))); } @@ -5135,9 +5144,6 @@ mod _ssl { // Implement Hashable trait for PySSLCertificate impl Hashable for PySSLCertificate { fn hash(zelf: &Py, _vm: &VirtualMachine) -> PyResult { - use core::hash::{Hash, Hasher}; - use std::collections::hash_map::DefaultHasher; - let mut hasher = DefaultHasher::new(); zelf.der_bytes.hash(&mut hasher); Ok(hasher.finish() as PyHash) diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index a575c1ae7e5..5664fd0c36e 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -20,14 +20,12 @@ enum NormalizeForm { impl<'a> TryFromBorrowedObject<'a> for NormalizeForm { fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult { obj.try_value_with( - |form: &PyStr| { - Ok(match form.as_str() { - "NFC" => Self::Nfc, - "NFKC" => Self::Nfkc, - "NFD" => Self::Nfd, - "NFKD" => Self::Nfkd, - _ => return Err(vm.new_value_error("invalid normalization form")), - }) + |form: &PyStr| match form.as_bytes() { + b"NFC" => Ok(Self::Nfc), + b"NFKC" => Ok(Self::Nfkc), + b"NFD" => Ok(Self::Nfd), + b"NFKD" => Ok(Self::Nfkd), + _ => Err(vm.new_value_error("invalid normalization form")), }, vm, ) @@ -36,6 +34,7 @@ impl<'a> TryFromBorrowedObject<'a> for NormalizeForm { #[pymodule] mod unicodedata { + use super::NormalizeForm::*; use crate::vm::{ Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, builtins::{PyModule, PyStrRef}, @@ -126,7 +125,8 @@ mod unicodedata { #[pymethod] fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult { - if let Some(character) = unicode_names2::character(name.as_str()) + if let Some(name_str) = name.to_str() + && let Some(character) = unicode_names2::character(name_str) && self.check_age(character.into()) { return Ok(character.to_string()); @@ -188,7 +188,6 @@ mod unicodedata { #[pymethod] fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { - use super::NormalizeForm::*; let text = unistr.as_wtf8(); let normalized_text = match form { Nfc => text.map_utf8(|s| s.nfc()).collect(), @@ -201,7 +200,6 @@ mod unicodedata { #[pymethod] fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { - use super::NormalizeForm::*; let text = unistr.as_wtf8(); let normalized: Wtf8Buf = match form { Nfc => text.map_utf8(|s| s.nfc()).collect(), diff --git a/crates/stdlib/src/zlib.rs b/crates/stdlib/src/zlib.rs index 40269f12bbf..35a617ed152 100644 --- a/crates/stdlib/src/zlib.rs +++ b/crates/stdlib/src/zlib.rs @@ -466,7 +466,8 @@ mod zlib { } fn new_zlib_error(message: impl Into, vm: &VirtualMachine) -> PyBaseExceptionRef { - vm.new_exception_msg(vm.class("zlib", "error"), message.into()) + let msg: String = message.into(); + vm.new_exception_msg(vm.class("zlib", "error"), msg.into()) } struct Level(Option); diff --git a/crates/vm/src/buffer.rs b/crates/vm/src/buffer.rs index 33670f1c30a..db58c909bca 100644 --- a/crates/vm/src/buffer.rs +++ b/crates/vm/src/buffer.rs @@ -725,5 +725,6 @@ pub fn struct_error_type(vm: &VirtualMachine) -> &'static PyTypeRef { pub fn new_struct_error(vm: &VirtualMachine, msg: impl Into) -> PyBaseExceptionRef { // can't just STRUCT_ERROR.get().unwrap() cause this could be called before from buffer // machinery, independent of whether _struct was ever imported + let msg: String = msg.into(); vm.new_exception_msg(struct_error_type(vm).clone(), msg.into()) } diff --git a/crates/vm/src/builtins/bool.rs b/crates/vm/src/builtins/bool.rs index 24ded08ab10..dff2e642b93 100644 --- a/crates/vm/src/builtins/bool.rs +++ b/crates/vm/src/builtins/bool.rs @@ -1,4 +1,4 @@ -use super::{PyInt, PyStrRef, PyType, PyTypeRef}; +use super::{PyInt, PyStrRef, PyType, PyTypeRef, PyUtf8StrRef}; use crate::common::format::FormatSpec; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyResult, TryFromBorrowedObject, VirtualMachine, @@ -101,7 +101,7 @@ impl Constructor for PyBool { #[pyclass(with(Constructor, AsNumber, Representable), flags(_MATCH_SELF))] impl PyBool { #[pymethod] - fn __format__(obj: PyObjectRef, spec: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn __format__(obj: PyObjectRef, spec: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { let new_bool = obj.try_to_bool(vm)?; FormatSpec::parse(spec.as_str()) .and_then(|format_spec| format_spec.format_bool(new_bool)) diff --git a/crates/vm/src/builtins/classmethod.rs b/crates/vm/src/builtins/classmethod.rs index d2f1377be04..22df6f27475 100644 --- a/crates/vm/src/builtins/classmethod.rs +++ b/crates/vm/src/builtins/classmethod.rs @@ -211,11 +211,11 @@ impl Representable for PyClassMethod { class .__qualname__(vm) .downcast_ref::() - .map(|n| n.as_str()), + .map(|n| n.as_wtf8()), class .__module__(vm) .downcast_ref::() - .map(|m| m.as_str()), + .map(|m| m.as_wtf8()), ) { (None, _) => return Err(vm.new_type_error("Unknown qualified name")), (Some(qualname), Some(module)) if module != "builtins" => { diff --git a/crates/vm/src/builtins/code.rs b/crates/vm/src/builtins/code.rs index 790f3fd8695..fcaff687946 100644 --- a/crates/vm/src/builtins/code.rs +++ b/crates/vm/src/builtins/code.rs @@ -459,7 +459,7 @@ impl Constructor for PyCode { let s = obj.downcast_ref::().ok_or_else(|| { vm.new_type_error("names must be tuple of strings".to_owned()) })?; - Ok(vm.ctx.intern_str(s.as_str())) + Ok(vm.ctx.intern_str(s.as_wtf8())) }) .collect::>>()? .into_boxed_slice(); @@ -471,7 +471,7 @@ impl Constructor for PyCode { let s = obj.downcast_ref::().ok_or_else(|| { vm.new_type_error("varnames must be tuple of strings".to_owned()) })?; - Ok(vm.ctx.intern_str(s.as_str())) + Ok(vm.ctx.intern_str(s.as_wtf8())) }) .collect::>>()? .into_boxed_slice(); @@ -483,7 +483,7 @@ impl Constructor for PyCode { let s = obj.downcast_ref::().ok_or_else(|| { vm.new_type_error("cellvars must be tuple of strings".to_owned()) })?; - Ok(vm.ctx.intern_str(s.as_str())) + Ok(vm.ctx.intern_str(s.as_wtf8())) }) .collect::>>()? .into_boxed_slice(); @@ -495,7 +495,7 @@ impl Constructor for PyCode { let s = obj.downcast_ref::().ok_or_else(|| { vm.new_type_error("freevars must be tuple of strings".to_owned()) })?; - Ok(vm.ctx.intern_str(s.as_str())) + Ok(vm.ctx.intern_str(s.as_wtf8())) }) .collect::>>()? .into_boxed_slice(); @@ -551,15 +551,15 @@ impl Constructor for PyCode { posonlyarg_count: args.posonlyargcount, arg_count: args.argcount, kwonlyarg_count: args.kwonlyargcount, - source_path: vm.ctx.intern_str(args.filename.as_str()), + source_path: vm.ctx.intern_str(args.filename.as_wtf8()), first_line_number: if args.firstlineno > 0 { OneIndexed::new(args.firstlineno as usize) } else { None }, max_stackdepth: args.stacksize, - obj_name: vm.ctx.intern_str(args.name.as_str()), - qualname: vm.ctx.intern_str(args.qualname.as_str()), + obj_name: vm.ctx.intern_str(args.name.as_wtf8()), + qualname: vm.ctx.intern_str(args.qualname.as_wtf8()), cell2arg: None, // TODO: reuse `fn cell2arg` constants, names, diff --git a/crates/vm/src/builtins/complex.rs b/crates/vm/src/builtins/complex.rs index dd68061557d..ab656c974ee 100644 --- a/crates/vm/src/builtins/complex.rs +++ b/crates/vm/src/builtins/complex.rs @@ -1,9 +1,9 @@ use super::{PyStr, PyType, PyTypeRef, float}; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, - builtins::PyStrRef, + builtins::PyUtf8StrRef, class::PyClassImpl, - common::format::FormatSpec, + common::{format::FormatSpec, wtf8::Wtf8Buf}, convert::{IntoPyException, ToPyObject, ToPyResult}, function::{FuncArgs, OptionalArg, PyComparisonValue}, protocol::PyNumberMethods, @@ -276,13 +276,14 @@ impl PyComplex { } #[pymethod] - fn __format__(zelf: &Py, spec: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn __format__(zelf: &Py, spec: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { // Empty format spec: equivalent to str(self) if spec.is_empty() { - return Ok(zelf.as_object().str(vm)?.as_str().to_owned()); + return Ok(zelf.as_object().str(vm)?.as_wtf8().to_owned()); } FormatSpec::parse(spec.as_str()) .and_then(|format_spec| format_spec.format_complex(&zelf.value)) + .map(Wtf8Buf::from_string) .map_err(|err| err.into_pyexception(vm)) } } diff --git a/crates/vm/src/builtins/dict.rs b/crates/vm/src/builtins/dict.rs index 1d79a5a5906..0cc9ee66f3f 100644 --- a/crates/vm/src/builtins/dict.rs +++ b/crates/vm/src/builtins/dict.rs @@ -27,6 +27,7 @@ use crate::{ }; use alloc::fmt; use rustpython_common::lock::PyMutex; +use rustpython_common::wtf8::Wtf8Buf; pub type DictContentType = dict_inner::Dict; @@ -543,14 +544,19 @@ impl Representable for PyDict { #[inline] fn repr(zelf: &Py, vm: &VirtualMachine) -> PyResult { let s = if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { - let mut str_parts = Vec::with_capacity(zelf.__len__()); + let mut result = Wtf8Buf::from("{"); + let mut first = true; for (key, value) in zelf { - let key_repr = &key.repr(vm)?; - let value_repr = value.repr(vm)?; - str_parts.push(format!("{key_repr}: {value_repr}")); + if !first { + result.push_str(", "); + } + first = false; + result.push_wtf8(key.repr(vm)?.as_wtf8()); + result.push_str(": "); + result.push_wtf8(value.repr(vm)?.as_wtf8()); } - - vm.ctx.new_str(format!("{{{}}}", str_parts.join(", "))) + result.push_char('}'); + vm.ctx.new_str(result) } else { vm.ctx.intern_str("{...}").to_owned() }; @@ -848,13 +854,17 @@ macro_rules! dict_view { #[inline] fn repr(zelf: &Py, vm: &VirtualMachine) -> PyResult { let s = if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { - let mut str_parts = Vec::with_capacity(zelf.__len__()); + let mut result = Wtf8Buf::from(format!("{}([", Self::NAME)); + let mut first = true; for (key, value) in zelf.dict().clone() { - let s = &Self::item(vm, key, value).repr(vm)?; - str_parts.push(s.as_str().to_owned()); + if !first { + result.push_str(", "); + } + first = false; + result.push_wtf8(Self::item(vm, key, value).repr(vm)?.as_wtf8()); } - vm.ctx - .new_str(format!("{}([{}])", Self::NAME, str_parts.join(", "))) + result.push_str("])"); + vm.ctx.new_str(result) } else { vm.ctx.intern_str("{...}").to_owned() }; diff --git a/crates/vm/src/builtins/float.rs b/crates/vm/src/builtins/float.rs index 89e42ec0f39..23d04d36998 100644 --- a/crates/vm/src/builtins/float.rs +++ b/crates/vm/src/builtins/float.rs @@ -1,11 +1,12 @@ use super::{ - PyByteArray, PyBytes, PyInt, PyIntRef, PyStr, PyStrRef, PyType, PyTypeRef, try_bigint_to_f64, + PyByteArray, PyBytes, PyInt, PyIntRef, PyStr, PyType, PyTypeRef, PyUtf8StrRef, + try_bigint_to_f64, }; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromBorrowedObject, TryFromObject, VirtualMachine, class::PyClassImpl, - common::{float_ops, format::FormatSpec, hash}, + common::{float_ops, format::FormatSpec, hash, wtf8::Wtf8Buf}, convert::{IntoPyException, ToPyObject, ToPyResult}, function::{ ArgBytesLike, FuncArgs, OptionalArg, OptionalOption, PyArithmeticValue::*, @@ -214,18 +215,19 @@ fn float_from_string(val: PyObjectRef, vm: &VirtualMachine) -> PyResult { )] impl PyFloat { #[pymethod] - fn __format__(zelf: &Py, spec: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn __format__(zelf: &Py, spec: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { // Empty format spec: equivalent to str(self) if spec.is_empty() { - return Ok(zelf.as_object().str(vm)?.as_str().to_owned()); + return Ok(zelf.as_object().str(vm)?.as_wtf8().to_owned()); } FormatSpec::parse(spec.as_str()) .and_then(|format_spec| format_spec.format_float(zelf.value)) + .map(Wtf8Buf::from_string) .map_err(|err| err.into_pyexception(vm)) } #[pystaticmethod] - fn __getformat__(spec: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn __getformat__(spec: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { if !matches!(spec.as_str(), "double" | "float") { return Err( vm.new_value_error("__getformat__() argument 1 must be 'double' or 'float'") @@ -340,7 +342,7 @@ impl PyFloat { } #[pyclassmethod] - fn fromhex(cls: PyTypeRef, string: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn fromhex(cls: PyTypeRef, string: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { let result = crate::literal::float::from_hex(string.as_str().trim()) .ok_or_else(|| vm.new_value_error("invalid hexadecimal floating-point string"))?; PyType::call(&cls, vec![vm.ctx.new_float(result).into()].into(), vm) diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs index 43615711e5d..f8ff44e8a72 100644 --- a/crates/vm/src/builtins/function.rs +++ b/crates/vm/src/builtins/function.rs @@ -14,6 +14,7 @@ use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, bytecode, class::PyClassImpl, + common::wtf8::{Wtf8Buf, wtf8_concat}, frame::Frame, function::{FuncArgs, OptionalArg, PyComparisonValue, PySetterValue}, scope::Scope, @@ -1074,28 +1075,6 @@ impl PyBoundMethod { fn __module__(&self, vm: &VirtualMachine) -> Option { self.function.get_attr("__module__", vm).ok() } - - #[pygetset] - fn __qualname__(&self, vm: &VirtualMachine) -> PyResult { - if self - .function - .fast_isinstance(vm.ctx.types.builtin_function_or_method_type) - { - // Special case: we work with `__new__`, which is not really a method. - // It is a function, so its `__qualname__` is just `__new__`. - // We need to add object's part manually. - let obj_name = vm.get_attribute_opt(self.object.clone(), "__qualname__")?; - let obj_name: Option = obj_name.and_then(|o| o.downcast().ok()); - return Ok(vm - .ctx - .new_str(format!( - "{}.__new__", - obj_name.as_ref().map_or("?", |s| s.as_str()) - )) - .into()); - } - self.function.get_attr("__qualname__", vm) - } } impl PyPayload for PyBoundMethod { @@ -1107,21 +1086,23 @@ impl PyPayload for PyBoundMethod { impl Representable for PyBoundMethod { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let func_name = - if let Some(qname) = vm.get_attribute_opt(zelf.function.clone(), "__qualname__")? { - Some(qname) - } else { - vm.get_attribute_opt(zelf.function.clone(), "__name__")? - }; - let func_name: Option = func_name.and_then(|o| o.downcast().ok()); - let formatted_func_name = match func_name { - Some(name) => name.to_string(), - None => "?".to_string(), + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let func_name = if let Some(qname) = + vm.get_attribute_opt(zelf.function.clone(), identifier!(vm, __qualname__))? + { + Some(qname) + } else { + vm.get_attribute_opt(zelf.function.clone(), identifier!(vm, __name__))? }; + let func_name: Option = func_name.and_then(|o| o.downcast().ok()); let object_repr = zelf.object.repr(vm)?; - Ok(format!( - "", + let name = func_name.as_ref().map_or("?".as_ref(), |s| s.as_wtf8()); + Ok(wtf8_concat!( + "" )) } } diff --git a/crates/vm/src/builtins/function/jit.rs b/crates/vm/src/builtins/function/jit.rs index a28335900da..9d3803759cf 100644 --- a/crates/vm/src/builtins/function/jit.rs +++ b/crates/vm/src/builtins/function/jit.rs @@ -41,7 +41,7 @@ impl ToPyObject for AbiValue { pub fn new_jit_error(msg: String, vm: &VirtualMachine) -> PyBaseExceptionRef { let jit_error = vm.ctx.exceptions.jit_error.to_owned(); - vm.new_exception_msg(jit_error, msg) + vm.new_exception_msg(jit_error, msg.into()) } fn get_jit_arg_type(dict: &Py, name: &str, vm: &VirtualMachine) -> PyResult { diff --git a/crates/vm/src/builtins/genericalias.rs b/crates/vm/src/builtins/genericalias.rs index 8aabca1ae10..678fca606ff 100644 --- a/crates/vm/src/builtins/genericalias.rs +++ b/crates/vm/src/builtins/genericalias.rs @@ -635,14 +635,14 @@ impl Hashable for PyGenericAlias { impl GetAttr for PyGenericAlias { fn getattro(zelf: &Py, attr: &Py, vm: &VirtualMachine) -> PyResult { - let attr_str = attr.as_str(); + let attr_str = attr.as_wtf8(); for exc in &ATTR_EXCEPTIONS { - if *exc == attr_str { + if attr_str == *exc { return zelf.as_object().generic_getattr(attr, vm); } } for blocked in &ATTR_BLOCKED { - if *blocked == attr_str { + if attr_str == *blocked { return zelf.as_object().generic_getattr(attr, vm); } } diff --git a/crates/vm/src/builtins/int.rs b/crates/vm/src/builtins/int.rs index bbbc7d17673..0bf7372bbb3 100644 --- a/crates/vm/src/builtins/int.rs +++ b/crates/vm/src/builtins/int.rs @@ -2,13 +2,14 @@ use super::{PyByteArray, PyBytes, PyStr, PyType, PyTypeRef, float}; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult, TryFromBorrowedObject, VirtualMachine, - builtins::PyStrRef, + builtins::PyUtf8StrRef, bytes_inner::PyBytesInner, class::PyClassImpl, common::{ format::FormatSpec, hash, int::{bigint_to_finite_float, bytes_to_int, true_div}, + wtf8::Wtf8Buf, }, convert::{IntoPyException, ToPyObject, ToPyResult}, function::{ @@ -197,7 +198,7 @@ fn inner_truediv(i1: &BigInt, i2: &BigInt, vm: &VirtualMachine) -> PyResult { if float.is_infinite() { Err(vm.new_exception_msg( vm.ctx.exceptions.overflow_error.to_owned(), - "integer division result too large for a float".to_owned(), + "integer division result too large for a float".into(), )) } else { Ok(vm.ctx.new_float(float).into()) @@ -444,13 +445,14 @@ impl PyInt { } #[pymethod] - fn __format__(zelf: &Py, spec: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn __format__(zelf: &Py, spec: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { // Empty format spec on a subclass: equivalent to str(self) if spec.is_empty() && !zelf.class().is(vm.ctx.types.int_type) { - return Ok(zelf.as_object().str(vm)?.as_str().to_owned()); + return Ok(zelf.as_object().str(vm)?.as_wtf8().to_owned()); } FormatSpec::parse(spec.as_str()) .and_then(|format_spec| format_spec.format_int(&zelf.value)) + .map(Wtf8Buf::from_string) .map_err(|err| err.into_pyexception(vm)) } diff --git a/crates/vm/src/builtins/interpolation.rs b/crates/vm/src/builtins/interpolation.rs index afdce51be9b..18fd10ba022 100644 --- a/crates/vm/src/builtins/interpolation.rs +++ b/crates/vm/src/builtins/interpolation.rs @@ -10,6 +10,8 @@ use crate::{ function::{OptionalArg, PyComparisonValue}, types::{Comparable, Constructor, Hashable, PyComparisonOp, Representable}, }; +use itertools::Itertools; +use rustpython_common::wtf8::Wtf8Buf; /// Interpolation object for t-strings (PEP 750). /// @@ -42,11 +44,11 @@ impl PyInterpolation { let is_valid = vm.is_none(&conversion) || conversion .downcast_ref::() - .is_some_and(|s| matches!(s.as_str(), "s" | "r" | "a")); + .is_some_and(|s| matches!(s.to_str(), Some("s") | Some("r") | Some("a"))); if !is_valid { return Err(vm.new_exception_msg( vm.ctx.exceptions.system_error.to_owned(), - "Interpolation() argument 'conversion' must be one of 's', 'a' or 'r'".to_owned(), + "Interpolation() argument 'conversion' must be one of 's', 'a' or 'r'".into(), )); } Ok(Self { @@ -63,8 +65,13 @@ impl Constructor for PyInterpolation { fn py_new(_cls: &Py, args: Self::Args, vm: &VirtualMachine) -> PyResult { let conversion: PyObjectRef = if let Some(s) = args.conversion { - let s_str = s.as_str(); - if s_str.len() != 1 || !matches!(s_str.chars().next(), Some('s' | 'r' | 'a')) { + let has_flag = s + .as_bytes() + .iter() + .exactly_one() + .ok() + .is_some_and(|s| matches!(*s, b's' | b'r' | b'a')); + if !has_flag { return Err(vm.new_value_error( "Interpolation() argument 'conversion' must be one of 's', 'a' or 'r'", )); @@ -194,25 +201,26 @@ impl Hashable for PyInterpolation { impl Representable for PyInterpolation { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { let value_repr = zelf.value.repr(vm)?; let expr_repr = zelf.expression.repr(vm)?; + let spec_repr = zelf.format_spec.repr(vm)?; - let conv_str = if vm.is_none(&zelf.conversion) { - "None".to_owned() + let mut result = Wtf8Buf::from("Interpolation("); + result.push_wtf8(value_repr.as_wtf8()); + result.push_str(", "); + result.push_str(&expr_repr); + result.push_str(", "); + if vm.is_none(&zelf.conversion) { + result.push_str("None"); } else { - zelf.conversion.repr(vm)?.as_str().to_owned() - }; - - let spec_repr = zelf.format_spec.repr(vm)?; + result.push_wtf8(zelf.conversion.repr(vm)?.as_wtf8()); + } + result.push_str(", "); + result.push_str(&spec_repr); + result.push_char(')'); - Ok(format!( - "Interpolation({}, {}, {}, {})", - value_repr.as_str(), - expr_repr.as_str(), - conv_str, - spec_repr.as_str() - )) + Ok(result) } } diff --git a/crates/vm/src/builtins/list.rs b/crates/vm/src/builtins/list.rs index 7e22f73f8ec..f74d4c4d62f 100644 --- a/crates/vm/src/builtins/list.rs +++ b/crates/vm/src/builtins/list.rs @@ -9,6 +9,7 @@ use crate::common::lock::{ use crate::object::{Traverse, TraverseFn}; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, + builtins::PyStr, class::PyClassImpl, convert::ToPyObject, function::{ArgSize, FuncArgs, OptionalArg, PyComparisonValue}, @@ -516,19 +517,25 @@ impl Comparable for PyList { impl Representable for PyList { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let s = if zelf.__len__() == 0 { - "[]".to_owned() - } else if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { + fn repr(zelf: &Py, vm: &VirtualMachine) -> PyResult> { + if zelf.__len__() == 0 { + return Ok(vm.ctx.intern_str("[]").to_owned()); + } + if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { // Clone elements before calling repr to release the read lock. // Element repr may mutate the list (e.g., list.clear()), which // needs a write lock and would deadlock if read lock is held. let elements: Vec = zelf.borrow_vec().to_vec(); - collection_repr(None, "[", "]", elements.iter(), vm)? + Ok(vm + .ctx + .new_str(collection_repr(None, "[", "]", elements.iter(), vm)?)) } else { - "[...]".to_owned() - }; - Ok(s) + Ok(vm.ctx.intern_str("[...]").to_owned()) + } + } + + fn repr_str(_zelf: &Py, _vm: &VirtualMachine) -> PyResult { + unreachable!("repr() is overridden directly") } } diff --git a/crates/vm/src/builtins/mappingproxy.rs b/crates/vm/src/builtins/mappingproxy.rs index 11525c3f80a..7c852af081f 100644 --- a/crates/vm/src/builtins/mappingproxy.rs +++ b/crates/vm/src/builtins/mappingproxy.rs @@ -1,10 +1,9 @@ use super::{PyDict, PyDictRef, PyGenericAlias, PyList, PyTuple, PyType, PyTypeRef}; -use crate::common::lock::LazyLock; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, atomic_func, class::PyClassImpl, - common::hash, + common::{hash, lock::LazyLock}, convert::ToPyObject, function::{ArgMapping, OptionalArg, PyComparisonValue}, object::{Traverse, TraverseFn}, @@ -14,6 +13,7 @@ use crate::{ PyComparisonOp, Representable, }, }; +use rustpython_common::wtf8::{Wtf8Buf, wtf8_concat}; #[pyclass(module = false, name = "mappingproxy", traverse)] #[derive(Debug)] @@ -287,9 +287,9 @@ impl Iterable for PyMappingProxy { impl Representable for PyMappingProxy { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { let obj = zelf.to_object(vm)?; - Ok(format!("mappingproxy({})", obj.repr(vm)?)) + Ok(wtf8_concat!("mappingproxy(", obj.repr(vm)?.as_wtf8(), ')')) } } diff --git a/crates/vm/src/builtins/memory.rs b/crates/vm/src/builtins/memory.rs index 329a6be6737..f29753c1f84 100644 --- a/crates/vm/src/builtins/memory.rs +++ b/crates/vm/src/builtins/memory.rs @@ -1,6 +1,6 @@ use super::{ PositionIterInternal, PyBytes, PyBytesRef, PyGenericAlias, PyInt, PyListRef, PySlice, PyStr, - PyStrRef, PyTuple, PyTupleRef, PyType, PyTypeRef, iter::builtins_iter, + PyStrRef, PyTuple, PyTupleRef, PyType, PyTypeRef, PyUtf8StrRef, iter::builtins_iter, }; use crate::common::lock::LazyLock; use crate::{ @@ -805,8 +805,9 @@ impl PyMemoryView { Err(vm.new_value_error("memoryview.index(x): x not in memoryview")) } - fn cast_to_1d(&self, format: PyStrRef, vm: &VirtualMachine) -> PyResult { - let format_spec = Self::parse_format(format.as_str(), vm)?; + fn cast_to_1d(&self, format: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { + let format_str = format.as_str(); + let format_spec = Self::parse_format(format_str, vm)?; let itemsize = format_spec.size(); if !self.desc.len.is_multiple_of(itemsize) { return Err(vm.new_type_error("memoryview: length is not a multiple of itemsize")); @@ -821,7 +822,7 @@ impl PyMemoryView { len: self.desc.len, readonly: self.desc.readonly, itemsize, - format: format.to_string().into(), + format: format_str.to_owned().into(), dim_desc: vec![(self.desc.len / itemsize, itemsize as isize, 0)], }, hash: OnceCell::new(), @@ -956,7 +957,7 @@ impl Py { #[derive(FromArgs)] struct CastArgs { #[pyarg(any)] - format: PyStrRef, + format: PyUtf8StrRef, #[pyarg(any, optional)] shape: OptionalArg>, } diff --git a/crates/vm/src/builtins/mod.rs b/crates/vm/src/builtins/mod.rs index fa7ab1b854e..099787332c9 100644 --- a/crates/vm/src/builtins/mod.rs +++ b/crates/vm/src/builtins/mod.rs @@ -63,7 +63,7 @@ pub(crate) mod bool_; pub use bool_::PyBool; #[path = "str.rs"] pub(crate) mod pystr; -pub use pystr::{PyStr, PyStrInterned, PyStrRef, PyUtf8Str, PyUtf8StrRef}; +pub use pystr::{PyStr, PyStrInterned, PyStrRef, PyUtf8Str, PyUtf8StrInterned, PyUtf8StrRef}; #[path = "super.rs"] pub(crate) mod super_; pub use super_::PySuper; diff --git a/crates/vm/src/builtins/module.rs b/crates/vm/src/builtins/module.rs index a2221fb6b9a..ad90970e7b9 100644 --- a/crates/vm/src/builtins/module.rs +++ b/crates/vm/src/builtins/module.rs @@ -160,9 +160,10 @@ impl Py { .get_item_opt(identifier!(vm, __name__), vm) .ok() .flatten(); - let mod_name_str = mod_name_obj - .as_ref() - .and_then(|n| n.downcast_ref::().map(|s| s.as_str().to_owned())); + let mod_name_str = mod_name_obj.as_ref().and_then(|n| { + n.downcast_ref::() + .map(|s| s.to_string_lossy().into_owned()) + }); // If __name__ is not set or not a string, use a simpler error message let mod_display = match mod_name_str.as_deref() { @@ -424,7 +425,7 @@ impl Initializer for PyModule { .flags .has_feature(crate::types::PyTypeFlags::HAS_DICT) ); - zelf.init_dict(vm.ctx.intern_str(args.name.as_str()), args.doc, vm); + zelf.init_dict(vm.ctx.intern_str(args.name.as_wtf8()), args.doc, vm); Ok(()) } } diff --git a/crates/vm/src/builtins/namespace.rs b/crates/vm/src/builtins/namespace.rs index 2cc1693302a..191fbd804b6 100644 --- a/crates/vm/src/builtins/namespace.rs +++ b/crates/vm/src/builtins/namespace.rs @@ -1,4 +1,4 @@ -use super::{PyTupleRef, PyType, tuple::IntoPyTuple}; +use super::{PyStr, PyTupleRef, PyType, tuple::IntoPyTuple}; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, builtins::PyDict, @@ -9,6 +9,7 @@ use crate::{ Comparable, Constructor, DefaultConstructor, Initializer, PyComparisonOp, Representable, }, }; +use rustpython_common::wtf8::Wtf8Buf; /// A simple attribute-based namespace. /// @@ -138,7 +139,7 @@ impl Comparable for PyNamespace { impl Representable for PyNamespace { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { let o = zelf.as_object(); let name = if o.class().is(vm.ctx.types.namespace_type) { "namespace".to_owned() @@ -148,16 +149,27 @@ impl Representable for PyNamespace { let repr = if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { let dict = zelf.as_object().dict().unwrap(); - let mut parts = Vec::with_capacity(dict.__len__()); + let mut result = Wtf8Buf::from(format!("{name}(")); + let mut first = true; for (key, value) in dict { - let k = key.repr(vm)?; - let key_str = k.as_wtf8(); - let value_repr = value.repr(vm)?; - parts.push(format!("{}={}", &key_str[1..key_str.len() - 1], value_repr)); + let Some(key_str) = key.downcast_ref::() else { + continue; + }; + if key_str.as_wtf8().is_empty() { + continue; + } + if !first { + result.push_str(", "); + } + first = false; + result.push_wtf8(key_str.as_wtf8()); + result.push_char('='); + result.push_wtf8(value.repr(vm)?.as_wtf8()); } - format!("{}({})", name, parts.join(", ")) + result.push_char(')'); + result } else { - format!("{name}(...)") + Wtf8Buf::from(format!("{name}(...)")) }; Ok(repr) } diff --git a/crates/vm/src/builtins/object.rs b/crates/vm/src/builtins/object.rs index e4c51061685..10b500619eb 100644 --- a/crates/vm/src/builtins/object.rs +++ b/crates/vm/src/builtins/object.rs @@ -1,4 +1,4 @@ -use super::{PyDictRef, PyList, PyStr, PyStrRef, PyType, PyTypeRef}; +use super::{PyDictRef, PyList, PyStr, PyStrRef, PyType, PyTypeRef, PyUtf8StrRef}; use crate::common::hash::PyHash; use crate::types::PyTypeFlags; use crate::{ @@ -82,9 +82,12 @@ impl Constructor for PyBaseObject { if let Some(abs_methods) = cls.get_attr(identifier!(vm, __abstractmethods__)) && let Some(unimplemented_abstract_method_count) = abs_methods.length_opt(vm) { - let methods: Vec = abs_methods.try_to_value(vm)?; - let methods: String = - Itertools::intersperse(methods.iter().map(|name| name.as_str()), "', '").collect(); + let methods: Vec = abs_methods.try_to_value(vm)?; + let methods: String = Itertools::intersperse( + methods.iter().map(|name| name.as_str().to_owned()), + "', '".to_owned(), + ) + .collect(); let unimplemented_abstract_method_count = unimplemented_abstract_method_count?; let name = cls.name().to_string(); @@ -218,7 +221,7 @@ fn object_getstate_default(obj: &PyObject, required: bool, vm: &VirtualMachine) let has_weakref = if let Some(ref ext) = obj.class().heaptype_ext { match &ext.slots { None => true, // Heap type without __slots__ has automatic weakref - Some(slots) => slots.iter().any(|s| s.as_str() == "__weakref__"), + Some(slots) => slots.iter().any(|s| s.as_bytes() == b"__weakref__"), } } else { let weakref_name = vm.ctx.intern_str("__weakref__"); @@ -255,7 +258,7 @@ fn object_getstate_default(obj: &PyObject, required: bool, vm: &VirtualMachine) let Ok(value) = obj.get_attr(name, vm) else { continue; }; - slots.set_item(name.as_str(), value, vm).unwrap(); + slots.set_item(name.as_wtf8(), value, vm).unwrap(); } if !slots.is_empty() { @@ -385,11 +388,11 @@ impl PyBaseObject { class .__qualname__(vm) .downcast_ref::() - .map(|n| n.as_str()), + .map(|n| n.as_wtf8()), class .__module__(vm) .downcast_ref::() - .map(|m| m.as_str()), + .map(|m| m.as_wtf8()), ) { (None, _) => Err(vm.new_type_error("Unknown qualified name")), (Some(qualname), Some(module)) if module != "builtins" => Ok(PyStr::from(format!( @@ -473,7 +476,7 @@ impl PyBaseObject { a.len() == b.len() && a.iter() .zip(b.iter()) - .all(|(x, y)| x.as_str() == y.as_str()) + .all(|(x, y)| x.as_wtf8() == y.as_wtf8()) } (None, None) => true, _ => false, diff --git a/crates/vm/src/builtins/range.rs b/crates/vm/src/builtins/range.rs index 92de2463e2c..e42cf5f23f6 100644 --- a/crates/vm/src/builtins/range.rs +++ b/crates/vm/src/builtins/range.rs @@ -52,8 +52,9 @@ fn iter_search( SearchType::Index => Err(vm.new_value_error(format!( "{} not in range", item.repr(vm) - .map(|v| v.as_str().to_owned()) - .unwrap_or_else(|_| "value".to_owned()) + .as_ref() + .map_or("value".as_ref(), |s| s.as_wtf8()) + .to_owned() ))), } } diff --git a/crates/vm/src/builtins/set.rs b/crates/vm/src/builtins/set.rs index 17f406bbcbe..7e2c10ac8f0 100644 --- a/crates/vm/src/builtins/set.rs +++ b/crates/vm/src/builtins/set.rs @@ -10,7 +10,7 @@ use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, atomic_func, class::PyClassImpl, - common::{ascii, hash::PyHash, lock::PyMutex, rc::PyRc}, + common::{ascii, hash::PyHash, lock::PyMutex, rc::PyRc, wtf8::Wtf8Buf}, convert::ToPyResult, dict_inner::{self, DictSize}, function::{ArgIterable, FuncArgs, OptionalArg, PosArgs, PyArithmeticValue, PyComparisonValue}, @@ -318,7 +318,7 @@ impl PySetInner { } } - fn repr(&self, class_name: Option<&str>, vm: &VirtualMachine) -> PyResult { + fn repr(&self, class_name: Option<&str>, vm: &VirtualMachine) -> PyResult { collection_repr(class_name, "{", "}", self.elements().iter(), vm) } @@ -942,23 +942,23 @@ impl AsNumber for PySet { impl Representable for PySet { #[inline] - fn repr_str(zelf: &crate::Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &crate::Py, vm: &VirtualMachine) -> PyResult { let class = zelf.class(); let borrowed_name = class.name(); let class_name = borrowed_name.deref(); - let s = if zelf.inner.len() == 0 { - format!("{class_name}()") - } else if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { + if zelf.inner.len() == 0 { + return Ok(Wtf8Buf::from(format!("{class_name}()"))); + } + if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { let name = if class_name != "set" { Some(class_name) } else { None }; - zelf.inner.repr(name, vm)? + zelf.inner.repr(name, vm) } else { - format!("{class_name}(...)") - }; - Ok(s) + Ok(Wtf8Buf::from(format!("{class_name}(...)"))) + } } } @@ -1289,18 +1289,18 @@ impl AsNumber for PyFrozenSet { impl Representable for PyFrozenSet { #[inline] - fn repr_str(zelf: &crate::Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &crate::Py, vm: &VirtualMachine) -> PyResult { let inner = &zelf.inner; let class = zelf.class(); let class_name = class.name(); - let s = if inner.len() == 0 { - format!("{class_name}()") - } else if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { - inner.repr(Some(&class_name), vm)? + if inner.len() == 0 { + return Ok(Wtf8Buf::from(format!("{class_name}()"))); + } + if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { + inner.repr(Some(&class_name), vm) } else { - format!("{class_name}(...)") - }; - Ok(s) + Ok(Wtf8Buf::from(format!("{class_name}(...)"))) + } } } diff --git a/crates/vm/src/builtins/slice.rs b/crates/vm/src/builtins/slice.rs index 3aa23b0746c..24c87aa1c11 100644 --- a/crates/vm/src/builtins/slice.rs +++ b/crates/vm/src/builtins/slice.rs @@ -1,5 +1,7 @@ // sliceobject.{h,c} in CPython // spell-checker:ignore sliceobject +use rustpython_common::wtf8::{Wtf8Buf, wtf8_concat}; + use super::{PyGenericAlias, PyStrRef, PyTupleRef, PyType, PyTypeRef}; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, @@ -291,12 +293,20 @@ impl Comparable for PySlice { impl Representable for PySlice { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { let start_repr = zelf.start_ref(vm).repr(vm)?; let stop_repr = zelf.stop.repr(vm)?; let step_repr = zelf.step_ref(vm).repr(vm)?; - Ok(format!("slice({start_repr}, {stop_repr}, {step_repr})")) + Ok(wtf8_concat!( + "slice(", + start_repr.as_wtf8(), + ", ", + stop_repr.as_wtf8(), + ", ", + step_repr.as_wtf8(), + ")" + )) } } diff --git a/crates/vm/src/builtins/staticmethod.rs b/crates/vm/src/builtins/staticmethod.rs index ac363415a9f..fe0cc0a14e2 100644 --- a/crates/vm/src/builtins/staticmethod.rs +++ b/crates/vm/src/builtins/staticmethod.rs @@ -182,11 +182,11 @@ impl Representable for PyStaticMethod { class .__qualname__(vm) .downcast_ref::() - .map(|n| n.as_str()), + .map(|n| n.as_wtf8()), class .__module__(vm) .downcast_ref::() - .map(|m| m.as_str()), + .map(|m| m.as_wtf8()), ) { (None, _) => Err(vm.new_type_error("Unknown qualified name")), (Some(qualname), Some(module)) if module != "builtins" => { diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index a1f0de1810f..576815e3b4f 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -41,7 +41,7 @@ use rustpython_common::{ hash, lock::PyMutex, str::DeduceStrKind, - wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk}, + wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, }; use unic_ucd_bidi::BidiClass; use unic_ucd_category::GeneralCategory; @@ -50,13 +50,13 @@ use unicode_casing::CharExt; impl<'a> TryFromBorrowedObject<'a> for String { fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult { - obj.try_value_with(|pystr: &PyStr| Ok(pystr.as_str().to_owned()), vm) + obj.try_value_with(|pystr: &PyUtf8Str| Ok(pystr.as_str().to_owned()), vm) } } impl<'a> TryFromBorrowedObject<'a> for &'a str { fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult { - let pystr: &Py = TryFromBorrowedObject::try_from_borrowed_object(vm, obj)?; + let pystr: &Py = TryFromBorrowedObject::try_from_borrowed_object(vm, obj)?; Ok(pystr.as_str()) } } @@ -90,21 +90,21 @@ impl fmt::Debug for PyStr { impl AsRef for PyStr { #[track_caller] // <- can remove this once it doesn't panic fn as_ref(&self) -> &str { - self.as_str() + self.to_str().expect("str has surrogates") } } impl AsRef for Py { #[track_caller] // <- can remove this once it doesn't panic fn as_ref(&self) -> &str { - self.as_str() + self.to_str().expect("str has surrogates") } } impl AsRef for PyStrRef { #[track_caller] // <- can remove this once it doesn't panic fn as_ref(&self) -> &str { - self.as_str() + self.to_str().expect("str has surrogates") } } @@ -126,6 +126,20 @@ impl AsRef for PyStrRef { } } +impl Wtf8Concat for PyStr { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + buf.push_wtf8(self.as_wtf8()); + } +} + +impl Wtf8Concat for Py { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + buf.push_wtf8(self.as_wtf8()); + } +} + impl<'a> From<&'a AsciiStr> for PyStr { fn from(s: &'a AsciiStr) -> Self { s.to_owned().into() @@ -249,6 +263,13 @@ impl<'a> AsPyStr<'a> for &'a Py { } } +impl<'a> AsPyStr<'a> for &'a Py { + #[inline] + fn as_pystr(self, _ctx: &Context) -> &'a Py { + Py::::as_pystr(self) + } +} + impl<'a> AsPyStr<'a> for &'a PyStrRef { #[inline] fn as_pystr(self, _ctx: &Context) -> &'a Py { @@ -256,6 +277,13 @@ impl<'a> AsPyStr<'a> for &'a PyStrRef { } } +impl<'a> AsPyStr<'a> for &'a PyUtf8StrRef { + #[inline] + fn as_pystr(self, _ctx: &Context) -> &'a Py { + Py::::as_pystr(self) + } +} + impl AsPyStr<'static> for &'static str { #[inline] fn as_pystr(self, ctx: &Context) -> &'static Py { @@ -270,6 +298,13 @@ impl<'a> AsPyStr<'a> for &'a PyStrInterned { } } +impl<'a> AsPyStr<'a> for &'a PyUtf8StrInterned { + #[inline] + fn as_pystr(self, _ctx: &Context) -> &'a Py { + Py::::as_pystr(self) + } +} + #[pyclass(module = false, name = "str_iterator", traverse = "manual")] #[derive(Debug)] pub struct PyStrIterator { @@ -350,9 +385,9 @@ pub struct StrArgs { #[pyarg(any, optional)] object: OptionalArg, #[pyarg(any, optional)] - encoding: OptionalArg, + encoding: OptionalArg, #[pyarg(any, optional)] - errors: OptionalArg, + errors: OptionalArg, } impl Constructor for PyStr { @@ -440,17 +475,20 @@ impl PyStr { self.data.as_wtf8().as_bytes() } - // FIXME: make this return an Option - #[inline] - #[track_caller] // <- can remove this once it doesn't panic - pub fn as_str(&self) -> &str { - self.data.as_str().expect("str has surrogates") - } - pub fn to_str(&self) -> Option<&str> { self.data.as_str() } + /// Returns `&str` + /// + /// # Panic + /// If the string contains surrogates. + #[inline] + #[track_caller] + pub fn expect_str(&self) -> &str { + self.to_str().expect("PyStr contains surrogates") + } + pub(crate) fn ensure_valid_utf8(&self, vm: &VirtualMachine) -> PyResult<()> { if self.is_utf8() { Ok(()) @@ -660,14 +698,7 @@ impl PyStr { match self.as_str_kind() { PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(), PyKindStr::Utf8(s) => s.to_lowercase().into(), - PyKindStr::Wtf8(w) => w - .chunks() - .map(|c| match c { - Wtf8Chunk::Utf8(s) => s.to_lowercase().into(), - Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c), - }) - .collect::() - .into(), + PyKindStr::Wtf8(w) => w.to_lowercase().into(), } } @@ -693,14 +724,7 @@ impl PyStr { match self.as_str_kind() { PyKindStr::Ascii(s) => s.to_ascii_uppercase().into(), PyKindStr::Utf8(s) => s.to_uppercase().into(), - PyKindStr::Wtf8(w) => w - .chunks() - .map(|c| match c { - Wtf8Chunk::Utf8(s) => s.to_uppercase().into(), - Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c), - }) - .collect::() - .into(), + PyKindStr::Wtf8(w) => w.to_uppercase().into(), } } @@ -732,12 +756,7 @@ impl PyStr { Some(ch) => out.extend(ch.to_titlecase()), None => out.push(ch), } - for chunk in chars.as_wtf8().chunks() { - match chunk { - Wtf8Chunk::Utf8(s) => out.push_str(&s.to_lowercase()), - Wtf8Chunk::Surrogate(ch) => out.push(ch), - } - } + out.push_wtf8(&chars.as_wtf8().to_lowercase()); } out } @@ -972,10 +991,9 @@ impl PyStr { #[pymethod] fn __format__( zelf: PyRef, - spec: PyStrRef, + spec: PyUtf8StrRef, vm: &VirtualMachine, ) -> PyResult> { - let spec = spec.as_str(); if spec.is_empty() { return if zelf.class().is(vm.ctx.types.str_type) { Ok(zelf) @@ -984,7 +1002,7 @@ impl PyStr { }; } let zelf = zelf.try_into_utf8(vm)?; - let s = FormatSpec::parse(spec) + let s = FormatSpec::parse(spec.as_str()) .and_then(|format_spec| { format_spec.format_string(&CharLenStr(zelf.as_str(), zelf.char_len())) }) @@ -1345,33 +1363,34 @@ impl PyStr { // https://docs.python.org/3/library/stdtypes.html#str.translate #[pymethod] - fn translate(&self, table: PyObjectRef, vm: &VirtualMachine) -> PyResult { + fn translate(&self, table: PyObjectRef, vm: &VirtualMachine) -> PyResult { vm.get_method_or_type_error(table.clone(), identifier!(vm, __getitem__), || { format!("'{}' object is not subscriptable", table.class().name()) })?; - let mut translated = String::new(); - for c in self.as_str().chars() { - match table.get_item(&*(c as u32).to_pyobject(vm), vm) { + let mut translated = Wtf8Buf::new(); + for cp in self.as_wtf8().code_points() { + match table.get_item(&*cp.to_u32().to_pyobject(vm), vm) { Ok(value) => { if let Some(text) = value.downcast_ref::() { - translated.push_str(text.as_str()); + translated.push_wtf8(text.as_wtf8()); } else if let Some(bigint) = value.downcast_ref::() { - let ch = bigint + let mapped = bigint .as_bigint() .to_u32() - .and_then(core::char::from_u32) + .and_then(CodePoint::from_u32) .ok_or_else(|| { vm.new_value_error("character mapping must be in range(0x110000)") })?; - translated.push(ch); + translated.push(mapped); } else if !vm.is_none(&value) { return Err( vm.new_type_error("character mapping must return integer, None or str") ); } } - _ => translated.push(c), + Err(e) if e.fast_isinstance(vm.ctx.exceptions.key_error) => translated.push(cp), + Err(e) => return Err(e), } } Ok(translated) @@ -1389,16 +1408,20 @@ impl PyStr { match dict_or_str.downcast::() { Ok(from_str) => { if to_str.len() == from_str.len() { - for (c1, c2) in from_str.as_str().chars().zip(to_str.as_str().chars()) { + for (c1, c2) in from_str + .as_wtf8() + .code_points() + .zip(to_str.as_wtf8().code_points()) + { new_dict.set_item( - &*vm.new_pyobj(c1 as u32), - vm.new_pyobj(c2 as u32), + &*vm.new_pyobj(c1.to_u32()), + vm.new_pyobj(c2.to_u32()), vm, )?; } if let OptionalArg::Present(none_str) = none_str { - for c in none_str.as_str().chars() { - new_dict.set_item(&*vm.new_pyobj(c as u32), vm.ctx.none(), vm)?; + for c in none_str.as_wtf8().code_points() { + new_dict.set_item(&*vm.new_pyobj(c.to_u32()), vm.ctx.none(), vm)?; } } Ok(new_dict.to_pyobject(vm)) @@ -1426,7 +1449,8 @@ impl PyStr { )?; } else if let Some(string) = key.downcast_ref::() { if string.len() == 1 { - let num_value = string.as_str().chars().next().unwrap() as u32; + let num_value = + string.as_wtf8().code_points().next().unwrap().to_u32(); new_dict.set_item(&*num_value.to_pyobject(vm), val, vm)?; } else { return Err(vm.new_value_error( @@ -1455,7 +1479,7 @@ impl PyStr { #[pymethod] fn __getnewargs__(zelf: PyRef, vm: &VirtualMachine) -> PyObjectRef { - (zelf.as_str(),).to_pyobject(vm) + (zelf.as_wtf8(),).to_pyobject(vm) } #[pymethod] @@ -1601,20 +1625,21 @@ impl AsSequence for PyStr { #[derive(FromArgs)] struct EncodeArgs { #[pyarg(any, default)] - encoding: Option, + encoding: Option, #[pyarg(any, default)] - errors: Option, + errors: Option, } pub(crate) fn encode_string( s: PyStrRef, - encoding: Option, - errors: Option, + encoding: Option, + errors: Option, vm: &VirtualMachine, ) -> PyResult { - let encoding = encoding - .as_ref() - .map_or(crate::codecs::DEFAULT_ENCODING, |s| s.as_str()); + let encoding = match encoding.as_ref() { + None => crate::codecs::DEFAULT_ENCODING, + Some(s) => s.as_str(), + }; vm.state.codec_registry.encode_text(s, encoding, errors, vm) } @@ -1868,14 +1893,16 @@ impl SliceableSequenceOp for PyStr { } impl AsRef for PyRefExact { + #[track_caller] fn as_ref(&self) -> &str { - self.as_str() + self.to_str().expect("str has surrogates") } } impl AsRef for PyExact { + #[track_caller] fn as_ref(&self) -> &str { - self.as_str() + self.to_str().expect("str has surrogates") } } @@ -2033,7 +2060,7 @@ impl AsRef for PyUtf8Str { impl AsRef for PyUtf8Str { #[inline] fn as_ref(&self) -> &str { - self.0.as_str() + self.as_str() } } @@ -2045,6 +2072,12 @@ impl PyUtf8Str { Self(PyStr::from(data)) } + /// Returns the underlying WTF-8 slice (always valid UTF-8 for this type). + #[inline] + pub fn as_wtf8(&self) -> &Wtf8 { + self.0.as_wtf8() + } + /// Returns the underlying string slice. pub fn as_str(&self) -> &str { debug_assert!( @@ -2055,6 +2088,11 @@ impl PyUtf8Str { unsafe { self.0.to_str().unwrap_unchecked() } } + #[inline] + pub fn as_bytes(&self) -> &[u8] { + self.as_str().as_bytes() + } + #[inline] pub fn byte_len(&self) -> usize { self.0.byte_len() @@ -2079,6 +2117,29 @@ impl Py { &*(self as *const Self as *const Py) } } + + /// Returns the underlying `&str`. + #[inline] + pub fn as_str(&self) -> &str { + self.as_pystr().to_str().unwrap_or_else(|| { + debug_assert!(false, "PyUtf8Str invariant violated"); + // Safety: PyUtf8Str guarantees valid UTF-8 + unsafe { core::hint::unreachable_unchecked() } + }) + } +} + +impl PyRef { + /// Convert to PyStrRef. Safe because PyUtf8Str is a subtype of PyStr. + pub fn into_wtf8(self) -> PyStrRef { + unsafe { mem::transmute::(self) } + } +} + +impl From> for PyRef { + fn from(s: PyRef) -> Self { + s.into_wtf8() + } } impl PartialEq for PyUtf8Str { @@ -2454,10 +2515,54 @@ impl AsRef for PyStrInterned { } } +/// Interned PyUtf8Str — guaranteed UTF-8 at type level. +/// Same layout as `PyStrInterned` due to `#[repr(transparent)]` on both +/// `PyInterned` and `PyUtf8Str`. +pub type PyUtf8StrInterned = PyInterned; + +impl PyUtf8StrInterned { + /// Returns the underlying `&str`. + #[inline] + pub fn as_str(&self) -> &str { + Py::::as_str(self) + } + + /// View as `PyStrInterned` (widening: UTF-8 → WTF-8). + #[inline] + pub fn as_interned_str(&self) -> &PyStrInterned { + // Safety: PyUtf8Str is #[repr(transparent)] over PyStr, + // so PyInterned has the same layout as PyInterned. + unsafe { &*(self as *const Self as *const PyStrInterned) } + } + + /// Narrow a `PyStrInterned` to `PyUtf8StrInterned`. + /// + /// # Safety + /// The caller must ensure that the interned string is valid UTF-8. + #[inline] + pub unsafe fn from_str_interned_unchecked(s: &PyStrInterned) -> &Self { + unsafe { &*(s as *const PyStrInterned as *const Self) } + } +} + +impl core::fmt::Display for PyUtf8StrInterned { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.write_str(self.as_str()) + } +} + +impl AsRef for PyUtf8StrInterned { + #[inline(always)] + fn as_ref(&self) -> &str { + self.as_str() + } +} + #[cfg(test)] mod tests { use super::*; use crate::Interpreter; + use rustpython_common::wtf8::Wtf8Buf; #[test] fn str_title() { @@ -2525,7 +2630,7 @@ mod tests { .unwrap(); let text = PyStr::from("abc"); let translated = text.translate(translated, vm).unwrap(); - assert_eq!(translated, "🎅xda".to_owned()); + assert_eq!(translated, Wtf8Buf::from("🎅xda")); let translated = text.translate(vm.ctx.new_int(3).into(), vm); assert_eq!("TypeError", &*translated.unwrap_err().class().name(),); }) diff --git a/crates/vm/src/builtins/template.rs b/crates/vm/src/builtins/template.rs index 0496504cbef..e9dd1e7adfb 100644 --- a/crates/vm/src/builtins/template.rs +++ b/crates/vm/src/builtins/template.rs @@ -1,4 +1,7 @@ -use super::{PyStr, PyTupleRef, PyType, PyTypeRef, genericalias::PyGenericAlias}; +use super::{ + PyStr, PyTupleRef, PyType, PyTypeRef, genericalias::PyGenericAlias, + interpolation::PyInterpolation, +}; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, atomic_func, @@ -11,8 +14,7 @@ use crate::{ SelfIter, }, }; - -use super::interpolation::PyInterpolation; +use rustpython_common::wtf8::{Wtf8Buf, wtf8_concat}; /// Template object for t-strings (PEP 750). /// @@ -58,8 +60,9 @@ impl Constructor for PyTemplate { // Concatenate adjacent strings if let Some(last) = strings.last_mut() { let last_str = last.downcast_ref::().unwrap(); - let concatenated = format!("{}{}", last_str.as_str(), s.as_str()); - *last = vm.ctx.new_str(concatenated).into(); + let mut buf = last_str.as_wtf8().to_owned(); + buf.push_wtf8(s.as_wtf8()); + *last = vm.ctx.new_str(buf).into(); } } else { strings.push(s.into()); @@ -143,18 +146,22 @@ impl PyTemplate { } // Concatenate last string of self with first string of other - let last_self = self + let mut buf = Wtf8Buf::new(); + if let Some(s) = self .strings .get(self_strings_len.saturating_sub(1)) - .and_then(|s| s.downcast_ref::().map(|s| s.as_str().to_owned())) - .unwrap_or_default(); - let first_other = other + .and_then(|s| s.downcast_ref::()) + { + buf.push_wtf8(s.as_wtf8()); + } + if let Some(s) = other .strings .first() - .and_then(|s| s.downcast_ref::().map(|s| s.as_str().to_owned())) - .unwrap_or_default(); - let concatenated = format!("{}{}", last_self, first_other); - new_strings.push(vm.ctx.new_str(concatenated).into()); + .and_then(|s| s.downcast_ref::()) + { + buf.push_wtf8(s.as_wtf8()); + } + new_strings.push(vm.ctx.new_str(buf).into()); // Add remaining strings from other (skip first) for i in 1..other.strings.len() { @@ -243,31 +250,16 @@ impl Iterable for PyTemplate { impl Representable for PyTemplate { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let mut parts = Vec::new(); - - let strings_len = zelf.strings.len(); - let interps_len = zelf.interpolations.len(); - - for i in 0..strings_len.max(interps_len * 2 + 1) { - if i % 2 == 0 { - // String position - let idx = i / 2; - if idx < strings_len { - let s = zelf.strings.get(idx).unwrap(); - parts.push(s.repr(vm)?.as_str().to_owned()); - } - } else { - // Interpolation position - let idx = i / 2; - if idx < interps_len { - let interp = zelf.interpolations.get(idx).unwrap(); - parts.push(interp.repr(vm)?.as_str().to_owned()); - } - } - } - - Ok(format!("Template({})", parts.join(", "))) + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let strings_repr = zelf.strings.as_object().repr(vm)?; + let interp_repr = zelf.interpolations.as_object().repr(vm)?; + Ok(wtf8_concat!( + "Template(strings=", + strings_repr.as_wtf8(), + ", interpolations=", + interp_repr.as_wtf8(), + ')', + )) } } @@ -317,7 +309,7 @@ impl IterNext for PyTemplateIter { // Skip empty strings if let Some(s) = item.downcast_ref::() - && s.as_str().is_empty() + && s.as_wtf8().is_empty() { continue; } diff --git a/crates/vm/src/builtins/tuple.rs b/crates/vm/src/builtins/tuple.rs index 67c51127cb4..e0f88f08d22 100644 --- a/crates/vm/src/builtins/tuple.rs +++ b/crates/vm/src/builtins/tuple.rs @@ -5,6 +5,7 @@ use crate::common::lock::LazyLock; use crate::common::{ hash::{PyHash, PyUHash}, lock::PyMutex, + wtf8::wtf8_concat, }; use crate::object::{Traverse, TraverseFn}; use crate::{ @@ -481,7 +482,7 @@ impl Representable for PyTuple { vm.ctx.intern_str("()").to_owned() } else if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { let s = if zelf.len() == 1 { - format!("({},)", zelf.elements[0].repr(vm)?) + wtf8_concat!("(", zelf.elements[0].repr(vm)?.as_wtf8(), ",)") } else { collection_repr(None, "(", ")", zelf.elements.iter(), vm)? }; diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs index 9fcc27fae11..86865e9e083 100644 --- a/crates/vm/src/builtins/type.rs +++ b/crates/vm/src/builtins/type.rs @@ -1,6 +1,6 @@ use super::{ PyClassMethod, PyDictRef, PyList, PyStaticMethod, PyStr, PyStrInterned, PyStrRef, PyTupleRef, - PyWeak, mappingproxy::PyMappingProxy, object, union_, + PyUtf8StrRef, PyWeak, mappingproxy::PyMappingProxy, object, union_, }; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, @@ -32,6 +32,7 @@ use core::{any::Any, borrow::Borrow, ops::Deref, pin::Pin, ptr::NonNull}; use indexmap::{IndexMap, map::Entry}; use itertools::Itertools; use num_traits::ToPrimitive; +use rustpython_common::wtf8::Wtf8; use std::collections::HashSet; #[pyclass(module = false, name = "type", traverse = "manual")] @@ -88,7 +89,7 @@ unsafe impl crate::object::Traverse for PyType { // PyHeapTypeObject in CPython pub struct HeapTypeExt { - pub name: PyRwLock, + pub name: PyRwLock, pub qualname: PyRwLock, pub slots: Option>>, pub type_data: PyRwLock>, @@ -215,9 +216,10 @@ impl PyType { // Set HEAPTYPE flag for heap-allocated types slots.flags |= PyTypeFlags::HEAPTYPE; - let name = ctx.new_str(name); + let name_utf8 = ctx.new_utf8_str(name); + let name = name_utf8.clone().into_wtf8(); let heaptype_ext = HeapTypeExt { - name: PyRwLock::new(name.clone()), + name: PyRwLock::new(name_utf8), qualname: PyRwLock::new(name), slots: None, type_data: PyRwLock::new(None), @@ -619,14 +621,24 @@ impl PyType { pub fn slot_name(&self) -> BorrowedValue<'_, str> { self.name_inner( |name| name.into(), - |ext| PyRwLockReadGuard::map(ext.name.read(), |name| name.as_str()).into(), + |ext| { + PyRwLockReadGuard::map(ext.name.read(), |name: &PyUtf8StrRef| -> &str { + name.as_str() + }) + .into() + }, ) } pub fn name(&self) -> BorrowedValue<'_, str> { self.name_inner( |name| name.rsplit_once('.').map_or(name, |(_, name)| name).into(), - |ext| PyRwLockReadGuard::map(ext.name.read(), |name| name.as_str()).into(), + |ext| { + PyRwLockReadGuard::map(ext.name.read(), |name: &PyUtf8StrRef| -> &str { + name.as_str() + }) + .into() + }, ) } @@ -837,7 +849,7 @@ impl PyType { }) .to_owned() }, - |ext| ext.name.read().clone(), + |ext| ext.name.read().clone().into_wtf8(), ) } @@ -1158,7 +1170,7 @@ impl PyType { if name.as_bytes().contains(&0) { return Err(vm.new_value_error("type name must not contain null characters")); } - name.ensure_valid_utf8(vm)?; + let name = name.try_into_utf8(vm)?; let heap_type = self.heaptype_ext.as_ref().ok_or_else(|| { vm.new_type_error(format!( @@ -1257,7 +1269,7 @@ impl Constructor for PyType { if name.as_bytes().contains(&0) { return Err(vm.new_value_error("type name must not contain null characters")); } - name.ensure_valid_utf8(vm)?; + let name = name.try_into_utf8(vm)?; let (metatype, base, bases, base_is_type) = if bases.is_empty() { let base = vm.ctx.types.object_type.to_owned(); @@ -1306,7 +1318,7 @@ impl Constructor for PyType { .transpose()? .unwrap_or_else(|| { // If __qualname__ is not provided, we can use the name as default - name.clone() + name.clone().into_wtf8() }); let mut attributes = dict.to_attributes(vm); @@ -1355,144 +1367,141 @@ impl Constructor for PyType { attributes.insert(identifier!(vm, __hash__), vm.ctx.none.clone().into()); } - let (heaptype_slots, add_dict): (Option>>, bool) = if let Some(x) = - attributes.get(identifier!(vm, __slots__)) - { - // Check if __slots__ is bytes - not allowed - if x.class().is(vm.ctx.types.bytes_type) { - return Err( - vm.new_type_error("__slots__ items must be strings, not 'bytes'".to_owned()) - ); - } + let (heaptype_slots, add_dict): (Option>>, bool) = + if let Some(x) = attributes.get(identifier!(vm, __slots__)) { + // Check if __slots__ is bytes - not allowed + if x.class().is(vm.ctx.types.bytes_type) { + return Err(vm.new_type_error("__slots__ items must be strings, not 'bytes'")); + } - let slots = if x.class().is(vm.ctx.types.str_type) { - let x = unsafe { x.downcast_unchecked_ref::() }; - PyTuple::new_ref_typed(vec![x.to_owned()], &vm.ctx) - } else { - let iter = x.get_iter(vm)?; - let elements = { - let mut elements = Vec::new(); - while let PyIterReturn::Return(element) = iter.next(vm)? { - // Check if any slot item is bytes - if element.class().is(vm.ctx.types.bytes_type) { - return Err(vm.new_type_error( - "__slots__ items must be strings, not 'bytes'".to_owned(), - )); + let slots = if x.class().is(vm.ctx.types.str_type) { + let x = unsafe { x.downcast_unchecked_ref::() }; + PyTuple::new_ref_typed(vec![x.to_owned()], &vm.ctx) + } else { + let iter = x.get_iter(vm)?; + let elements = { + let mut elements = Vec::new(); + while let PyIterReturn::Return(element) = iter.next(vm)? { + // Check if any slot item is bytes + if element.class().is(vm.ctx.types.bytes_type) { + return Err(vm.new_type_error( + "__slots__ items must be strings, not 'bytes'", + )); + } + elements.push(element); } - elements.push(element); - } - elements + elements + }; + let tuple = elements.into_pytuple(vm); + tuple.try_into_typed(vm)? }; - let tuple = elements.into_pytuple(vm); - tuple.try_into_typed(vm)? - }; - - // Check if base has itemsize > 0 - can't add arbitrary slots to variable-size types - // Types like int, bytes, tuple have itemsize > 0 and don't allow custom slots - // But types like weakref.ref have itemsize = 0 and DO allow slots - let has_custom_slots = slots - .iter() - .any(|s| s.as_str() != "__dict__" && s.as_str() != "__weakref__"); - if has_custom_slots && base.slots.itemsize > 0 { - return Err(vm.new_type_error(format!( - "nonempty __slots__ not supported for subtype of '{}'", - base.name() - ))); - } - // Validate slot names and track duplicates - let mut seen_dict = false; - let mut seen_weakref = false; - for slot in slots.iter() { - // Use isidentifier for validation (handles Unicode properly) - if !slot.isidentifier() { - return Err(vm.new_type_error("__slots__ must be identifiers".to_owned())); + // Check if base has itemsize > 0 - can't add arbitrary slots to variable-size types + // Types like int, bytes, tuple have itemsize > 0 and don't allow custom slots + // But types like weakref.ref have itemsize = 0 and DO allow slots + let has_custom_slots = slots + .iter() + .any(|s| !matches!(s.as_bytes(), b"__dict__" | b"__weakref__")); + if has_custom_slots && base.slots.itemsize > 0 { + return Err(vm.new_type_error(format!( + "nonempty __slots__ not supported for subtype of '{}'", + base.name() + ))); } - let slot_name = slot.as_str(); + // Validate slot names and track duplicates + let mut seen_dict = false; + let mut seen_weakref = false; + for slot in slots.iter() { + // Use isidentifier for validation (handles Unicode properly) + if !slot.isidentifier() { + return Err(vm.new_type_error("__slots__ must be identifiers")); + } + + let slot_name = slot.as_bytes(); - // Check for duplicate __dict__ - if slot_name == "__dict__" { - if seen_dict { - return Err(vm.new_type_error( - "__dict__ slot disallowed: we already got one".to_owned(), - )); + // Check for duplicate __dict__ + if slot_name == b"__dict__" { + if seen_dict { + return Err( + vm.new_type_error("__dict__ slot disallowed: we already got one") + ); + } + seen_dict = true; } - seen_dict = true; - } - // Check for duplicate __weakref__ - if slot_name == "__weakref__" { - if seen_weakref { - return Err(vm.new_type_error( - "__weakref__ slot disallowed: we already got one".to_owned(), - )); + // Check for duplicate __weakref__ + if slot_name == b"__weakref__" { + if seen_weakref { + return Err(vm.new_type_error( + "__weakref__ slot disallowed: we already got one", + )); + } + seen_weakref = true; } - seen_weakref = true; - } - // Check if slot name conflicts with class attributes - if attributes.contains_key(vm.ctx.intern_str(slot_name)) { - return Err(vm.new_value_error(format!( - "'{}' in __slots__ conflicts with a class variable", - slot_name - ))); + // Check if slot name conflicts with class attributes + if attributes.contains_key(vm.ctx.intern_str(slot.as_wtf8())) { + return Err(vm.new_value_error(format!( + "'{}' in __slots__ conflicts with a class variable", + slot.as_wtf8() + ))); + } } - } - // Check if base class already has __dict__ - can't redefine it - if seen_dict && base.slots.flags.has_feature(PyTypeFlags::HAS_DICT) { - return Err( - vm.new_type_error("__dict__ slot disallowed: we already got one".to_owned()) - ); - } + // Check if base class already has __dict__ - can't redefine it + if seen_dict && base.slots.flags.has_feature(PyTypeFlags::HAS_DICT) { + return Err(vm.new_type_error("__dict__ slot disallowed: we already got one")); + } - // Check if base class already has __weakref__ - can't redefine it - // A base has weakref support if: - // 1. It's a heap type without explicit __slots__ (automatic weakref), OR - // 2. It's a heap type with __weakref__ in its __slots__ - if seen_weakref { - let base_has_weakref = if let Some(ref ext) = base.heaptype_ext { - match &ext.slots { - // Heap type without __slots__ - has automatic weakref - None => true, - // Heap type with __slots__ - check if __weakref__ is in slots - Some(base_slots) => base_slots.iter().any(|s| s.as_str() == "__weakref__"), + // Check if base class already has __weakref__ - can't redefine it + // A base has weakref support if: + // 1. It's a heap type without explicit __slots__ (automatic weakref), OR + // 2. It's a heap type with __weakref__ in its __slots__ + if seen_weakref { + let base_has_weakref = if let Some(ref ext) = base.heaptype_ext { + match &ext.slots { + // Heap type without __slots__ - has automatic weakref + None => true, + // Heap type with __slots__ - check if __weakref__ is in slots + Some(base_slots) => { + base_slots.iter().any(|s| s.as_bytes() == b"__weakref__") + } + } + } else { + // Builtin type - check if it has __weakref__ descriptor + let weakref_name = vm.ctx.intern_str("__weakref__"); + base.attributes.read().contains_key(weakref_name) + }; + + if base_has_weakref { + return Err( + vm.new_type_error("__weakref__ slot disallowed: we already got one") + ); } - } else { - // Builtin type - check if it has __weakref__ descriptor - let weakref_name = vm.ctx.intern_str("__weakref__"); - base.attributes.read().contains_key(weakref_name) - }; - - if base_has_weakref { - return Err(vm.new_type_error( - "__weakref__ slot disallowed: we already got one".to_owned(), - )); } - } - // Check if __dict__ is in slots - let dict_name = "__dict__"; - let has_dict = slots.iter().any(|s| s.as_str() == dict_name); + // Check if __dict__ is in slots + let dict_name = "__dict__"; + let has_dict = slots.iter().any(|s| s.as_wtf8() == dict_name); + + // Filter out __dict__ from slots + let filtered_slots = if has_dict { + let filtered: Vec = slots + .iter() + .filter(|s| s.as_wtf8() != dict_name) + .cloned() + .collect(); + PyTuple::new_ref_typed(filtered, &vm.ctx) + } else { + slots + }; - // Filter out __dict__ from slots - let filtered_slots = if has_dict { - let filtered: Vec = slots - .iter() - .filter(|s| s.as_str() != dict_name) - .cloned() - .collect(); - PyTuple::new_ref_typed(filtered, &vm.ctx) + (Some(filtered_slots), has_dict) } else { - slots + (None, false) }; - (Some(filtered_slots), has_dict) - } else { - (None, false) - }; - // FIXME: this is a temporary fix. multi bases with multiple slots will break object let base_member_count = bases .iter() @@ -1548,7 +1557,10 @@ impl Constructor for PyType { let class_name = typ.name().to_string(); for member in slots.as_slice() { // Apply name mangling for private attributes (__x -> _ClassName__x) - let mangled_name = mangle_name(&class_name, member.as_str()); + let member_str = member.to_str().ok_or_else(|| { + vm.new_type_error("__slots__ must be valid UTF-8 strings".to_owned()) + })?; + let mangled_name = mangle_name(&class_name, member_str); let member_def = PyMemberDef { name: mangled_name.clone(), kind: MemberKind::ObjectEx, @@ -1720,7 +1732,7 @@ impl GetAttr for PyType { #[cold] fn attribute_error( zelf: &Py, - name: &str, + name: &Wtf8, vm: &VirtualMachine, ) -> PyBaseExceptionRef { vm.new_attribute_error(format!( @@ -1731,7 +1743,7 @@ impl GetAttr for PyType { } let Some(name) = vm.ctx.interned_str(name_str) else { - return Err(attribute_error(zelf, name_str.as_str(), vm)); + return Err(attribute_error(zelf, name_str.as_wtf8(), vm)); }; vm_trace!("type.__getattribute__({:?}, {:?})", zelf, name); let mcl = zelf.class(); @@ -1761,7 +1773,7 @@ impl GetAttr for PyType { } else if let Some(attr) = mcl_attr { vm.call_if_get_descriptor(&attr, zelf.to_owned().into()) } else { - Err(attribute_error(zelf, name_str.as_str(), vm)) + Err(attribute_error(zelf, name_str.as_wtf8(), vm)) } } } @@ -1863,8 +1875,7 @@ impl SetAttr for PyType { value: PySetterValue, vm: &VirtualMachine, ) -> PyResult<()> { - // TODO: pass PyRefExact instead of &str - let attr_name = vm.ctx.intern_str(attr_name.as_str()); + let attr_name = vm.ctx.intern_str(attr_name.as_wtf8()); if zelf.slots.flags.has_feature(PyTypeFlags::IMMUTABLETYPE) { return Err(vm.new_type_error(format!( "cannot set '{}' attribute of immutable type '{}'", @@ -1888,11 +1899,11 @@ impl SetAttr for PyType { return Err(vm.new_attribute_error(format!( "type object '{}' has no attribute '{}'", zelf.name(), - attr_name.as_str(), + attr_name, ))); } } - if attr_name.as_str().starts_with("__") && attr_name.as_str().ends_with("__") { + if attr_name.as_wtf8().starts_with("__") && attr_name.as_wtf8().ends_with("__") { if assign { zelf.update_slot::(attr_name, &vm.ctx); } else { @@ -1949,19 +1960,15 @@ impl Representable for PyType { #[inline] fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { let module = zelf.__module__(vm); - let module = module.downcast_ref::().map(|m| m.as_str()); + let module = module.downcast_ref::().map(|m| m.as_wtf8()); let repr = match module { Some(module) if module != "builtins" => { + let qualname = zelf.__qualname__(vm); + let qualname = qualname.downcast_ref::().map(|n| n.as_wtf8()); let name = zelf.name(); - format!( - "", - module, - zelf.__qualname__(vm) - .downcast_ref::() - .map(|n| n.as_str()) - .unwrap_or_else(|| &name) - ) + let qualname = qualname.unwrap_or_else(|| name.as_ref()); + format!("") } _ => format!("", zelf.slot_name()), }; diff --git a/crates/vm/src/builtins/weakproxy.rs b/crates/vm/src/builtins/weakproxy.rs index 437e0dc886e..a03eb1fe7e7 100644 --- a/crates/vm/src/builtins/weakproxy.rs +++ b/crates/vm/src/builtins/weakproxy.rs @@ -142,7 +142,7 @@ impl IterNext for PyWeakProxy { fn new_reference_error(vm: &VirtualMachine) -> PyRef { vm.new_exception_msg( vm.ctx.exceptions.reference_error.to_owned(), - "weakly-referenced object no longer exists".to_owned(), + "weakly-referenced object no longer exists".into(), ) } diff --git a/crates/vm/src/bytes_inner.rs b/crates/vm/src/bytes_inner.rs index 7e1c1c2220c..d8e4a6c8eff 100644 --- a/crates/vm/src/bytes_inner.rs +++ b/crates/vm/src/bytes_inner.rs @@ -4,7 +4,7 @@ use crate::{ anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper}, builtins::{ PyBaseExceptionRef, PyByteArray, PyBytes, PyBytesRef, PyInt, PyIntRef, PyStr, PyStrRef, - pystr, + pystr, pystr::PyUtf8StrRef, }, byte::bytes_from_object, cformat::cformat_bytes, @@ -45,16 +45,16 @@ pub struct ByteInnerNewOptions { #[pyarg(any, optional)] pub source: OptionalArg, #[pyarg(any, optional)] - pub encoding: OptionalArg, + pub encoding: OptionalArg, #[pyarg(any, optional)] - pub errors: OptionalArg, + pub errors: OptionalArg, } impl ByteInnerNewOptions { fn get_value_from_string( s: PyStrRef, - encoding: PyStrRef, - errors: OptionalArg, + encoding: PyUtf8StrRef, + errors: OptionalArg, vm: &VirtualMachine, ) -> PyResult { let bytes = pystr::encode_string(s, Some(encoding), errors.into_option(), vm)?; @@ -1114,9 +1114,9 @@ impl AnyStr for [u8] { #[derive(FromArgs)] pub struct DecodeArgs { #[pyarg(any, default)] - encoding: Option, + encoding: Option, #[pyarg(any, default)] - errors: Option, + errors: Option, } pub fn bytes_decode( @@ -1125,9 +1125,10 @@ pub fn bytes_decode( vm: &VirtualMachine, ) -> PyResult { let DecodeArgs { encoding, errors } = args; - let encoding = encoding - .as_ref() - .map_or(crate::codecs::DEFAULT_ENCODING, |s| s.as_str()); + let encoding = match encoding.as_ref() { + None => crate::codecs::DEFAULT_ENCODING, + Some(e) => e.as_str(), + }; vm.state .codec_registry .decode_text(zelf, encoding, errors, vm) @@ -1205,7 +1206,7 @@ pub fn bytes_to_hex( let b_guard; let sep = match &sep { Either::A(s) => { - s_guard = s.as_str(); + s_guard = s.as_wtf8(); s_guard.as_bytes() } Either::B(bytes) => { diff --git a/crates/vm/src/codecs.rs b/crates/vm/src/codecs.rs index a7eea0c9372..94b4b67e33c 100644 --- a/crates/vm/src/codecs.rs +++ b/crates/vm/src/codecs.rs @@ -10,9 +10,12 @@ use rustpython_common::{ use crate::common::lock::OnceCell; use crate::{ - AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyResult, TryFromBorrowedObject, - TryFromObject, VirtualMachine, - builtins::{PyBaseExceptionRef, PyBytes, PyBytesRef, PyStr, PyStrRef, PyTuple, PyTupleRef}, + AsObject, Context, Py, PyObject, PyObjectRef, PyResult, TryFromBorrowedObject, TryFromObject, + VirtualMachine, + builtins::{ + PyBaseExceptionRef, PyBytes, PyBytesRef, PyStr, PyStrRef, PyTuple, PyTupleRef, PyUtf8Str, + PyUtf8StrRef, + }, common::{ascii, lock::PyRwLock}, convert::ToPyObject, function::{ArgBytesLike, PyMethodDef}, @@ -71,11 +74,11 @@ impl PyCodec { pub fn encode( &self, obj: PyObjectRef, - errors: Option, + errors: Option, vm: &VirtualMachine, ) -> PyResult { let args = match errors { - Some(errors) => vec![obj, errors.into()], + Some(errors) => vec![obj, errors.into_wtf8().into()], None => vec![obj], }; let res = self.get_encode_func().call(args, vm)?; @@ -91,11 +94,11 @@ impl PyCodec { pub fn decode( &self, obj: PyObjectRef, - errors: Option, + errors: Option, vm: &VirtualMachine, ) -> PyResult { let args = match errors { - Some(errors) => vec![obj, errors.into()], + Some(errors) => vec![obj, errors.into_wtf8().into()], None => vec![obj], }; let res = self.get_decode_func().call(args, vm)?; @@ -263,7 +266,7 @@ impl CodecsRegistry { } inner.search_path.clone() }; - let encoding = PyStr::from(encoding.into_owned()).into_ref(&vm.ctx); + let encoding: PyUtf8StrRef = vm.ctx.new_utf8_str(encoding.as_ref()); for func in search_path { let res = func.call((encoding.clone(),), vm)?; let res: Option = res.try_into_value(vm)?; @@ -305,7 +308,7 @@ impl CodecsRegistry { &self, obj: PyObjectRef, encoding: &str, - errors: Option, + errors: Option, vm: &VirtualMachine, ) -> PyResult { let codec = self.lookup(encoding, vm)?; @@ -318,7 +321,7 @@ impl CodecsRegistry { &self, obj: PyObjectRef, encoding: &str, - errors: Option, + errors: Option, vm: &VirtualMachine, ) -> PyResult { let codec = self.lookup(encoding, vm)?; @@ -331,7 +334,7 @@ impl CodecsRegistry { &self, obj: PyStrRef, encoding: &str, - errors: Option, + errors: Option, vm: &VirtualMachine, ) -> PyResult { let codec = self._lookup_text_encoding(encoding, "codecs.encode()", vm)?; @@ -355,7 +358,7 @@ impl CodecsRegistry { &self, obj: PyObjectRef, encoding: &str, - errors: Option, + errors: Option, vm: &VirtualMachine, ) -> PyResult { let codec = self._lookup_text_encoding(encoding, "codecs.decode()", vm)?; @@ -868,7 +871,7 @@ impl<'a> DecodeErrorHandler> for StandardError { } pub struct ErrorsHandler<'a> { - errors: &'a Py, + errors: &'a Py, resolved: OnceCell, } enum ResolvedError { @@ -878,14 +881,14 @@ enum ResolvedError { impl<'a> ErrorsHandler<'a> { #[inline] - pub fn new(errors: Option<&'a Py>, vm: &VirtualMachine) -> Self { + pub fn new(errors: Option<&'a Py>, vm: &VirtualMachine) -> Self { match errors { Some(errors) => Self { errors, resolved: OnceCell::new(), }, None => Self { - errors: identifier!(vm, strict).as_ref(), + errors: identifier_utf8!(vm, strict), resolved: OnceCell::from(ResolvedError::Standard(StandardError::Strict)), }, } @@ -895,12 +898,13 @@ impl<'a> ErrorsHandler<'a> { if let Some(val) = self.resolved.get() { return Ok(val); } - let val = if let Ok(standard) = self.errors.as_str().parse() { + let errors_str = self.errors.as_str(); + let val = if let Ok(standard) = errors_str.parse() { ResolvedError::Standard(standard) } else { vm.state .codec_registry - .lookup_error(self.errors.as_str(), vm) + .lookup_error(errors_str, vm) .map(ResolvedError::Handler)? }; let _ = self.resolved.set(val); @@ -1020,7 +1024,7 @@ where // let err = err. let range = extract_unicode_error_range(&err, vm)?; let s = PyStrRef::try_from_object(vm, err.get_attr("object", vm)?)?; - let s_encoding = PyStrRef::try_from_object(vm, err.get_attr("encoding", vm)?)?; + let s_encoding = PyUtf8StrRef::try_from_object(vm, err.get_attr("encoding", vm)?)?; let mut ctx = PyEncodeContext { vm, encoding: s_encoding.as_str(), @@ -1059,7 +1063,7 @@ where { let range = extract_unicode_error_range(&err, vm)?; let s = ArgBytesLike::try_from_object(vm, err.get_attr("object", vm)?)?; - let s_encoding = PyStrRef::try_from_object(vm, err.get_attr("encoding", vm)?)?; + let s_encoding = PyUtf8StrRef::try_from_object(vm, err.get_attr("encoding", vm)?)?; let mut ctx = PyDecodeContext { vm, encoding: s_encoding.as_str(), diff --git a/crates/vm/src/coroutine.rs b/crates/vm/src/coroutine.rs index 857d7403d5c..ac7aeba5443 100644 --- a/crates/vm/src/coroutine.rs +++ b/crates/vm/src/coroutine.rs @@ -257,7 +257,7 @@ impl Coro { format!( "<{} object {} at {:#x}>", gen_name(jen, vm), - qualname.as_str(), + qualname.as_wtf8(), id ) } diff --git a/crates/vm/src/dict_inner.rs b/crates/vm/src/dict_inner.rs index f2a379d99a5..fe5f40b8f33 100644 --- a/crates/vm/src/dict_inner.rs +++ b/crates/vm/src/dict_inner.rs @@ -5,7 +5,7 @@ use crate::{ AsObject, Py, PyExact, PyObject, PyObjectRef, PyRefExact, PyResult, VirtualMachine, - builtins::{PyBytes, PyInt, PyStr, PyStrInterned, PyStrRef}, + builtins::{PyBytes, PyInt, PyStr, PyStrInterned, PyStrRef, PyUtf8Str, PyUtf8StrRef}, convert::ToPyObject, }; use crate::{ @@ -818,6 +818,33 @@ impl DictKey for Py { } } +impl DictKey for Py { + type Owned = PyUtf8StrRef; + #[inline(always)] + fn _to_owned(&self, _vm: &VirtualMachine) -> Self::Owned { + self.to_owned() + } + + #[inline] + fn key_hash(&self, vm: &VirtualMachine) -> PyResult { + self.as_pystr().key_hash(vm) + } + + #[inline(always)] + fn key_is(&self, other: &PyObject) -> bool { + self.as_pystr().key_is(other) + } + + fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObject) -> PyResult { + self.as_pystr().key_eq(vm, other_key) + } + + #[inline(always)] + fn key_as_isize(&self, vm: &VirtualMachine) -> PyResult { + self.as_pystr().key_as_isize(vm) + } +} + impl DictKey for PyStrInterned { type Owned = PyRefExact; diff --git a/crates/vm/src/exception_group.rs b/crates/vm/src/exception_group.rs index 7ad27c078af..f6abdee0fab 100644 --- a/crates/vm/src/exception_group.rs +++ b/crates/vm/src/exception_group.rs @@ -8,6 +8,8 @@ use crate::types::{PyTypeFlags, PyTypeSlots}; use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyRef, PyResult, TryFromObject, VirtualMachine, }; +use core::fmt::Write; +use rustpython_common::wtf8::{Wtf8, Wtf8Buf}; use crate::exceptions::types::PyBaseException; @@ -183,12 +185,7 @@ pub(super) mod types { #[pymethod] fn __str__(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let message = zelf - .get_arg(0) - .map(|m| m.str(vm)) - .transpose()? - .map(|s| s.as_str().to_owned()) - .unwrap_or_default(); + let message = zelf.get_arg(0).map(|m| m.str(vm)).transpose()?; let num_excs = zelf .get_arg(1) @@ -196,10 +193,12 @@ pub(super) mod types { .unwrap_or(0); let suffix = if num_excs == 1 { "" } else { "s" }; - Ok(vm.ctx.new_str(format!( - "{} ({} sub-exception{})", - message, num_excs, suffix - ))) + let mut result = match message { + Some(s) => s.as_wtf8().to_owned(), + None => Wtf8Buf::new(), + }; + write!(result, " ({num_excs} sub-exception{suffix})").unwrap(); + Ok(vm.ctx.new_str(result)) } #[pyslot] @@ -208,31 +207,28 @@ pub(super) mod types { .downcast_ref::() .expect("exception group must be BaseException"); let class_name = zelf.class().name().to_owned(); - let message = zelf - .get_arg(0) - .map(|m| m.repr(vm)) - .transpose()? - .map(|s| s.as_str().to_owned()) - .unwrap_or_else(|| "''".to_owned()); - - // Format exceptions as list [exc1, exc2, ...] instead of tuple (exc1, exc2, ...) - // CPython displays exceptions in list format even though they're stored as tuple - let exceptions_str = if let Some(exceptions_obj) = zelf.get_arg(1) { - // Get exceptions using ArgIterable for robustness + let message = zelf.get_arg(0).map(|m| m.repr(vm)).transpose()?; + + let mut result = Wtf8Buf::new(); + write!(result, "{class_name}(").unwrap(); + let message_wtf8: &Wtf8 = message.as_ref().map_or("''".as_ref(), |s| s.as_wtf8()); + result.push_wtf8(message_wtf8); + result.push_str(", ["); + if let Some(exceptions_obj) = zelf.get_arg(1) { let iter: ArgIterable = ArgIterable::try_from_object(vm, exceptions_obj.clone())?; - let mut exc_repr_list = Vec::new(); + let mut first = true; for exc in iter.iter(vm)? { - exc_repr_list.push(exc?.repr(vm)?.as_str().to_owned()); + if !first { + result.push_str(", "); + } + first = false; + result.push_wtf8(exc?.repr(vm)?.as_wtf8()); } - format!("[{}]", exc_repr_list.join(", ")) - } else { - "[]".to_owned() - }; + } + result.push_str("])"); - Ok(vm - .ctx - .new_str(format!("{}({}, {})", class_name, message, exceptions_str))) + Ok(vm.ctx.new_str(result)) } } diff --git a/crates/vm/src/exceptions.rs b/crates/vm/src/exceptions.rs index 027dbec6964..680a914889b 100644 --- a/crates/vm/src/exceptions.rs +++ b/crates/vm/src/exceptions.rs @@ -211,8 +211,16 @@ impl VirtualMachine { if let Some(text) = maybe_text { // if text ends with \n or \r\n, remove it - let r_text = text.as_str().trim_end_matches(['\n', '\r']); - let l_text = r_text.trim_start_matches([' ', '\n', '\x0c']); // \x0c is \f + use rustpython_common::wtf8::CodePoint; + let text_wtf8 = text.as_wtf8(); + let r_text = text_wtf8.trim_end_matches(|cp: CodePoint| { + cp == CodePoint::from_char('\n') || cp == CodePoint::from_char('\r') + }); + let l_text = r_text.trim_start_matches(|cp: CodePoint| { + cp == CodePoint::from_char(' ') + || cp == CodePoint::from_char('\n') + || cp == CodePoint::from_char('\x0c') // \f + }); let spaces = (r_text.len() - l_text.len()) as isize; writeln!(output, " {l_text}")?; @@ -249,9 +257,9 @@ impl VirtualMachine { let end_colno = end_offset - 1 - spaces; if colno >= 0 { let caret_space = l_text - .chars() + .code_points() .take(colno as usize) - .map(|c| if c.is_whitespace() { c } else { ' ' }) + .map(|cp| cp.to_char().filter(|c| c.is_whitespace()).unwrap_or(' ')) .collect::(); let mut error_width = end_colno - colno; @@ -716,7 +724,7 @@ impl PyRef { for (key, value) in &dict { let key_str = key.str(vm)?; - if key_str.as_str().starts_with("__") { + if key_str.as_bytes().starts_with(b"__") { continue; } self.as_object().set_attr(&key_str, value.clone(), vm)?; @@ -1459,7 +1467,10 @@ pub(super) mod types { }; use crossbeam_utils::atomic::AtomicCell; use itertools::Itertools; - use rustpython_common::str::UnicodeEscapeCodepoint; + use rustpython_common::{ + str::UnicodeEscapeCodepoint, + wtf8::{Wtf8, Wtf8Buf, wtf8_concat}, + }; // Re-export exception group types from dedicated module pub use crate::exception_group::types::PyBaseExceptionGroup; @@ -2317,19 +2328,25 @@ pub(super) mod types { impl PySyntaxError { #[pymethod] fn __str__(zelf: &Py, vm: &VirtualMachine) -> PyResult { - fn basename(filename: &str) -> &str { - let splitted = if cfg!(windows) { - filename.rsplit(&['/', '\\']).next() + fn basename(filename: &Wtf8) -> &Wtf8 { + let bytes = filename.as_bytes(); + let pos = if cfg!(windows) { + bytes.iter().rposition(|&b| b == b'/' || b == b'\\') } else { - filename.rsplit('/').next() + bytes.iter().rposition(|&b| b == b'/') }; - splitted.unwrap_or(filename) + match pos { + // SAFETY: splitting at ASCII byte boundary preserves WTF-8 validity + Some(pos) => unsafe { Wtf8::from_bytes_unchecked(&bytes[pos + 1..]) }, + None => filename, + } } - let maybe_lineno = zelf.as_object().get_attr("lineno", vm).ok().map(|obj| { - obj.str(vm) - .unwrap_or_else(|_| vm.ctx.new_str("")) - }); + let maybe_lineno = zelf + .as_object() + .get_attr("lineno", vm) + .and_then(|obj| obj.str_utf8(vm)) + .ok(); let maybe_filename = zelf.as_object().get_attr("filename", vm).ok().map(|obj| { obj.str(vm) .unwrap_or_else(|_| vm.ctx.new_str("")) @@ -2345,17 +2362,22 @@ pub(super) mod types { } }; - let msg_with_location_info: String = match (maybe_lineno, maybe_filename) { - (Some(lineno), Some(filename)) => { - format!("{} ({}, line {})", msg, basename(filename.as_str()), lineno) - } + let msg_with_location_info: Wtf8Buf = match (maybe_lineno, maybe_filename) { + (Some(lineno), Some(filename)) => wtf8_concat!( + msg.as_wtf8(), + " (", + basename(filename.as_wtf8()), + ", line ", + lineno.as_str(), + ")" + ), (Some(lineno), None) => { - format!("{msg} (line {lineno})") + wtf8_concat!(msg.as_wtf8(), " (line ", lineno.as_str(), ")") } (None, Some(filename)) => { - format!("{} ({})", msg, basename(filename.as_str())) + wtf8_concat!(msg.as_wtf8(), " (", basename(filename.as_wtf8()), ")") } - (None, None) => msg.to_string(), + (None, None) => msg.as_wtf8().to_owned(), }; Ok(vm.ctx.new_str(msg_with_location_info)) diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 2e3905f4f14..313266eef9d 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -5,7 +5,7 @@ use crate::{ builtins::{ PyBaseException, PyBaseExceptionRef, PyCode, PyCoroutine, PyDict, PyDictRef, PyGenerator, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, PyTraceback, - PyType, + PyType, PyUtf8Str, asyncgenerator::PyAsyncGenWrappedValue, function::{PyCell, PyCellRef, PyFunction}, tuple::{PyTuple, PyTupleRef}, @@ -24,13 +24,18 @@ use crate::{ vm::{Context, PyMethod}, }; use alloc::fmt; +use bstr::ByteSlice; use core::iter::zip; use core::sync::atomic; use core::sync::atomic::AtomicPtr; use indexmap::IndexMap; use itertools::Itertools; -use rustpython_common::{boxvec::BoxVec, lock::PyMutex, wtf8::Wtf8Buf}; +use rustpython_common::{ + boxvec::BoxVec, + lock::PyMutex, + wtf8::{Wtf8, Wtf8Buf, wtf8_concat}, +}; use rustpython_compiler_core::SourceLocation; pub type FrameRef = PyRef; @@ -416,8 +421,8 @@ impl Py { pub fn is_internal_frame(&self) -> bool { let code = self.f_code(); let filename = code.co_filename(); - let filename_s = filename.as_str(); - filename_s.contains("importlib") && filename_s.contains("_bootstrap") + let filename = filename.as_bytes(); + filename.find(b"importlib").is_some() && filename.find(b"_bootstrap").is_some() } pub fn next_external_frame(&self, vm: &VirtualMachine) -> Option { @@ -795,7 +800,7 @@ impl ExecutingFrame<'_> { if let Some(&name) = self.code.cellvars.get(i) { vm.new_exception_msg( vm.ctx.exceptions.unbound_local_error.to_owned(), - format!("local variable '{name}' referenced before assignment"), + format!("local variable '{name}' referenced before assignment").into(), ) } else { let name = self.code.freevars[i - self.code.cellvars.len()]; @@ -1052,7 +1057,8 @@ impl ExecutingFrame<'_> { format!( "local variable '{}' referenced before assignment", self.code.varnames[idx] - ), + ) + .into(), )); } fastlocals[idx] = None; @@ -1157,7 +1163,7 @@ impl ExecutingFrame<'_> { return Err(vm.new_type_error(format!( "{} got multiple values for keyword argument '{}'", func_str, - key_str.as_str() + key_str.as_wtf8() ))); } let value = vm.call_method(&source, "__getitem__", (key.clone(),))?; @@ -1536,7 +1542,7 @@ impl ExecutingFrame<'_> { ) -> PyBaseExceptionRef { vm.new_exception_msg( vm.ctx.exceptions.unbound_local_error.to_owned(), - format!("local variable '{varname}' referenced before assignment",), + format!("local variable '{varname}' referenced before assignment").into(), ) } let idx = idx.get(arg) as usize; @@ -1566,7 +1572,8 @@ impl ExecutingFrame<'_> { format!( "local variable '{}' referenced before assignment", self.code.varnames[idx] - ), + ) + .into(), ) })?; self.push_value(x); @@ -1585,7 +1592,8 @@ impl ExecutingFrame<'_> { format!( "local variable '{}' referenced before assignment", self.code.varnames[idx1] - ), + ) + .into(), ) })?; let x2 = fastlocals[idx2].clone().ok_or_else(|| { @@ -1594,7 +1602,8 @@ impl ExecutingFrame<'_> { format!( "local variable '{}' referenced before assignment", self.code.varnames[idx2] - ), + ) + .into(), ) })?; drop(fastlocals); @@ -1618,7 +1627,8 @@ impl ExecutingFrame<'_> { format!( "local variable '{}' referenced before assignment", self.code.varnames[idx] - ), + ) + .into(), ) })?; self.push_value(x); @@ -1636,7 +1646,8 @@ impl ExecutingFrame<'_> { format!( "local variable '{}' referenced before assignment", self.code.varnames[idx1] - ), + ) + .into(), ) })?; let x2 = fastlocals[idx2].clone().ok_or_else(|| { @@ -1645,7 +1656,8 @@ impl ExecutingFrame<'_> { format!( "local variable '{}' referenced before assignment", self.code.varnames[idx2] - ), + ) + .into(), ) })?; drop(fastlocals); @@ -1744,8 +1756,9 @@ impl ExecutingFrame<'_> { // Get type names for error message let type_name = cls .downcast::() - .map(|t| t.__name__(vm).as_str().to_owned()) - .unwrap_or_else(|_| String::from("?")); + .ok() + .and_then(|t| t.__name__(vm).to_str().map(str::to_owned)) + .unwrap_or_else(|| String::from("?")); let match_args_type_name = match_args.class().__name__(vm); return Err(vm.new_type_error(format!( "{}.__match_args__ must be a tuple (got {})", @@ -2413,7 +2426,7 @@ impl ExecutingFrame<'_> { let mod_name_obj = module.get_attr(identifier!(vm, __name__), vm).ok(); let mod_name_str = mod_name_obj .as_ref() - .and_then(|n| n.downcast_ref::().map(|s| s.as_str().to_owned())); + .and_then(|n| n.downcast_ref::().map(|s| s.as_str().to_owned())); let module_name = mod_name_str.as_deref().unwrap_or(""); let spec = module @@ -2472,7 +2485,7 @@ impl ExecutingFrame<'_> { format!("cannot import name '{name}' from '{module_name}' (unknown location)") } }; - let err = vm.new_import_error(msg, vm.ctx.new_str(module_name)); + let err = vm.new_import_error(msg, vm.ctx.new_utf8_str(module_name)); if let Some(ref path) = origin { let _ignore = err @@ -2502,14 +2515,14 @@ impl ExecutingFrame<'_> { let require_str = |obj: PyObjectRef, attr: &str| -> PyResult> { obj.downcast().map_err(|obj: PyObjectRef| { let source = if let Some(ref mod_name) = mod_name { - format!("{}.{attr}", mod_name.as_str()) + format!("{}.{attr}", mod_name.as_wtf8()) } else { attr.to_owned() }; let repr = obj.repr(vm).unwrap_or_else(|_| vm.ctx.new_str("?")); vm.new_type_error(format!( "{} in {} must be str, not {}", - repr.as_str(), + repr.as_wtf8(), source, obj.class().name() )) @@ -2528,7 +2541,7 @@ impl ExecutingFrame<'_> { } else { for (k, v) in dict { let k = require_str(k, "__dict__")?; - if !k.as_str().starts_with('_') { + if !k.as_bytes().starts_with(b"_") { self.locals.mapping().ass_subscript(&k, Some(v), vm)?; } } @@ -2650,10 +2663,13 @@ impl ExecutingFrame<'_> { .expect("kwarg names should be tuple of strings"); let args = self.pop_multiple(nargs as usize); - let kwarg_names = kwarg_names - .as_slice() - .iter() - .map(|pyobj| pyobj.downcast_ref::().unwrap().as_str().to_owned()); + let kwarg_names = kwarg_names.as_slice().iter().map(|pyobj| { + pyobj + .downcast_ref::() + .unwrap() + .as_str() + .to_owned() + }); FuncArgs::with_kwargs_names(args, kwarg_names) } @@ -2668,7 +2684,7 @@ impl ExecutingFrame<'_> { Self::iterate_mapping_keys(vm, &kw_obj, &func_str, |key| { let key_str = key - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| vm.new_type_error("keywords must be strings"))?; let value = kw_obj.get_item(&*key, vm)?; kwargs.insert(key_str.as_str().to_owned(), value); @@ -2708,26 +2724,26 @@ impl ExecutingFrame<'_> { /// Returns a display string for a callable object for use in error messages. /// For objects with `__qualname__`, returns "module.qualname()" or "qualname()". /// For other objects, returns repr(obj). - fn object_function_str(obj: &PyObject, vm: &VirtualMachine) -> String { + fn object_function_str(obj: &PyObject, vm: &VirtualMachine) -> Wtf8Buf { + let repr_fallback = || { + obj.repr(vm) + .as_ref() + .map_or("?".as_ref(), |s| s.as_wtf8()) + .to_owned() + }; let Ok(qualname) = obj.get_attr(vm.ctx.intern_str("__qualname__"), vm) else { - return obj - .repr(vm) - .map(|s| s.as_str().to_owned()) - .unwrap_or_else(|_| "?".to_owned()); + return repr_fallback(); }; let Some(qualname_str) = qualname.downcast_ref::() else { - return obj - .repr(vm) - .map(|s| s.as_str().to_owned()) - .unwrap_or_else(|_| "?".to_owned()); + return repr_fallback(); }; if let Ok(module) = obj.get_attr(vm.ctx.intern_str("__module__"), vm) && let Some(module_str) = module.downcast_ref::() - && module_str.as_str() != "builtins" + && module_str.as_bytes() != b"builtins" { - return format!("{}.{}()", module_str.as_str(), qualname_str.as_str()); + return wtf8_concat!(module_str.as_wtf8(), ".", qualname_str.as_wtf8(), "()"); } - format!("{}()", qualname_str.as_str()) + wtf8_concat!(qualname_str.as_wtf8(), "()") } /// Helper function to iterate over mapping keys using the keys() method. @@ -2735,7 +2751,7 @@ impl ExecutingFrame<'_> { fn iterate_mapping_keys( vm: &VirtualMachine, mapping: &PyObject, - func_str: &str, + func_str: &Wtf8, mut key_handler: F, ) -> PyResult<()> where diff --git a/crates/vm/src/function/buffer.rs b/crates/vm/src/function/buffer.rs index e8f835e1dac..c0dd6473bdc 100644 --- a/crates/vm/src/function/buffer.rs +++ b/crates/vm/src/function/buffer.rs @@ -172,7 +172,7 @@ impl TryFromObject for ArgAsciiBuffer { fn try_from_object(vm: &VirtualMachine, obj: PyObjectRef) -> PyResult { match obj.downcast::() { Ok(string) => { - if string.as_str().is_ascii() { + if string.as_wtf8().is_ascii() { Ok(Self::String(string)) } else { Err(vm.new_value_error("string argument should contain only ASCII characters")) @@ -186,7 +186,7 @@ impl TryFromObject for ArgAsciiBuffer { impl ArgAsciiBuffer { pub fn len(&self) -> usize { match self { - Self::String(s) => s.as_str().len(), + Self::String(s) => s.as_wtf8().len(), Self::Buffer(buffer) => buffer.len(), } } diff --git a/crates/vm/src/function/fspath.rs b/crates/vm/src/function/fspath.rs index 7d3a0dcbbd5..732fd0ca35a 100644 --- a/crates/vm/src/function/fspath.rs +++ b/crates/vm/src/function/fspath.rs @@ -106,7 +106,7 @@ impl FsPath { pub fn to_path_buf(&self, vm: &VirtualMachine) -> PyResult { let path = match self { - Self::Str(s) => PathBuf::from(s.as_str()), + Self::Str(s) => PathBuf::from(vm.fsencode(s)?.as_ref() as &OsStr), Self::Bytes(b) => PathBuf::from(Self::bytes_as_os_str(b, vm)?), }; Ok(path) diff --git a/crates/vm/src/function/mod.rs b/crates/vm/src/function/mod.rs index e86adf5f27b..15048919593 100644 --- a/crates/vm/src/function/mod.rs +++ b/crates/vm/src/function/mod.rs @@ -36,9 +36,9 @@ pub enum ArgByteOrder { impl<'a> TryFromBorrowedObject<'a> for ArgByteOrder { fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult { obj.try_value_with( - |s: &PyStr| match s.as_str() { - "big" => Ok(Self::Big), - "little" => Ok(Self::Little), + |s: &PyStr| match s.as_bytes() { + b"big" => Ok(Self::Big), + b"little" => Ok(Self::Little), _ => Err(vm.new_value_error("byteorder must be either 'little' or 'big'")), }, vm, diff --git a/crates/vm/src/import.rs b/crates/vm/src/import.rs index 1957ccca663..4e89052e1a8 100644 --- a/crates/vm/src/import.rs +++ b/crates/vm/src/import.rs @@ -2,7 +2,7 @@ use crate::{ AsObject, Py, PyObjectRef, PyPayload, PyRef, PyResult, - builtins::{PyCode, PyStr, PyStrRef, traceback::PyTraceback}, + builtins::{PyCode, PyStr, PyUtf8Str, PyUtf8StrRef, traceback::PyTraceback}, exceptions::types::PyBaseException, scope::Scope, vm::{VirtualMachine, resolve_frozen_alias, thread}, @@ -69,7 +69,7 @@ pub fn make_frozen(vm: &VirtualMachine, name: &str) -> PyResult> { let frozen = vm.state.frozen.get(name).ok_or_else(|| { vm.new_import_error( format!("No such frozen object named {name}"), - vm.ctx.new_str(name), + vm.ctx.new_utf8_str(name), ) })?; Ok(vm.ctx.new_code(frozen.code)) @@ -79,13 +79,13 @@ pub fn import_frozen(vm: &VirtualMachine, module_name: &str) -> PyResult { let frozen = vm.state.frozen.get(module_name).ok_or_else(|| { vm.new_import_error( format!("No such frozen object named {module_name}"), - vm.ctx.new_str(module_name), + vm.ctx.new_utf8_str(module_name), ) })?; let module = import_code_obj(vm, module_name, vm.ctx.new_code(frozen.code), false)?; debug_assert!(module.get_attr(identifier!(vm, __name__), vm).is_ok()); let origname = resolve_frozen_alias(module_name); - module.set_attr("__origname__", vm.ctx.new_str(origname), vm)?; + module.set_attr("__origname__", vm.ctx.new_utf8_str(origname), vm)?; Ok(module) } @@ -118,7 +118,7 @@ pub fn import_builtin(vm: &VirtualMachine, module_name: &str) -> PyResult { // Module not found in module_defs Err(vm.new_import_error( format!("Cannot import builtin module {module_name}"), - vm.ctx.new_str(module_name), + vm.ctx.new_utf8_str(module_name), )) } @@ -169,7 +169,7 @@ fn import_ensure_initialized( }; if initializing { let lock_unlock = vm.importlib.get_attr("_lock_unlock_module", vm)?; - lock_unlock.call((vm.ctx.new_str(name),), vm)?; + lock_unlock.call((vm.ctx.new_utf8_str(name),), vm)?; } Ok(()) } @@ -183,7 +183,7 @@ pub fn import_code_obj( let attrs = vm.ctx.new_dict(); attrs.set_item( identifier!(vm, __name__), - vm.ctx.new_str(module_name).into(), + vm.ctx.new_utf8_str(module_name).into(), vm, )?; if set_file_attr { @@ -312,7 +312,7 @@ pub(crate) fn is_possibly_shadowing_path(origin: &str, vm: &VirtualMachine) -> b let sys_path_0 = (|| -> Option { let argv = vm.sys_module.get_attr("argv", vm).ok()?; let argv0 = argv.get_item(&0usize, vm).ok()?; - let argv0_str = argv0.downcast_ref::()?; + let argv0_str = argv0.downcast_ref::()?; let s = argv0_str.as_str(); // For -c and REPL, original sys.path[0] is "" @@ -412,7 +412,7 @@ pub(crate) fn import_module_level( if package.is_empty() { return Err(vm.new_import_error( "attempted relative import with no known parent package".to_owned(), - vm.ctx.new_str(""), + vm.ctx.new_utf8_str(""), )); } resolve_name(name_str, &package, level as usize, vm)? @@ -432,7 +432,7 @@ pub(crate) fn import_module_level( } _ => { let find_and_load = vm.importlib.get_attr("_find_and_load", vm)?; - let abs_name_obj = vm.ctx.new_str(&*abs_name); + let abs_name_obj = vm.ctx.new_utf8_str(&*abs_name); find_and_load.call((abs_name_obj, vm.import_func.clone()), vm)? } }; @@ -474,14 +474,14 @@ pub(crate) fn import_module_level( // For absolute imports (level 0), try importing the // parent. Matches _bootstrap.__import__ behavior. let find_and_load = vm.importlib.get_attr("_find_and_load", vm)?; - let to_return_obj = vm.ctx.new_str(&*to_return); + let to_return_obj = vm.ctx.new_utf8_str(&*to_return); find_and_load.call((to_return_obj, vm.import_func.clone()), vm) } Err(_) => { // For relative imports (level > 0), raise KeyError let to_return_obj: PyObjectRef = vm .ctx - .new_str(format!("'{to_return}' not in sys.modules as expected")) + .new_utf8_str(format!("'{to_return}' not in sys.modules as expected")) .into(); Err(vm.new_key_error(to_return_obj)) } @@ -501,7 +501,7 @@ fn resolve_name(name: &str, package: &str, level: usize, vm: &VirtualMachine) -> if parts.len() < level { return Err(vm.new_import_error( "attempted relative import beyond top-level package".to_owned(), - vm.ctx.new_str(name), + vm.ctx.new_utf8_str(name), )); } // rsplitn returns parts right-to-left, so last() is the leftmost (base) @@ -518,7 +518,7 @@ fn calc_package(globals: Option<&PyObjectRef>, vm: &VirtualMachine) -> PyResult< let globals = globals.ok_or_else(|| { vm.new_import_error( "attempted relative import with no known parent package".to_owned(), - vm.ctx.new_str(""), + vm.ctx.new_utf8_str(""), ) })?; @@ -528,7 +528,7 @@ fn calc_package(globals: Option<&PyObjectRef>, vm: &VirtualMachine) -> PyResult< if let Some(ref pkg) = package && !vm.is_none(pkg) { - let pkg_str: PyStrRef = pkg + let pkg_str: PyUtf8StrRef = pkg .clone() .downcast() .map_err(|_| vm.new_type_error("package must be a string".to_owned()))?; @@ -543,7 +543,7 @@ fn calc_package(globals: Option<&PyObjectRef>, vm: &VirtualMachine) -> PyResult< .unwrap_or(false) { let parent_repr = parent - .repr(vm) + .repr_utf8(vm) .map(|s| s.as_str().to_owned()) .unwrap_or_default(); let msg = format!( @@ -570,7 +570,7 @@ fn calc_package(globals: Option<&PyObjectRef>, vm: &VirtualMachine) -> PyResult< && let Ok(parent) = spec.get_attr("parent", vm) && !vm.is_none(&parent) { - let parent_str: PyStrRef = parent + let parent_str: PyUtf8StrRef = parent .downcast() .map_err(|_| vm.new_type_error("package set to non-string".to_owned()))?; return Ok(parent_str.as_str().to_owned()); @@ -593,10 +593,10 @@ fn calc_package(globals: Option<&PyObjectRef>, vm: &VirtualMachine) -> PyResult< let mod_name = globals.get_item("__name__", vm).map_err(|_| { vm.new_import_error( "attempted relative import with no known parent package".to_owned(), - vm.ctx.new_str(""), + vm.ctx.new_utf8_str(""), ) })?; - let mod_name_str: PyStrRef = mod_name + let mod_name_str: PyUtf8StrRef = mod_name .downcast() .map_err(|_| vm.new_type_error("__name__ must be a string".to_owned()))?; let mut package = mod_name_str.as_str().to_owned(); diff --git a/crates/vm/src/intern.rs b/crates/vm/src/intern.rs index 8ad39c206fd..afc30e7ac04 100644 --- a/crates/vm/src/intern.rs +++ b/crates/vm/src/intern.rs @@ -152,10 +152,25 @@ impl CachedPyStrRef { } } +#[repr(transparent)] pub struct PyInterned { inner: Py, } +impl PyInterned { + /// Returns `&str` for interned strings. + /// + /// # Panics + /// Panics if the interned string contains unpaired surrogates (WTF-8 content). + /// Most interned strings are valid UTF-8, so this is an ergonomic default. + #[inline] + pub fn as_str(&self) -> &str { + self.inner + .to_str() + .unwrap_or_else(|| panic!("interned str is always valid UTF-8")) + } +} + impl PyInterned { #[inline] pub fn leak(cache: PyRef) -> &'static Self { diff --git a/crates/vm/src/macros.rs b/crates/vm/src/macros.rs index 32f3e4566ea..4fad50ac8f2 100644 --- a/crates/vm/src/macros.rs +++ b/crates/vm/src/macros.rs @@ -188,11 +188,11 @@ macro_rules! identifier( #[macro_export] macro_rules! identifier_utf8( - ($as_ctx:expr, $name:ident) => { + ($as_ctx:expr, $name:ident) => {{ // Safety: All known identifiers are ascii strings. - #[allow(clippy::macro_metavars_in_unsafe, reason = "known identifiers are ASCII and downcast target is fixed")] - unsafe { $as_ctx.as_ref().names.$name.as_object().downcast_unchecked_ref::<$crate::builtins::PyUtf8Str>() } - }; + let interned = $as_ctx.as_ref().names.$name; + unsafe { $crate::builtins::PyUtf8StrInterned::from_str_interned_unchecked(interned) } + }}; ); /// Super detailed logging. Might soon overflow your log buffers diff --git a/crates/vm/src/protocol/object.rs b/crates/vm/src/protocol/object.rs index d2e4b31aaea..78ac9903905 100644 --- a/crates/vm/src/protocol/object.rs +++ b/crates/vm/src/protocol/object.rs @@ -362,8 +362,8 @@ impl PyObject { } pub fn ascii(&self, vm: &VirtualMachine) -> PyResult { - let repr = self.repr_utf8(vm)?; - let ascii = to_ascii(repr.as_str()); + let repr = self.repr(vm)?; + let ascii = to_ascii(repr.as_wtf8()); Ok(ascii) } @@ -658,7 +658,7 @@ impl PyObject { Err(vm.new_exception_msg( vm.ctx.exceptions.type_error.to_owned(), - format!("unhashable type: '{}'", self.class().name()), + format!("unhashable type: '{}'", self.class().name()).into(), )) } diff --git a/crates/vm/src/stdlib/_abc.rs b/crates/vm/src/stdlib/_abc.rs index 72642c249f5..5657cda9865 100644 --- a/crates/vm/src/stdlib/_abc.rs +++ b/crates/vm/src/stdlib/_abc.rs @@ -365,7 +365,7 @@ mod _abc { if !ok.is(&vm.ctx.not_implemented) { return Err(vm.new_exception_msg( vm.ctx.exceptions.assertion_error.to_owned(), - "__subclasshook__ must return either False, True, or NotImplemented".to_owned(), + "__subclasshook__ must return either False, True, or NotImplemented".into(), )); } diff --git a/crates/vm/src/stdlib/_winapi.rs b/crates/vm/src/stdlib/_winapi.rs index 6a78b36f869..1e52af5aaa4 100644 --- a/crates/vm/src/stdlib/_winapi.rs +++ b/crates/vm/src/stdlib/_winapi.rs @@ -15,6 +15,7 @@ mod _winapi { windows::{WinHandle, WindowsSysResult}, }; use core::ptr::{null, null_mut}; + use rustpython_common::wtf8::Wtf8Buf; use windows_sys::Win32::Foundation::{HANDLE, MAX_PATH}; #[pyattr] @@ -296,19 +297,19 @@ mod _winapi { .map_or_else(null_mut, |l| l.attrlist.as_mut_ptr() as _); let wstr = |s: PyStrRef| { - let ws = widestring::WideCString::from_str(s.as_str()) + let ws = widestring::WideCString::from_str(s.expect_str()) .map_err(|err| err.to_pyexception(vm))?; Ok(ws.into_vec_with_nul()) }; // Validate no embedded null bytes in command name and command line if let Some(ref name) = args.name - && name.as_str().contains('\0') + && name.as_bytes().contains(&0) { return Err(crate::exceptions::cstring_error(vm)); } if let Some(ref cmd) = args.command_line - && cmd.as_str().contains('\0') + && cmd.as_bytes().contains(&0) { return Err(crate::exceptions::cstring_error(vm)); } @@ -396,8 +397,8 @@ mod _winapi { dest_path: PyStrRef, vm: &VirtualMachine, ) -> PyResult<()> { - let src_path = std::path::Path::new(src_path.as_str()); - let dest_path = std::path::Path::new(dest_path.as_str()); + let src_path = std::path::Path::new(src_path.expect_str()); + let dest_path = std::path::Path::new(dest_path.expect_str()); junction::create(src_path, dest_path).map_err(|e| e.to_pyexception(vm)) } @@ -418,9 +419,9 @@ mod _winapi { let mut last_entry: HashMap = HashMap::new(); for (k, v) in keys.into_iter().zip(values.into_iter()) { let k = PyStrRef::try_from_object(vm, k)?; - let k = k.as_str(); + let k = k.expect_str(); let v = PyStrRef::try_from_object(vm, v)?; - let v = v.as_str(); + let v = v.expect_str(); if k.contains('\0') || v.contains('\0') { return Err(crate::exceptions::cstring_error(vm)); } @@ -677,7 +678,6 @@ mod _winapi { src: PyStrRef, vm: &VirtualMachine, ) -> PyResult { - use rustpython_common::wtf8::Wtf8Buf; use windows_sys::Win32::Globalization::{ LCMAP_BYTEREV, LCMAP_HASH, LCMAP_SORTHANDLE, LCMAP_SORTKEY, LCMapStringEx as WinLCMapStringEx, @@ -1032,8 +1032,6 @@ mod _winapi { api_fn: unsafe extern "system" fn(*const u16, *mut u16, u32) -> u32, vm: &VirtualMachine, ) -> PyResult { - use rustpython_common::wtf8::Wtf8Buf; - let path_wide = path.as_wtf8().to_wide_with_nul(); // First call to get required buffer size @@ -1808,7 +1806,7 @@ mod _winapi { use windows_sys::Win32::System::Memory::CreateFileMappingW; if let Some(ref n) = name - && n.as_str().contains('\0') + && n.as_bytes().contains(&0) { return Err(vm.new_value_error( "CreateFileMapping: name must not contain null characters".to_owned(), @@ -1844,7 +1842,7 @@ mod _winapi { ) -> PyResult { use windows_sys::Win32::System::Memory::OpenFileMappingW; - if name.as_str().contains('\0') { + if name.as_bytes().contains(&0) { return Err(vm.new_value_error( "OpenFileMapping: name must not contain null characters".to_owned(), )); diff --git a/crates/vm/src/stdlib/_wmi.rs b/crates/vm/src/stdlib/_wmi.rs index f25f4b23bfe..f2b088e96a3 100644 --- a/crates/vm/src/stdlib/_wmi.rs +++ b/crates/vm/src/stdlib/_wmi.rs @@ -561,7 +561,7 @@ mod _wmi { /// by null characters. #[pyfunction] fn exec_query(query: PyStrRef, vm: &VirtualMachine) -> PyResult { - let query_str = query.as_str(); + let query_str = query.expect_str(); if !query_str .get(..7) diff --git a/crates/vm/src/stdlib/ast.rs b/crates/vm/src/stdlib/ast.rs index 92366b6e8e3..a04864634be 100644 --- a/crates/vm/src/stdlib/ast.rs +++ b/crates/vm/src/stdlib/ast.rs @@ -15,7 +15,7 @@ use crate::{ AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult, TryFromObject, VirtualMachine, builtins::PyIntRef, - builtins::{PyDict, PyModule, PyStrRef, PyType}, + builtins::{PyDict, PyModule, PyType, PyUtf8StrRef}, class::{PyClassImpl, StaticType}, compiler::{CompileError, ParseError}, convert::ToPyObject, diff --git a/crates/vm/src/stdlib/ast/basic.rs b/crates/vm/src/stdlib/ast/basic.rs index ca518eaa520..28e4a6803ee 100644 --- a/crates/vm/src/stdlib/ast/basic.rs +++ b/crates/vm/src/stdlib/ast/basic.rs @@ -13,7 +13,7 @@ impl Node for ast::Identifier { _source_file: &SourceFile, object: PyObjectRef, ) -> PyResult { - let py_str = PyStrRef::try_from_object(vm, object)?; + let py_str = PyUtf8StrRef::try_from_object(vm, object)?; Ok(Self::new(py_str.as_str(), TextRange::default())) } } diff --git a/crates/vm/src/stdlib/ast/python.rs b/crates/vm/src/stdlib/ast/python.rs index 0de6f45b912..acc36202cd1 100644 --- a/crates/vm/src/stdlib/ast/python.rs +++ b/crates/vm/src/stdlib/ast/python.rs @@ -8,8 +8,9 @@ use super::{ pub(crate) mod _ast { use crate::{ AsObject, Context, Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, - builtins::{PyStr, PyStrRef, PyTupleRef, PyType, PyTypeRef}, + builtins::{PyStr, PyStrRef, PyTupleRef, PyType, PyTypeRef, PyUtf8Str, PyUtf8StrRef}, class::{PyClassImpl, StaticType}, + common::wtf8::Wtf8, function::{FuncArgs, KwArgs, PyMethodDef, PyMethodFlags}, stdlib::ast::repr, types::{Constructor, Initializer}, @@ -99,7 +100,7 @@ pub(crate) mod _ast { let fields: Vec = fields.try_to_value(vm)?; let mut positional: Vec = Vec::new(); for field in fields { - if dict.get_item_opt::(field.as_str(), vm)?.is_some() { + if dict.get_item_opt::(field.as_wtf8(), vm)?.is_some() { positional.push(vm.ctx.none()); } else { break; @@ -127,13 +128,13 @@ pub(crate) mod _ast { let mut expecting: std::collections::HashSet = std::collections::HashSet::new(); if let Some(fields) = fields.clone() { - let fields: Vec = fields.try_to_value(vm)?; + let fields: Vec = fields.try_to_value(vm)?; for field in fields { expecting.insert(field.as_str().to_owned()); } } if let Some(attributes) = attributes.clone() { - let attributes: Vec = attributes.try_to_value(vm)?; + let attributes: Vec = attributes.try_to_value(vm)?; for attr in attributes { expecting.insert(attr.as_str().to_owned()); } @@ -151,12 +152,12 @@ pub(crate) mod _ast { if let Some(dict) = dict.as_ref() { for (key, _value) in dict.items_vec() { - if let Ok(key) = key.downcast::() { + if let Ok(key) = key.downcast::() { expecting.remove(key.as_str()); } } if let Some(attributes) = attributes.clone() { - let attributes: Vec = attributes.try_to_value(vm)?; + let attributes: Vec = attributes.try_to_value(vm)?; for attr in attributes { expecting.remove(attr.as_str()); } @@ -168,7 +169,7 @@ pub(crate) mod _ast { && let Ok(field_types) = field_types.downcast::() { for (key, value) in field_types.items_vec() { - let Ok(key) = key.downcast::() else { + let Ok(key) = key.downcast::() else { continue; }; if value.fast_isinstance(vm.ctx.types.union_type) { @@ -199,7 +200,7 @@ pub(crate) mod _ast { if let Some(fields) = fields.clone() { let fields: Vec = fields.try_to_value(vm)?; for field in fields { - if let Some(value) = dict.get_item_opt::(field.as_str(), vm)? { + if let Some(value) = dict.get_item_opt::(field.as_wtf8(), vm)? { payload.set_item(field.as_object(), value, vm)?; } } @@ -207,7 +208,7 @@ pub(crate) mod _ast { if let Some(attributes) = attributes.clone() { let attributes: Vec = attributes.try_to_value(vm)?; for attr in attributes { - if let Some(value) = dict.get_item_opt::(attr.as_str(), vm)? { + if let Some(value) = dict.get_item_opt::(attr.as_wtf8(), vm)? { payload.set_item(attr.as_object(), value, vm)?; } } @@ -223,7 +224,7 @@ pub(crate) mod _ast { .into_iter() .map(|(key, value)| { let key = key - .downcast::() + .downcast::() .map_err(|_| vm.new_type_error("keywords must be strings".to_owned()))?; Ok((key.as_str().to_owned(), value)) }) @@ -292,7 +293,7 @@ pub(crate) mod _ast { zelf.class().name() )) })?; - let fields: Vec = fields.try_to_value(vm)?; + let fields: Vec = fields.try_to_value(vm)?; let n_args = args.args.len(); if n_args > fields.len() { return Err(vm.new_type_error(format!( @@ -309,10 +310,10 @@ pub(crate) mod _ast { for (name, arg) in fields.iter().zip(args.args) { zelf.set_attr(name, arg, vm)?; - set_fields.insert(name.as_str().to_string()); + set_fields.insert(name.as_str().to_owned()); } for (key, value) in args.kwargs { - if let Some(pos) = fields.iter().position(|f| f.as_str() == key) + if let Some(pos) = fields.iter().position(|f| f.as_bytes() == key.as_bytes()) && pos < n_args { return Err(vm.new_type_error(format!( @@ -322,7 +323,10 @@ pub(crate) mod _ast { ))); } - if fields.iter().all(|field| field.as_str() != key) { + if fields + .iter() + .all(|field| field.as_bytes() != key.as_bytes()) + { let attrs = if let Some(attrs) = &attributes { attrs } else { @@ -334,7 +338,7 @@ pub(crate) mod _ast { attributes = Some(attrs); attributes.as_ref().unwrap() }; - if attrs.iter().all(|attr| attr.as_str() != key) { + if attrs.iter().all(|attr| attr.as_bytes() != key.as_bytes()) { let message = vm.ctx.new_str(format!( "{}.__init__ got an unexpected keyword argument '{}'. \ Support for arbitrary keyword arguments is deprecated and will be removed in Python 3.15.", @@ -368,13 +372,13 @@ Support for arbitrary keyword arguments is deprecated and will be removed in Pyt if set_fields.contains(field.as_str()) { continue; } - if let Some(ftype) = ft_dict.get_item_opt::(field.as_str(), vm)? { + if let Some(ftype) = ft_dict.get_item_opt::(field.as_wtf8(), vm)? { if ftype.fast_isinstance(vm.ctx.types.union_type) { // Optional field (T | None) — no default } else if ftype.fast_isinstance(vm.ctx.types.generic_alias_type) { // List field (list[T]) — default to [] let empty_list: PyObjectRef = vm.ctx.new_list(vec![]).into(); - zelf.set_attr(vm.ctx.intern_str(field.as_str()), empty_list, vm)?; + zelf.set_attr(vm.ctx.intern_str(field.as_wtf8()), empty_list, vm)?; } else if ftype.is(&expr_ctx_type) { // expr_context — default to Load() let load_type = @@ -384,13 +388,13 @@ Support for arbitrary keyword arguments is deprecated and will be removed in Pyt .unwrap_or_else(|| { vm.ctx.new_base_object(load_type, Some(vm.ctx.new_dict())) }); - zelf.set_attr(vm.ctx.intern_str(field.as_str()), load_instance, vm)?; + zelf.set_attr(vm.ctx.intern_str(field.as_wtf8()), load_instance, vm)?; } else { - // Required field missing: emit DeprecationWarning (CPython behavior). + // Required field missing: emit DeprecationWarning. let message = vm.ctx.new_str(format!( "{}.__init__ missing 1 required positional argument: '{}'", zelf.class().name(), - field.as_str() + field.as_wtf8() )); warn::warn( message.into(), diff --git a/crates/vm/src/stdlib/ast/repr.rs b/crates/vm/src/stdlib/ast/repr.rs index 0810814cd06..0b7f903f807 100644 --- a/crates/vm/src/stdlib/ast/repr.rs +++ b/crates/vm/src/stdlib/ast/repr.rs @@ -4,14 +4,15 @@ use crate::{ class::PyClassImpl, stdlib::ast::NodeAst, }; +use rustpython_common::wtf8::Wtf8Buf; -fn repr_ast_list(vm: &VirtualMachine, items: Vec, depth: usize) -> PyResult { +fn repr_ast_list(vm: &VirtualMachine, items: Vec, depth: usize) -> PyResult { if items.is_empty() { let empty_list: PyObjectRef = vm.ctx.new_list(vec![]).into(); - return Ok(empty_list.repr(vm)?.to_string()); + return Ok(empty_list.repr(vm)?.as_wtf8().to_owned()); } - let mut parts: Vec = Vec::new(); + let mut parts: Vec = Vec::new(); let first = &items[0]; let last = items.last().unwrap(); @@ -22,38 +23,38 @@ fn repr_ast_list(vm: &VirtualMachine, items: Vec, depth: usize) -> let repr = if item.fast_isinstance(&NodeAst::make_class(&vm.ctx)) { repr_ast_node(vm, item, depth.saturating_sub(1))? } else { - item.repr(vm)?.to_string() + item.repr(vm)?.as_wtf8().to_owned() }; parts.push(repr); } - let mut rendered = String::from("["); + let mut rendered = Wtf8Buf::from("["); if !parts.is_empty() { - rendered.push_str(&parts[0]); + rendered.push_wtf8(&parts[0]); } if items.len() > 2 { if !parts[0].is_empty() { - rendered.push_str(", ..."); + rendered.push_wtf8(", ...".as_ref()); } if parts.len() > 1 { - rendered.push_str(", "); - rendered.push_str(&parts[1]); + rendered.push_wtf8(", ".as_ref()); + rendered.push_wtf8(&parts[1]); } } else if parts.len() > 1 { - rendered.push_str(", "); - rendered.push_str(&parts[1]); + rendered.push_wtf8(", ".as_ref()); + rendered.push_wtf8(&parts[1]); } - rendered.push(']'); + rendered.push_wtf8("]".as_ref()); Ok(rendered) } -fn repr_ast_tuple(vm: &VirtualMachine, items: Vec, depth: usize) -> PyResult { +fn repr_ast_tuple(vm: &VirtualMachine, items: Vec, depth: usize) -> PyResult { if items.is_empty() { let empty_tuple: PyObjectRef = vm.ctx.empty_tuple.clone().into(); - return Ok(empty_tuple.repr(vm)?.to_string()); + return Ok(empty_tuple.repr(vm)?.as_wtf8().to_owned()); } - let mut parts: Vec = Vec::new(); + let mut parts: Vec = Vec::new(); let first = &items[0]; let last = items.last().unwrap(); @@ -64,31 +65,31 @@ fn repr_ast_tuple(vm: &VirtualMachine, items: Vec, depth: usize) -> let repr = if item.fast_isinstance(&NodeAst::make_class(&vm.ctx)) { repr_ast_node(vm, item, depth.saturating_sub(1))? } else { - item.repr(vm)?.to_string() + item.repr(vm)?.as_wtf8().to_owned() }; parts.push(repr); } - let mut rendered = String::from("("); + let mut rendered = Wtf8Buf::from("("); if !parts.is_empty() { - rendered.push_str(&parts[0]); + rendered.push_wtf8(&parts[0]); } if items.len() > 2 { if !parts[0].is_empty() { - rendered.push_str(", ..."); + rendered.push_wtf8(", ...".as_ref()); } if parts.len() > 1 { - rendered.push_str(", "); - rendered.push_str(&parts[1]); + rendered.push_wtf8(", ".as_ref()); + rendered.push_wtf8(&parts[1]); } } else if parts.len() > 1 { - rendered.push_str(", "); - rendered.push_str(&parts[1]); + rendered.push_wtf8(", ".as_ref()); + rendered.push_wtf8(&parts[1]); } if items.len() == 1 { - rendered.push(','); + rendered.push_wtf8(",".as_ref()); } - rendered.push(')'); + rendered.push_wtf8(")".as_ref()); Ok(rendered) } @@ -96,25 +97,32 @@ pub(crate) fn repr_ast_node( vm: &VirtualMachine, obj: &PyObjectRef, depth: usize, -) -> PyResult { +) -> PyResult { let cls = obj.class(); if depth == 0 { - return Ok(format!("{}(...)", cls.name())); + let mut s = Wtf8Buf::from(&*cls.name()); + s.push_wtf8("(...)".as_ref()); + return Ok(s); } let fields = cls.get_attr(vm.ctx.intern_str("_fields")); let fields = match fields { Some(fields) => fields.try_to_value::>(vm)?, - None => return Ok(format!("{}(...)", cls.name())), + None => { + let mut s = Wtf8Buf::from(&*cls.name()); + s.push_wtf8("(...)".as_ref()); + return Ok(s); + } }; if fields.is_empty() { - return Ok(format!("{}()", cls.name())); + let mut s = Wtf8Buf::from(&*cls.name()); + s.push_wtf8("()".as_ref()); + return Ok(s); } - let mut rendered = String::new(); - rendered.push_str(&cls.name()); - rendered.push('('); + let mut rendered = Wtf8Buf::from(&*cls.name()); + rendered.push_wtf8("(".as_ref()); for (idx, field) in fields.iter().enumerate() { let value = obj.get_attr(field, vm)?; @@ -131,17 +139,17 @@ pub(crate) fn repr_ast_node( } else if value.fast_isinstance(&NodeAst::make_class(&vm.ctx)) { repr_ast_node(vm, &value, depth.saturating_sub(1))? } else { - value.repr(vm)?.to_string() + value.repr(vm)?.as_wtf8().to_owned() }; if idx > 0 { - rendered.push_str(", "); + rendered.push_wtf8(", ".as_ref()); } - rendered.push_str(field.as_str()); - rendered.push('='); - rendered.push_str(&value_repr); + rendered.push_wtf8(field.as_wtf8()); + rendered.push_wtf8("=".as_ref()); + rendered.push_wtf8(&value_repr); } - rendered.push(')'); + rendered.push_wtf8(")".as_ref()); Ok(rendered) } diff --git a/crates/vm/src/stdlib/builtins.rs b/crates/vm/src/stdlib/builtins.rs index 1b54a26e732..c09e025a44a 100644 --- a/crates/vm/src/stdlib/builtins.rs +++ b/crates/vm/src/stdlib/builtins.rs @@ -11,6 +11,7 @@ mod builtins { AsObject, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, VirtualMachine, builtins::{ PyByteArray, PyBytes, PyDictRef, PyStr, PyStrRef, PyTuple, PyTupleRef, PyType, + PyUtf8StrRef, enumerate::PyReverseSequenceIterator, function::{PyCellRef, PyFunction}, int::PyIntRef, @@ -30,7 +31,7 @@ mod builtins { types::PyComparisonOp, }; use itertools::Itertools; - use num_traits::{Signed, ToPrimitive}; + use num_traits::{Signed, ToPrimitive, Zero}; use rustpython_common::wtf8::CodePoint; #[cfg(not(feature = "rustpython-compiler"))] @@ -65,7 +66,7 @@ mod builtins { #[pyfunction] pub fn ascii(obj: PyObjectRef, vm: &VirtualMachine) -> PyResult { let repr = obj.repr(vm)?; - let ascii = to_ascii(repr.as_str()); + let ascii = to_ascii(repr.as_wtf8()); Ok(ascii) } @@ -99,7 +100,7 @@ mod builtins { struct CompileArgs { source: PyObjectRef, filename: FsPath, - mode: PyStrRef, + mode: PyUtf8StrRef, #[pyarg(any, optional)] flags: OptionalArg, #[pyarg(any, optional)] @@ -139,7 +140,6 @@ mod builtins { .source .fast_isinstance(&ast::NodeAst::make_class(&vm.ctx)) { - use num_traits::Zero; let flags: i32 = args.flags.map_or(Ok(0), |v| v.try_to_primitive(vm))?; let is_ast_only = !(flags & ast::PY_CF_ONLY_AST).is_zero(); @@ -197,7 +197,7 @@ mod builtins { #[cfg(feature = "parser")] { use crate::convert::ToPyException; - use num_traits::Zero; + use ruff_python_parser as parser; let source = ArgStrOrBytesLike::try_from_object(vm, args.source)?; @@ -413,7 +413,7 @@ mod builtins { if source.contains(&0) { return Err(vm.new_exception_msg( vm.ctx.exceptions.syntax_error.to_owned(), - "source code string cannot contain null bytes".to_owned(), + "source code string cannot contain null bytes".into(), )); } @@ -424,9 +424,9 @@ mod builtins { err.valid_up_to() ); - vm.new_exception_msg(vm.ctx.exceptions.syntax_error.to_owned(), msg) + vm.new_exception_msg(vm.ctx.exceptions.syntax_error.to_owned(), msg.into()) })?; - Ok(Either::A(vm.ctx.new_str(source.trim_start()))) + Ok(Either::A(vm.ctx.new_utf8_str(source.trim_start()))) } Either::B(code) => Ok(Either::B(code)), }?; @@ -435,7 +435,7 @@ mod builtins { #[pyfunction] fn exec( - source: Either>, + source: Either>, scope: ScopeArgs, vm: &VirtualMachine, ) -> PyResult { @@ -445,7 +445,7 @@ mod builtins { fn run_code( vm: &VirtualMachine, - source: Either>, + source: Either>, scope: crate::scope::Scope, #[allow(unused_variables)] mode: crate::compiler::Mode, func: &str, @@ -453,9 +453,11 @@ mod builtins { // Determine code object: let code_obj = match source { #[cfg(feature = "rustpython-compiler")] - Either::A(string) => vm - .compile(string.as_str(), mode, "".to_owned()) - .map_err(|err| vm.new_syntax_error(&err, Some(string.as_str())))?, + Either::A(string) => { + let source = string.as_str(); + vm.compile(source, mode, "".to_owned()) + .map_err(|err| vm.new_syntax_error(&err, Some(source)))? + } #[cfg(not(feature = "rustpython-compiler"))] Either::A(_) => return Err(vm.new_type_error(CODEGEN_NOT_SUPPORTED.to_owned())), Either::B(code_obj) => code_obj, @@ -567,8 +569,15 @@ mod builtins { && std::io::stdin().is_terminal() && !is_pty_child(); + // Disable rustyline if prompt contains surrogates (not valid UTF-8 for terminal) + let prompt_str = match &prompt { + OptionalArg::Present(s) => s.to_str(), + OptionalArg::Missing => Some(""), + }; + let use_rustyline = use_rustyline && prompt_str.is_some(); + if use_rustyline { - let prompt = prompt.as_ref().map_or("", |s| s.as_str()); + let prompt = prompt_str.unwrap(); let mut readline = Readline::new(()); match readline.readline(prompt) { ReadlineResult::Line(s) => Ok(vm.ctx.new_str(s).into()), diff --git a/crates/vm/src/stdlib/codecs.rs b/crates/vm/src/stdlib/codecs.rs index 85869e066cb..6c37ee4c9f9 100644 --- a/crates/vm/src/stdlib/codecs.rs +++ b/crates/vm/src/stdlib/codecs.rs @@ -42,40 +42,59 @@ mod _codecs { struct CodeArgs { obj: PyObjectRef, #[pyarg(any, optional)] - encoding: Option, + encoding: Option, #[pyarg(any, optional)] - errors: Option, + errors: Option, + } + + impl CodeArgs { + fn apply( + self, + vm: &VirtualMachine, + f: fn( + &codecs::CodecsRegistry, + PyObjectRef, + &str, + Option, + &VirtualMachine, + ) -> PyResult, + ) -> PyResult { + let encoding = self + .encoding + .as_deref() + .map(|s| s.as_str()) + .unwrap_or(codecs::DEFAULT_ENCODING); + f( + &vm.state.codec_registry, + self.obj, + encoding, + self.errors, + vm, + ) + } } #[pyfunction] fn encode(args: CodeArgs, vm: &VirtualMachine) -> PyResult { - let encoding = args - .encoding - .as_ref() - .map_or(codecs::DEFAULT_ENCODING, |s| s.as_str()); - vm.state - .codec_registry - .encode(args.obj, encoding, args.errors, vm) + args.apply(vm, codecs::CodecsRegistry::encode) } #[pyfunction] fn decode(args: CodeArgs, vm: &VirtualMachine) -> PyResult { - let encoding = args - .encoding - .as_ref() - .map_or(codecs::DEFAULT_ENCODING, |s| s.as_str()); - vm.state - .codec_registry - .decode(args.obj, encoding, args.errors, vm) + args.apply(vm, codecs::CodecsRegistry::decode) } #[pyfunction] - fn _forget_codec(encoding: PyStrRef, vm: &VirtualMachine) { + fn _forget_codec(encoding: PyUtf8StrRef, vm: &VirtualMachine) { vm.state.codec_registry.forget(encoding.as_str()); } #[pyfunction] - fn register_error(name: PyStrRef, handler: PyObjectRef, vm: &VirtualMachine) -> PyResult<()> { + fn register_error( + name: PyUtf8StrRef, + handler: PyObjectRef, + vm: &VirtualMachine, + ) -> PyResult<()> { if !handler.is_callable() { return Err(vm.new_type_error("handler must be callable".to_owned())); } @@ -86,28 +105,18 @@ mod _codecs { } #[pyfunction] - fn lookup_error(name: PyStrRef, vm: &VirtualMachine) -> PyResult { - if name.as_wtf8().as_bytes().contains(&0) { + fn lookup_error(name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { + if name.as_str().contains('\0') { return Err(cstring_error(vm)); } - if !name.as_wtf8().is_utf8() { - return Err(vm.new_unicode_encode_error( - "'utf-8' codec can't encode character: surrogates not allowed".to_owned(), - )); - } vm.state.codec_registry.lookup_error(name.as_str(), vm) } #[pyfunction] - fn _unregister_error(errors: PyStrRef, vm: &VirtualMachine) -> PyResult { - if errors.as_wtf8().as_bytes().contains(&0) { + fn _unregister_error(errors: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { + if errors.as_str().contains('\0') { return Err(cstring_error(vm)); } - if !errors.as_wtf8().is_utf8() { - return Err(vm.new_unicode_encode_error( - "'utf-8' codec can't encode character: surrogates not allowed".to_owned(), - )); - } vm.state .codec_registry .unregister_error(errors.as_str(), vm) @@ -120,7 +129,7 @@ mod _codecs { #[pyarg(positional)] s: PyStrRef, #[pyarg(positional, optional)] - errors: Option, + errors: Option, } impl EncodeArgs { @@ -143,7 +152,7 @@ mod _codecs { #[pyarg(positional)] data: ArgBytesLike, #[pyarg(positional, optional)] - errors: Option, + errors: Option, #[pyarg(positional, default = false)] final_decode: bool, } @@ -165,7 +174,7 @@ mod _codecs { #[pyarg(positional)] data: ArgBytesLike, #[pyarg(positional, optional)] - errors: Option, + errors: Option, } impl DecodeArgsNoFinal { @@ -366,14 +375,14 @@ fn delegate_pycodecs( #[pymodule(sub)] mod _codecs_windows { use crate::{PyResult, VirtualMachine}; - use crate::{builtins::PyStrRef, function::ArgBytesLike}; + use crate::{builtins::PyStrRef, builtins::PyUtf8StrRef, function::ArgBytesLike}; #[derive(FromArgs)] struct MbcsEncodeArgs { #[pyarg(positional)] s: PyStrRef, #[pyarg(positional, optional)] - errors: Option, + errors: Option, } #[pyfunction] @@ -461,7 +470,7 @@ mod _codecs_windows { #[pyarg(positional)] data: ArgBytesLike, #[pyarg(positional, optional)] - errors: Option, + errors: Option, #[pyarg(positional, default = false)] #[allow(dead_code)] r#final: bool, @@ -559,7 +568,7 @@ mod _codecs_windows { #[pyarg(positional)] s: PyStrRef, #[pyarg(positional, optional)] - errors: Option, + errors: Option, } #[pyfunction] @@ -647,7 +656,7 @@ mod _codecs_windows { #[pyarg(positional)] data: ArgBytesLike, #[pyarg(positional, optional)] - errors: Option, + errors: Option, #[pyarg(positional, default = false)] #[allow(dead_code)] r#final: bool, @@ -747,7 +756,7 @@ mod _codecs_windows { #[pyarg(positional)] s: PyStrRef, #[pyarg(positional, optional)] - errors: Option, + errors: Option, } fn code_page_encoding_name(code_page: u32) -> String { @@ -1074,7 +1083,7 @@ mod _codecs_windows { #[pyarg(positional)] data: ArgBytesLike, #[pyarg(positional, optional)] - errors: Option, + errors: Option, #[pyarg(positional, default = false)] r#final: bool, } diff --git a/crates/vm/src/stdlib/collections.rs b/crates/vm/src/stdlib/collections.rs index 0f84db80e74..80f80e2d28f 100644 --- a/crates/vm/src/stdlib/collections.rs +++ b/crates/vm/src/stdlib/collections.rs @@ -7,7 +7,7 @@ mod _collections { atomic_func, builtins::{ IterStatus::{Active, Exhausted}, - PositionIterInternal, PyGenericAlias, PyInt, PyType, PyTypeRef, + PositionIterInternal, PyGenericAlias, PyInt, PyStr, PyType, PyTypeRef, }, common::lock::{PyMutex, PyRwLock, PyRwLockReadGuard, PyRwLockWriteGuard}, function::{KwArgs, OptionalArg, PyComparisonValue}, @@ -559,7 +559,7 @@ mod _collections { impl Representable for PyDeque { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr(zelf: &Py, vm: &VirtualMachine) -> PyResult> { let deque = zelf.borrow_deque().clone(); let class = zelf.class(); let class_name = class.name(); @@ -568,15 +568,24 @@ mod _collections { .map(|maxlen| format!("], maxlen={maxlen}")) .unwrap_or_else(|| "]".to_owned()); - let s = if zelf.__len__() == 0 { - format!("{class_name}([{closing_part})") - } else if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { - collection_repr(Some(&class_name), "[", &closing_part, deque.iter(), vm)? + if zelf.__len__() == 0 { + return Ok(vm.ctx.new_str(format!("{class_name}([{closing_part})"))); + } + if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { + Ok(vm.ctx.new_str(collection_repr( + Some(&class_name), + "[", + &closing_part, + deque.iter(), + vm, + )?)) } else { - "[...]".to_owned() - }; + Ok(vm.ctx.intern_str("[...]").to_owned()) + } + } - Ok(s) + fn repr_str(_zelf: &Py, _vm: &VirtualMachine) -> PyResult { + unreachable!("repr() is overridden directly") } } diff --git a/crates/vm/src/stdlib/ctypes.rs b/crates/vm/src/stdlib/ctypes.rs index 441a5ce37e4..b358d04a6e8 100644 --- a/crates/vm/src/stdlib/ctypes.rs +++ b/crates/vm/src/stdlib/ctypes.rs @@ -78,7 +78,7 @@ impl PyType { { return Err(vm.new_exception_msg( vm.ctx.exceptions.system_error.to_owned(), - format!("class \"{}\" already initialized", self.name()), + format!("class \"{}\" already initialized", self.name()).into(), )); } Ok(()) @@ -627,7 +627,7 @@ pub(crate) mod _ctypes { #[pyfunction] fn dlsym( handle: usize, - name: crate::builtins::PyStrRef, + name: crate::builtins::PyUtf8StrRef, vm: &VirtualMachine, ) -> PyResult { let symbol_name = alloc::ffi::CString::new(name.as_str()) @@ -689,7 +689,7 @@ pub(crate) mod _ctypes { // PyUnicode_CheckExact(cls) - string creates incomplete pointer type if let Some(s) = cls.downcast_ref::() { // Incomplete pointer type: _type_ not set, cache key is id(result) - let name = format!("LP_{}", s.as_str()); + let name = format!("LP_{}", s.as_wtf8()); let new_type = metaclass .as_object() @@ -1117,7 +1117,7 @@ pub(crate) mod _ctypes { arg_values.push(ptr); arg_types.push(Type::pointer()); } else if let Some(s) = arg.downcast_ref::() { - let ptr = s.as_str().as_ptr() as isize; + let ptr = s.as_bytes().as_ptr() as isize; arg_values.push(ptr); arg_types.push(Type::pointer()); } else { diff --git a/crates/vm/src/stdlib/ctypes/array.rs b/crates/vm/src/stdlib/ctypes/array.rs index 843dd7d5b8c..0672d0cbe80 100644 --- a/crates/vm/src/stdlib/ctypes/array.rs +++ b/crates/vm/src/stdlib/ctypes/array.rs @@ -1,6 +1,8 @@ use super::StgInfo; use super::base::{CDATA_BUFFER_METHODS, PyCData}; use super::type_info; +use crate::common::lock::LazyLock; +use crate::sliceable::SaturatedSliceIter; use crate::{ AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, VirtualMachine, atomic_func, @@ -455,7 +457,6 @@ impl Initializer for PyCArray { impl AsSequence for PyCArray { fn as_sequence() -> &'static PySequenceMethods { - use crate::common::lock::LazyLock; static AS_SEQUENCE: LazyLock = LazyLock::new(|| PySequenceMethods { length: atomic_func!(|seq, _vm| { let zelf = PyCArray::sequence_downcast(seq); @@ -480,7 +481,6 @@ impl AsSequence for PyCArray { impl AsMapping for PyCArray { fn as_mapping() -> &'static PyMappingMethods { - use crate::common::lock::LazyLock; static AS_MAPPING: LazyLock = LazyLock::new(|| PyMappingMethods { length: atomic_func!(|mapping, _vm| { let zelf = PyCArray::mapping_downcast(mapping); @@ -775,7 +775,12 @@ impl PyCArray { } Some("u") => { if let Some(s) = value.downcast_ref::() { - let code = s.as_str().chars().next().map(|c| c as u32).unwrap_or(0); + let code = s + .as_wtf8() + .code_points() + .next() + .map(|c| c.to_u32()) + .unwrap_or(0); if offset + WCHAR_SIZE <= buffer.len() { wchar_to_bytes(code, &mut buffer[offset..]); } @@ -808,7 +813,7 @@ impl PyCArray { let (ptr_val, converted) = if value.is(&vm.ctx.none) { (0usize, None) } else if let Some(s) = value.downcast_ref::() { - let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_str(), vm); + let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_wtf8(), vm); (ptr, Some(holder)) } else if let Ok(int_val) = value.try_index(vm) { (int_val.as_bigint().to_usize().unwrap_or(0), None) @@ -965,8 +970,6 @@ impl PyCArray { // Array_subscript slice handling fn getitem_by_slice(zelf: &Py, slice: &PySlice, vm: &VirtualMachine) -> PyResult { - use crate::sliceable::SaturatedSliceIter; - let stg = zelf.class().stg_info_opt(); let length = stg.as_ref().map_or(0, |i| i.length); @@ -1089,8 +1092,6 @@ impl PyCArray { value: PyObjectRef, vm: &VirtualMachine, ) -> PyResult<()> { - use crate::sliceable::SaturatedSliceIter; - let length = zelf.class().stg_info_opt().map_or(0, |i| i.length); // PySlice_Unpack + PySlice_AdjustIndices @@ -1249,15 +1250,15 @@ fn wchar_array_set_value( .ok_or_else(|| vm.new_type_error("unicode string expected"))?; let mut buffer = zelf.0.buffer.write(); let wchar_count = buffer.len() / WCHAR_SIZE; - let char_count = s.as_str().chars().count(); + let char_count = s.as_wtf8().code_points().count(); if char_count > wchar_count { return Err(vm.new_value_error("string too long")); } - for (i, ch) in s.as_str().chars().enumerate() { + for (i, ch) in s.as_wtf8().code_points().enumerate() { let offset = i * WCHAR_SIZE; - wchar_to_bytes(ch as u32, &mut buffer.to_mut()[offset..]); + wchar_to_bytes(ch.to_u32(), &mut buffer.to_mut()[offset..]); } let terminator_offset = char_count * WCHAR_SIZE; diff --git a/crates/vm/src/stdlib/ctypes/base.rs b/crates/vm/src/stdlib/ctypes/base.rs index ba2b987330a..90137f2549d 100644 --- a/crates/vm/src/stdlib/ctypes/base.rs +++ b/crates/vm/src/stdlib/ctypes/base.rs @@ -1,6 +1,9 @@ use super::array::{WCHAR_SIZE, wchar_from_bytes, wchar_to_bytes}; -use crate::builtins::{PyBytes, PyDict, PyMemoryView, PyStr, PyTuple, PyType, PyTypeRef}; +use crate::builtins::{ + PyBytes, PyDict, PyList, PyMemoryView, PyStr, PyTuple, PyType, PyTypeRef, PyUtf8Str, +}; use crate::class::StaticType; +use crate::convert::ToPyObject; use crate::function::{ArgBytesLike, OptionalArg, PySetterValue}; use crate::protocol::{BufferMethods, PyBuffer}; use crate::types::{Constructor, GetDescriptor, Representable}; @@ -16,6 +19,7 @@ use core::mem; use crossbeam_utils::atomic::AtomicCell; use num_traits::{Signed, ToPrimitive}; use rustpython_common::lock::PyRwLock; +use rustpython_common::wtf8::Wtf8; use widestring::WideChar; // StgInfo - Storage information for ctypes types @@ -363,11 +367,10 @@ pub(super) fn get_field_format( if let Ok(type_attr) = field_type.get_attr("_type_", vm) && let Some(type_str) = type_attr.downcast_ref::() { - let s = type_str.as_str(); - if !s.is_empty() { - return format!("{}{}", endian_prefix, s); - } - return s.to_string(); + let s = type_str + .to_str() + .expect("_type_ is validated as ASCII at type creation"); + return format!("{}{}", endian_prefix, s); } // Default: single byte @@ -431,10 +434,10 @@ pub(super) fn ensure_z_null_terminated( } /// Convert str to null-terminated wchar_t buffer. Returns (PyBytes holder, pointer). -pub(super) fn str_to_wchar_bytes(s: &str, vm: &VirtualMachine) -> (PyObjectRef, usize) { +pub(super) fn str_to_wchar_bytes(s: &Wtf8, vm: &VirtualMachine) -> (PyObjectRef, usize) { let wchars: Vec = s - .chars() - .map(|c| c as libc::wchar_t) + .code_points() + .map(|cp| cp.to_u32() as libc::wchar_t) .chain(core::iter::once(0)) .collect(); let ptr = wchars.as_ptr() as usize; @@ -964,9 +967,9 @@ impl PyCData { if let Some(str_val) = value.downcast_ref::() { // Convert str to wchar_t bytes (platform-dependent size) let mut wchar_bytes = Vec::with_capacity(size); - for ch in str_val.as_str().chars().take(size / WCHAR_SIZE) { + for cp in str_val.as_wtf8().code_points().take(size / WCHAR_SIZE) { let mut bytes = [0u8; 4]; - wchar_to_bytes(ch as u32, &mut bytes); + wchar_to_bytes(cp.to_u32(), &mut bytes); wchar_bytes.extend_from_slice(&bytes[..WCHAR_SIZE]); } // Pad with nulls to fill the array @@ -1299,13 +1302,13 @@ impl PyCData { .ok_or_else(|| vm.new_value_error("Library not found"))?; let inner_lib = library.lib.lock(); - let symbol_name_with_nul = format!("{}\0", name.as_str()); + let symbol_name_with_nul = format!("{}\0", name.as_wtf8()); let ptr: *const u8 = if let Some(lib) = &*inner_lib { unsafe { lib.get::<*const u8>(symbol_name_with_nul.as_bytes()) .map(|sym| *sym) .map_err(|_| { - vm.new_value_error(format!("symbol '{}' not found", name.as_str())) + vm.new_value_error(format!("symbol '{}' not found", name.as_wtf8())) })? } } else { @@ -1315,7 +1318,7 @@ impl PyCData { // dlsym can return NULL for symbols that resolve to NULL (e.g., GNU IFUNC) // Treat NULL addresses as errors if ptr.is_null() { - return Err(vm.new_value_error(format!("symbol '{}' not found", name.as_str()))); + return Err(vm.new_value_error(format!("symbol '{}' not found", name.as_wtf8()))); } // PyCData_AtAddress @@ -1706,7 +1709,7 @@ impl PyCField { "Z" => { // c_wchar_p: store pointer to null-terminated wchar_t buffer if let Some(s) = value.downcast_ref::() { - let (holder, ptr) = str_to_wchar_bytes(s.as_str(), vm); + let (holder, ptr) = str_to_wchar_bytes(s.as_wtf8(), vm); let mut result = vec![0u8; size]; let addr_bytes = ptr.to_ne_bytes(); let len = core::cmp::min(addr_bytes.len(), size); @@ -1759,7 +1762,7 @@ impl PyCField { if let Ok(type_code) = element_type.as_object().get_attr("_type_", vm) && let Some(s) = type_code.downcast_ref::() { - return s.as_str() == "c"; + return s.as_bytes() == b"c"; } } false @@ -1776,7 +1779,7 @@ impl PyCField { if let Ok(type_code) = element_type.as_object().get_attr("_type_", vm) && let Some(s) = type_code.downcast_ref::() { - return s.as_str() == "u"; + return s.as_bytes() == b"u"; } } false @@ -2508,9 +2511,6 @@ fn make_fields( offset: isize, vm: &VirtualMachine, ) -> PyResult<()> { - use crate::builtins::{PyList, PyTuple}; - use crate::convert::ToPyObject; - let fields = descr.proto.as_object().get_attr("_fields_", vm)?; let fieldlist: Vec = if let Some(list) = fields.downcast_ref::() { list.borrow_vec().to_vec() @@ -2532,7 +2532,7 @@ fn make_fields( let fname = field_tuple .first() .expect("len checked") - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| vm.new_type_error("field name must be a string"))?; let fdescr_obj = descr @@ -2555,7 +2555,10 @@ fn make_fields( } let new_descr = super::PyCField::new_from_field(fdescr, index, offset); - cls.set_attr(vm.ctx.intern_str(fname.as_str()), new_descr.to_pyobject(vm)); + cls.set_attr( + vm.ctx.intern_str(fname.as_wtf8()), + new_descr.to_pyobject(vm), + ); } Ok(()) @@ -2563,9 +2566,6 @@ fn make_fields( /// Process _anonymous_ attribute for struct/union pub(super) fn make_anon_fields(cls: &Py, vm: &VirtualMachine) -> PyResult<()> { - use crate::builtins::{PyList, PyTuple}; - use crate::convert::ToPyObject; - let anon = match cls.as_object().get_attr("_anonymous_", vm) { Ok(anon) => anon, Err(_) => return Ok(()), @@ -2586,18 +2586,21 @@ pub(super) fn make_anon_fields(cls: &Py, vm: &VirtualMachine) -> PyResul let descr_obj = cls .as_object() - .get_attr(vm.ctx.intern_str(fname.as_str()), vm)?; + .get_attr(vm.ctx.intern_str(fname.as_wtf8()), vm)?; let descr = descr_obj.downcast_ref::().ok_or_else(|| { vm.new_attribute_error(format!( "'{}' is specified in _anonymous_ but not in _fields_", - fname.as_str() + fname.as_wtf8() )) })?; let mut new_descr = super::PyCField::new_from_field(descr, 0, 0); new_descr.set_anonymous(true); - cls.set_attr(vm.ctx.intern_str(fname.as_str()), new_descr.to_pyobject(vm)); + cls.set_attr( + vm.ctx.intern_str(fname.as_wtf8()), + new_descr.to_pyobject(vm), + ); make_fields(cls, descr, descr.index, descr.offset, vm)?; } diff --git a/crates/vm/src/stdlib/ctypes/function.rs b/crates/vm/src/stdlib/ctypes/function.rs index d24102ed635..bf6dcfad53d 100644 --- a/crates/vm/src/stdlib/ctypes/function.rs +++ b/crates/vm/src/stdlib/ctypes/function.rs @@ -280,10 +280,10 @@ fn conv_param(value: &PyObject, vm: &VirtualMachine) -> PyResult { // 4. Python str -> wide string pointer (like PyUnicode_AsWideCharString) if let Some(s) = value.downcast_ref::() { - // Convert to null-terminated UTF-16 (wide string) + // Convert to null-terminated UTF-16, preserving lone surrogates let wide: Vec = s - .as_str() - .encode_utf16() + .as_wtf8() + .encode_wide() .chain(core::iter::once(0)) .collect(); let wide_bytes: Vec = wide.iter().flat_map(|&x| x.to_ne_bytes()).collect(); @@ -448,7 +448,7 @@ impl ReturnType for PyTypeRef { // Try to get _type_ attribute first (for ctypes types like c_void_p) if let Ok(type_attr) = self.as_object().get_attr(vm.ctx.intern_str("_type_"), vm) && let Some(s) = type_attr.downcast_ref::() - && let Some(ffi_type) = get_ffi_type(s.as_str()) + && let Some(ffi_type) = s.to_str().and_then(get_ffi_type) { return Some(ffi_type); } @@ -598,7 +598,7 @@ fn extract_ptr_from_arg(arg: &PyObject, vm: &VirtualMachine) -> PyResult } // PyStr: return internal buffer address if let Some(s) = arg.downcast_ref::() { - return Ok(s.as_str().as_ptr() as usize); + return Ok(s.as_bytes().as_ptr() as usize); } // PyBytes: return internal buffer address if let Some(bytes) = arg.downcast_ref::() { @@ -753,7 +753,9 @@ fn cast_check_pointertype(ctype: &PyObject, vm: &VirtualMachine) -> bool { if let Ok(type_attr) = ctype.get_attr("_type_", vm) && let Some(s) = type_attr.downcast_ref::() { - let c = s.as_str(); + let c = s + .to_str() + .expect("_type_ is validated as ASCII at type creation"); if c.len() == 1 && "sPzUZXO".contains(c) { return true; } @@ -790,7 +792,7 @@ pub(super) fn cast_impl( bytes.as_bytes().as_ptr() as usize } else if let Some(s) = obj.downcast_ref::() { // unicode/str → buffer address (c_void_p_from_param: PyUnicode_Check) - s.as_str().as_ptr() as usize + s.as_bytes().as_ptr() as usize } else if let Some(ptr) = obj.downcast_ref::() { // Pointer instance → contained pointer value ptr.get_ptr_value() diff --git a/crates/vm/src/stdlib/ctypes/pointer.rs b/crates/vm/src/stdlib/ctypes/pointer.rs index 6000bb57a37..a127f76ead1 100644 --- a/crates/vm/src/stdlib/ctypes/pointer.rs +++ b/crates/vm/src/stdlib/ctypes/pointer.rs @@ -639,7 +639,7 @@ impl PyCPointer { } else if type_code.as_deref() == Some("Z") && let Some(s) = value.downcast_ref::() { - let (holder, ptr_val) = super::base::str_to_wchar_bytes(s.as_str(), vm); + let (holder, ptr_val) = super::base::str_to_wchar_bytes(s.as_wtf8(), vm); unsafe { *(addr as *mut usize) = ptr_val; } diff --git a/crates/vm/src/stdlib/ctypes/simple.rs b/crates/vm/src/stdlib/ctypes/simple.rs index 67c07dcb73b..107d603677c 100644 --- a/crates/vm/src/stdlib/ctypes/simple.rs +++ b/crates/vm/src/stdlib/ctypes/simple.rs @@ -117,8 +117,8 @@ fn set_primitive(_type_: &str, value: &PyObject, vm: &VirtualMachine) -> PyResul } } "u" => { - if let Ok(b) = value.str(vm).map(|v| v.to_string().chars().count() == 1) { - if b { + if let Some(s) = value.downcast_ref::() { + if s.as_wtf8().code_points().count() == 1 { Ok(value.to_owned()) } else { Err(vm.new_type_error("one character unicode string expected")) @@ -331,7 +331,7 @@ impl PyCSimpleType { // c_wchar: 1 unicode character Some("u") => { if let Some(s) = value.downcast_ref::() - && s.as_str().chars().count() == 1 + && s.as_wtf8().code_points().count() == 1 { return create_simple_with_value("u", &value); } @@ -366,7 +366,7 @@ impl PyCSimpleType { Some("Z") => { // 1. str → create CArgObject with null-terminated wchar buffer if let Some(s) = value.downcast_ref::() { - let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_str(), vm); + let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_wtf8(), vm); return Ok(CArgObject { tag: b'Z', value: FfiArgValue::OwnedPointer(ptr, holder), @@ -407,7 +407,7 @@ impl PyCSimpleType { } // 3. str → create CArgObject with null-terminated wchar buffer if let Some(s) = value.downcast_ref::() { - let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_str(), vm); + let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_wtf8(), vm); return Ok(CArgObject { tag: b'Z', value: FfiArgValue::OwnedPointer(ptr, holder), @@ -775,15 +775,16 @@ fn value_to_bytes_endian( } "u" => { // c_wchar - platform-dependent size (2 on Windows, 4 on Unix) - if let Ok(s) = value.str(vm) - && let Some(c) = s.as_str().chars().next() - { - let mut buffer = vec![0u8; WCHAR_SIZE]; - wchar_to_bytes(c as u32, &mut buffer); - if swapped { - buffer.reverse(); + if let Some(s) = value.downcast_ref::() { + let mut cps = s.as_wtf8().code_points(); + if let (Some(c), None) = (cps.next(), cps.next()) { + let mut buffer = vec![0u8; WCHAR_SIZE]; + wchar_to_bytes(c.to_u32(), &mut buffer); + if swapped { + buffer.reverse(); + } + return buffer; } - return buffer; } vec![0; WCHAR_SIZE] } @@ -1050,7 +1051,7 @@ impl Constructor for PyCSimple { } else if _type_ == "Z" && let Some(s) = v.downcast_ref::() { - let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_str(), vm); + let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_wtf8(), vm); let buffer = ptr.to_ne_bytes().to_vec(); let cdata = PyCData::from_bytes(buffer, Some(holder)); return PyCSimple(cdata).into_ref_with_type(vm, cls).map(Into::into); @@ -1282,7 +1283,7 @@ impl PyCSimple { } else if type_code == "Z" && let Some(s) = value.downcast_ref::() { - let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_str(), vm); + let (holder, ptr) = super::base::str_to_wchar_bytes(s.as_wtf8(), vm); *zelf.0.buffer.write() = alloc::borrow::Cow::Owned(ptr.to_ne_bytes().to_vec()); *zelf.0.objects.write() = Some(holder); return Ok(()); diff --git a/crates/vm/src/stdlib/ctypes/structure.rs b/crates/vm/src/stdlib/ctypes/structure.rs index c0116d9d76c..69d267f287e 100644 --- a/crates/vm/src/stdlib/ctypes/structure.rs +++ b/crates/vm/src/stdlib/ctypes/structure.rs @@ -1,5 +1,5 @@ use super::base::{CDATA_BUFFER_METHODS, PyCData, PyCField, StgInfo, StgInfoFlags}; -use crate::builtins::{PyList, PyStr, PyTuple, PyType, PyTypeRef}; +use crate::builtins::{PyList, PyStr, PyTuple, PyType, PyTypeRef, PyUtf8Str}; use crate::convert::ToPyObject; use crate::function::{FuncArgs, OptionalArg, PySetterValue}; use crate::protocol::{BufferDescriptor, PyBuffer, PyNumberMethods}; @@ -318,9 +318,10 @@ impl PyCStructType { let name = field_tuple .first() .expect("len checked") - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| vm.new_type_error("field name must be a string"))? - .to_string(); + .as_str() + .to_owned(); let field_type = field_tuple.get(1).expect("len checked").clone(); @@ -604,7 +605,7 @@ impl SetAttr for PyCStructType { vm: &VirtualMachine, ) -> PyResult<()> { // Check if _fields_ is being set - if attr_name.as_str() == "_fields_" { + if attr_name.as_bytes() == b"_fields_" { let pytype: &Py = zelf.to_base(); // Check finalization in separate scope to release read lock before process_fields @@ -634,7 +635,7 @@ impl SetAttr for PyCStructType { return Ok(()); } // Delegate to PyType's setattro logic for type attributes - let attr_name_interned = vm.ctx.intern_str(attr_name.as_str()); + let attr_name_interned = vm.ctx.intern_str(attr_name.as_wtf8()); let pytype: &Py = zelf.to_base(); // Check for data descriptor first @@ -654,7 +655,7 @@ impl SetAttr for PyCStructType { return Err(vm.new_attribute_error(format!( "type object '{}' has no attribute '{}'", pytype.name(), - attr_name.as_str(), + attr_name.as_wtf8(), ))); } } @@ -747,7 +748,7 @@ impl PyCStructure { } if let Some(tuple) = field.downcast_ref::() && let Some(name) = tuple.first() - && let Some(name_str) = name.downcast_ref::() + && let Some(name_str) = name.downcast_ref::() { let field_name = name_str.as_str().to_owned(); // Check for duplicate in kwargs diff --git a/crates/vm/src/stdlib/ctypes/union.rs b/crates/vm/src/stdlib/ctypes/union.rs index fba9a75e955..7526fa92eff 100644 --- a/crates/vm/src/stdlib/ctypes/union.rs +++ b/crates/vm/src/stdlib/ctypes/union.rs @@ -1,6 +1,6 @@ use super::base::{CDATA_BUFFER_METHODS, StgInfoFlags}; use super::{PyCData, PyCField, StgInfo}; -use crate::builtins::{PyList, PyStr, PyTuple, PyType, PyTypeRef}; +use crate::builtins::{PyList, PyStr, PyTuple, PyType, PyTypeRef, PyUtf8Str}; use crate::convert::ToPyObject; use crate::function::{ArgBytesLike, FuncArgs, OptionalArg, PySetterValue}; use crate::protocol::{BufferDescriptor, PyBuffer}; @@ -217,9 +217,10 @@ impl PyCUnionType { let name = field_tuple .first() .expect("len checked") - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| vm.new_type_error("field name must be a string"))? - .to_string(); + .as_str() + .to_owned(); let field_type = field_tuple.get(1).expect("len checked").clone(); @@ -486,7 +487,7 @@ impl SetAttr for PyCUnionType { vm: &VirtualMachine, ) -> PyResult<()> { let pytype: &Py = zelf.to_base(); - let attr_name_interned = vm.ctx.intern_str(attr_name.as_str()); + let attr_name_interned = vm.ctx.intern_str(attr_name.as_wtf8()); // 1. First, do PyType's setattro (PyType_Type.tp_setattro first) // Check for data descriptor first @@ -495,7 +496,7 @@ impl SetAttr for PyCUnionType { if let Some(descriptor) = descr_set { descriptor(&attr, pytype.to_owned().into(), value.clone(), vm)?; // After successful setattro, check if _fields_ and call process_fields - if attr_name.as_str() == "_fields_" + if attr_name.as_bytes() == b"_fields_" && let PySetterValue::Assign(fields_value) = value { PyCUnionType::process_fields(pytype, fields_value, vm)?; @@ -506,7 +507,7 @@ impl SetAttr for PyCUnionType { // 2. If _fields_, call process_fields (which checks FINAL internally) // Check BEFORE writing to dict to avoid storing _fields_ when FINAL - if attr_name.as_str() == "_fields_" + if attr_name.as_bytes() == b"_fields_" && let PySetterValue::Assign(ref fields_value) = value { PyCUnionType::process_fields(pytype, fields_value.clone(), vm)?; @@ -526,7 +527,7 @@ impl SetAttr for PyCUnionType { return Err(vm.new_attribute_error(format!( "type object '{}' has no attribute '{}'", pytype.name(), - attr_name.as_str(), + attr_name.as_wtf8(), ))); } } @@ -618,7 +619,7 @@ impl PyCUnion { } if let Some(tuple) = field.downcast_ref::() && let Some(name) = tuple.first() - && let Some(name_str) = name.downcast_ref::() + && let Some(name_str) = name.downcast_ref::() { let field_name = name_str.as_str().to_owned(); // Check for duplicate in kwargs diff --git a/crates/vm/src/stdlib/functools.rs b/crates/vm/src/stdlib/functools.rs index 6c5c8f2e4c5..2c3f70ab52a 100644 --- a/crates/vm/src/stdlib/functools.rs +++ b/crates/vm/src/stdlib/functools.rs @@ -14,6 +14,7 @@ mod _functools { types::{Callable, Constructor, GetDescriptor, Representable}, }; use indexmap::IndexMap; + use rustpython_common::wtf8::Wtf8Buf; #[derive(FromArgs)] struct ReduceArgs { @@ -44,7 +45,7 @@ mod _functools { let exc_type = vm.ctx.exceptions.type_error.to_owned(); vm.new_exception_msg( exc_type, - "reduce() of empty sequence with no initial value".to_owned(), + "reduce() of empty sequence with no initial value".into(), ) })? }; @@ -424,9 +425,9 @@ mod _functools { // Add keywords from self.keywords for (key, value) in &*keywords { let key_str = key - .downcast::() - .map_err(|_| vm.new_type_error("keywords must be strings"))?; - final_kwargs.insert(key_str.as_str().to_owned(), value); + .downcast_ref::() + .ok_or_else(|| vm.new_type_error("keywords must be strings"))?; + final_kwargs.insert(key_str.expect_str().to_owned(), value); } // Add keywords from args.kwargs (these override self.keywords) @@ -455,7 +456,7 @@ mod _functools { impl Representable for PyPartial { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { // Check for recursive repr let obj = zelf.as_object(); if let Some(_guard) = ReprGuard::enter(vm, obj) { @@ -469,52 +470,46 @@ mod _functools { ) }; - let func_repr = func.repr(vm)?; - let mut parts = vec![func_repr.as_str().to_owned()]; + let qualname = zelf.class().__qualname__(vm); + let qualname_wtf8 = qualname + .downcast_ref::() + .map(|s| s.as_wtf8().to_owned()) + .unwrap_or_else(|| Wtf8Buf::from(zelf.class().name().to_owned())); + let module = zelf.class().__module__(vm); + + let mut result = Wtf8Buf::new(); + if let Ok(module_str) = module.downcast::() { + let module_name = module_str.as_wtf8(); + if module_name != "builtins" && !module_name.is_empty() { + result.push_wtf8(module_name); + result.push_char('.'); + } + } + result.push_wtf8(&qualname_wtf8); + result.push_char('('); + result.push_wtf8(func.repr(vm)?.as_wtf8()); for arg in args.as_slice() { - parts.push(arg.repr(vm)?.as_str().to_owned()); + result.push_str(", "); + result.push_wtf8(arg.repr(vm)?.as_wtf8()); } for (key, value) in &*keywords { - // For string keys, use them directly without quotes - let key_part = if let Ok(s) = key.clone().downcast::() { - s.as_str().to_owned() + result.push_str(", "); + let key_str = if let Ok(s) = key.clone().downcast::() { + s } else { - // For non-string keys, convert to string using __str__ - key.str(vm)?.as_str().to_owned() + key.str(vm)? }; - let value_str = value.repr(vm)?; - parts.push(format!( - "{key_part}={value_str}", - value_str = value_str.as_str() - )); + result.push_wtf8(key_str.as_wtf8()); + result.push_char('='); + result.push_wtf8(value.repr(vm)?.as_wtf8()); } - let qualname = zelf.class().__qualname__(vm); - let qualname_str = qualname - .downcast::() - .map(|s| s.as_str().to_owned()) - .unwrap_or_else(|_| zelf.class().name().to_owned()); - let module = zelf.class().__module__(vm); - - let qualified_name = match module.downcast::() { - Ok(module_str) => { - let module_name = module_str.as_str(); - match module_name { - "builtins" | "" => qualname_str, - _ => format!("{module_name}.{qualname_str}"), - } - } - Err(_) => qualname_str, - }; - - Ok(format!( - "{qualified_name}({parts})", - parts = parts.join(", ") - )) + result.push_char(')'); + Ok(result) } else { - Ok("...".to_owned()) + Ok(Wtf8Buf::from("...")) } } } diff --git a/crates/vm/src/stdlib/imp.rs b/crates/vm/src/stdlib/imp.rs index cf9aba02265..fefcd383f58 100644 --- a/crates/vm/src/stdlib/imp.rs +++ b/crates/vm/src/stdlib/imp.rs @@ -71,7 +71,7 @@ impl FrozenError { Excluded => format!("Excluded frozen object named {mod_name}"), Invalid => format!("Frozen object named {mod_name} is invalid"), }; - vm.new_import_error(msg, vm.ctx.new_str(mod_name)) + vm.new_import_error(msg, vm.ctx.new_utf8_str(mod_name)) } } @@ -106,7 +106,7 @@ fn find_frozen(name: &str, vm: &VirtualMachine) -> Result bool { + fn is_builtin(name: PyUtf8StrRef, vm: &VirtualMachine) -> bool { vm.state.module_defs.contains_key(name.as_str()) } #[pyfunction] - fn is_frozen(name: PyStrRef, vm: &VirtualMachine) -> bool { + fn is_frozen(name: PyUtf8StrRef, vm: &VirtualMachine) -> bool { super::find_frozen(name.as_str(), vm).is_ok() } #[pyfunction] fn create_builtin(spec: PyObjectRef, vm: &VirtualMachine) -> PyResult { let sys_modules = vm.sys_module.get_attr("modules", vm).unwrap(); - let name: PyStrRef = spec.get_attr("name", vm)?.try_into_value(vm)?; + let name: PyUtf8StrRef = spec.get_attr("name", vm)?.try_into_value(vm)?; // Check sys.modules first if let Ok(module) = sys_modules.get_item(&*name, vm) { return Ok(module); } - // Try multi-phase init modules first (they need special handling) - if let Some(&def) = vm.state.module_defs.get(name.as_str()) { + let name_str = name.as_str(); + if let Some(&def) = vm.state.module_defs.get(name_str) { // Phase 1: Create module (use create slot if provided, else default creation) let module = if let Some(create) = def.slots.create { // Custom module creation @@ -163,7 +163,7 @@ mod _imp { module.__init_methods(vm)?; // Add to sys.modules BEFORE exec (critical for circular import handling) - sys_modules.set_item(&*name, module.clone().into(), vm)?; + sys_modules.set_item(name.as_pystr(), module.clone().into(), vm)?; // Phase 2: Call exec slot (can safely import other modules now) if let Some(exec) = def.slots.exec { @@ -184,7 +184,7 @@ mod _imp { #[pyfunction] fn get_frozen_object( - name: PyStrRef, + name: PyUtf8StrRef, data: OptionalArg, vm: &VirtualMachine, ) -> PyResult> { @@ -198,7 +198,7 @@ mod _imp { let invalid_err = || { vm.new_import_error( format!("Frozen object named '{}' is invalid", name.as_str()), - name.clone(), + name.clone().into_wtf8(), ) }; let bag = crate::builtins::code::PyObjBag(&vm.ctx); @@ -211,15 +211,16 @@ mod _imp { } #[pyfunction] - fn init_frozen(name: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn init_frozen(name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { import::import_frozen(vm, name.as_str()) } #[pyfunction] - fn is_frozen_package(name: PyStrRef, vm: &VirtualMachine) -> PyResult { - super::find_frozen(name.as_str(), vm) + fn is_frozen_package(name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { + let name_str = name.as_str(); + super::find_frozen(name_str, vm) .map(|frozen| frozen.package) - .map_err(|e| e.to_pyexception(name.as_str(), vm)) + .map_err(|e| e.to_pyexception(name_str, vm)) } #[pyfunction] @@ -230,7 +231,7 @@ mod _imp { #[pyfunction] fn _fix_co_filename(code: PyRef, path: PyStrRef, vm: &VirtualMachine) { let old_name = code.source_path(); - let new_name = vm.ctx.intern_str(path.as_str()); + let new_name = vm.ctx.intern_str(path.as_wtf8()); super::update_code_filenames(&code, old_name, new_name); } @@ -240,7 +241,7 @@ mod _imp { .state .frozen .keys() - .map(|&name| vm.ctx.new_str(name).into()) + .map(|&name| vm.ctx.new_utf8_str(name).into()) .collect(); Ok(names) } @@ -248,7 +249,7 @@ mod _imp { #[allow(clippy::type_complexity)] #[pyfunction] fn find_frozen( - name: PyStrRef, + name: PyUtf8StrRef, withdata: OptionalArg, vm: &VirtualMachine, ) -> PyResult>, bool, Option)>> { @@ -259,19 +260,20 @@ mod _imp { unimplemented!(); } - let info = match super::find_frozen(name.as_str(), vm) { + let name_str = name.as_str(); + let info = match super::find_frozen(name_str, vm) { Ok(info) => info, Err(NotFound | Disabled | BadName) => return Ok(None), - Err(e) => return Err(e.to_pyexception(name.as_str(), vm)), + Err(e) => return Err(e.to_pyexception(name_str, vm)), }; // When origname is empty (e.g. __hello_only__), return None. // Otherwise return the resolved alias name. - let origname_str = super::resolve_frozen_alias(name.as_str()); + let origname_str = super::resolve_frozen_alias(name_str); let origname = if origname_str.is_empty() { None } else { - Some(vm.ctx.new_str(origname_str)) + Some(vm.ctx.new_utf8_str(origname_str).into()) }; Ok(Some((None, info.package, origname))) } diff --git a/crates/vm/src/stdlib/io.rs b/crates/vm/src/stdlib/io.rs index 64c02ee6ae4..93fe6339034 100644 --- a/crates/vm/src/stdlib/io.rs +++ b/crates/vm/src/stdlib/io.rs @@ -1575,10 +1575,11 @@ mod _io { args.bind(vm).map_err(|e| { let str_repr = e .__str__(vm) - .map(|s| s.as_str().to_owned()) - .unwrap_or_else(|_| "".to_owned()); + .as_ref() + .map_or("".as_ref(), |s| s.as_wtf8()) + .to_owned(); let msg = format!("{}() {}", Self::CLASS_NAME, str_repr); - vm.new_exception_msg(e.class().to_owned(), msg) + vm.new_exception_msg(e.class().to_owned(), msg.into()) })?; zelf.init(raw, BufferSize { buffer_size }, vm) } @@ -2273,7 +2274,7 @@ mod _io { #[pyarg(any, default)] encoding: Option, #[pyarg(any, default)] - errors: Option, + errors: Option, #[pyarg(any, default)] newline: OptionalOption, #[pyarg(any, default)] @@ -2457,7 +2458,7 @@ mod _io { encoder: Option<(PyObjectRef, Option)>, decoder: Option, encoding: PyUtf8StrRef, - errors: PyStrRef, + errors: PyUtf8StrRef, newline: Newlines, line_buffering: bool, write_through: bool, @@ -2727,17 +2728,14 @@ mod _io { let encoding = Self::resolve_encoding(args.encoding, vm)?; - let errors = args - .errors - .unwrap_or_else(|| identifier!(vm, strict).to_owned()); + let errors = args.errors.unwrap_or_else(|| vm.ctx.new_utf8_str("strict")); Self::validate_errors(&errors, vm)?; let has_read1 = vm.get_attribute_opt(buffer.clone(), "read1")?.is_some(); let seekable = vm.call_method(&buffer, "seekable", ())?.try_to_bool(vm)?; let newline = match args.newline { - OptionalArg::Missing => Newlines::default(), - OptionalArg::Present(None) => Newlines::default(), + OptionalArg::Missing | OptionalArg::Present(None) => Newlines::default(), OptionalArg::Present(Some(newline)) => newline, }; let (encoder, decoder) = @@ -2808,15 +2806,10 @@ mod _io { .map_err(|_| vm.new_value_error("I/O operation on uninitialized object")) } - fn validate_errors(errors: &PyStrRef, vm: &VirtualMachine) -> PyResult<()> { - if errors.as_wtf8().as_bytes().contains(&0) { + fn validate_errors(errors: &PyRef, vm: &VirtualMachine) -> PyResult<()> { + if errors.as_str().contains('\0') { return Err(cstring_error(vm)); } - if !errors.as_wtf8().is_utf8() { - return Err(vm.new_unicode_encode_error( - "'utf-8' codec can't encode character: surrogates not allowed".to_owned(), - )); - } vm.state .codec_registry .lookup_error(errors.as_str(), vm) @@ -2910,7 +2903,7 @@ mod _io { fn find_coder( buffer: &PyObject, encoding: &str, - errors: &Py, + errors: &Py, newline: Newlines, vm: &VirtualMachine, ) -> PyResult<( @@ -2923,10 +2916,11 @@ mod _io { "'{encoding}' is not a text encoding; use codecs.open() to handle arbitrary codecs" ))); } + let errors = errors.to_owned().into_wtf8(); let encoder = if vm.call_method(buffer, "writable", ())?.try_to_bool(vm)? { let incremental_encoder = - match codec.get_incremental_encoder(Some(errors.to_owned()), vm) { + match codec.get_incremental_encoder(Some(errors.clone()), vm) { Ok(encoder) => encoder, Err(err) if err.fast_isinstance(vm.ctx.exceptions.type_error) @@ -2937,7 +2931,7 @@ mod _io { .and_then(|obj| obj.downcast::().ok()); StatelessIncrementalEncoder { encode: codec.get_encode_func().to_owned(), - errors: Some(errors.to_owned()), + errors: Some(errors.clone()), name, } .into_ref(&vm.ctx) @@ -2948,7 +2942,7 @@ mod _io { let encoding_name = vm.get_attribute_opt(incremental_encoder.clone(), "name")?; let encode_func = encoding_name.and_then(|name| { let name = name.downcast_ref::()?; - match name.as_str() { + match name.to_str()? { "utf-8" => Some(textio_encode_utf8 as EncodeFunc), _ => None, } @@ -2959,7 +2953,7 @@ mod _io { }; let decoder = if vm.call_method(buffer, "readable", ())?.try_to_bool(vm)? { - let decoder = match codec.get_incremental_decoder(Some(errors.to_owned()), vm) { + let decoder = match codec.get_incremental_decoder(Some(errors.clone()), vm) { Ok(decoder) => decoder, Err(err) if err.fast_isinstance(vm.ctx.exceptions.type_error) @@ -2967,7 +2961,7 @@ mod _io { { StatelessIncrementalDecoder { decode: codec.get_decode_func().to_owned(), - errors: Some(errors.to_owned()), + errors: Some(errors), } .into_ref(&vm.ctx) .into() @@ -3040,7 +3034,7 @@ mod _io { errors_changed = errs.as_str() != errors.as_str(); errors = errs; } else if encoding_changed { - errors = identifier!(vm, strict).to_owned(); + errors = identifier_utf8!(vm, strict).to_owned(); errors_changed = true; } @@ -3433,7 +3427,7 @@ mod _io { } #[pygetset] - fn errors(&self, vm: &VirtualMachine) -> PyResult { + fn errors(&self, vm: &VirtualMachine) -> PyResult { Ok(self.lock(vm)?.errors.clone()) } @@ -3995,7 +3989,7 @@ mod _io { impl Representable for TextIOWrapper { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr(zelf: &Py, vm: &VirtualMachine) -> PyResult> { let type_name = zelf.class().slot_name(); let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) else { return Err( @@ -4004,19 +3998,19 @@ mod _io { }; let Some(data) = zelf.data.lock() else { // Reentrant call - return Ok(format!("<{type_name}>")); + return Ok(vm.ctx.new_str(Wtf8Buf::from(format!("<{type_name}>")))); }; let Some(data) = data.as_ref() else { return Err(vm.new_value_error("I/O operation on uninitialized object".to_owned())); }; - let mut result = format!("<{type_name}"); + let mut result = Wtf8Buf::from(format!("<{type_name}")); // Add name if present if let Ok(Some(name)) = vm.get_attribute_opt(data.buffer.clone(), "name") { let name_repr = name.repr(vm)?; - result.push_str(" name="); - result.push_str(name_repr.as_str()); + result.push_wtf8(" name=".as_ref()); + result.push_wtf8(name_repr.as_wtf8()); } // Add mode if present (prefer the wrapper's attribute) @@ -4029,17 +4023,20 @@ mod _io { }; if let Some(mode) = mode_obj { let mode_repr = mode.repr(vm)?; - result.push_str(" mode="); - result.push_str(mode_repr.as_str()); + result.push_wtf8(" mode=".as_ref()); + result.push_wtf8(mode_repr.as_wtf8()); } - // Add encoding - result.push_str(" encoding='"); - result.push_str(data.encoding.as_str()); - result.push('\''); + // Add encoding (always valid UTF-8) + result.push_wtf8(" encoding='".as_ref()); + result.push_wtf8(data.encoding.as_str().as_ref()); + result.push_wtf8("'>".as_ref()); - result.push('>'); - Ok(result) + Ok(vm.ctx.new_str(result)) + } + + fn repr_str(_zelf: &Py, _vm: &VirtualMachine) -> PyResult { + unreachable!("repr() is overridden directly") } } @@ -4240,7 +4237,7 @@ mod _io { output.to_mut().insert(0, '\r'.into()); self.pendingcr = false; } - if !final_ && let Some(s) = output.strip_suffix("\r".as_ref()) { + if !final_ && let Some(s) = output.strip_suffix("\r") { output = Cow::Owned(s.to_owned()); self.pendingcr = true; } @@ -4935,9 +4932,9 @@ mod _io { #[pyarg(any, default)] pub encoding: Option, #[pyarg(any, default)] - pub errors: Option, + pub errors: Option, #[pyarg(any, default)] - pub newline: Option, + pub newline: Option, #[pyarg(any, default = true)] pub closefd: bool, #[pyarg(any, default)] @@ -5237,7 +5234,7 @@ mod fileio { AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, VirtualMachine, builtins::{PyBaseExceptionRef, PyUtf8Str, PyUtf8StrRef}, - common::crt_fd, + common::{crt_fd, wtf8::Wtf8Buf}, convert::{IntoPyException, ToPyException}, exceptions::OSErrorBuilder, function::{ArgBytesLike, ArgMemoryBuffer, OptionalArg, OptionalOption}, @@ -5864,8 +5861,8 @@ mod fileio { if zelf.fd.load() >= 0 && zelf.closefd.load() { let repr = source .repr(vm) - .map(|s| s.as_str().to_owned()) - .unwrap_or_else(|_| "".to_owned()); + .map(|s| s.as_wtf8().to_owned()) + .unwrap_or_else(|_| Wtf8Buf::from("")); if let Err(e) = crate::stdlib::warnings::warn( vm.ctx.exceptions.resource_warning, format!("unclosed file {repr}"), @@ -5902,7 +5899,7 @@ mod winconsoleio { use crate::{ AsObject, Py, PyObject, PyObjectRef, PyRef, PyResult, TryFromObject, VirtualMachine, builtins::{PyBaseExceptionRef, PyUtf8StrRef}, - common::lock::PyMutex, + common::{lock::PyMutex, wtf8::Wtf8Buf}, convert::{IntoPyException, ToPyException}, function::{ArgBytesLike, ArgMemoryBuffer, OptionalArg}, types::{Constructor, DefaultConstructor, Destructor, Initializer, Representable}, @@ -6187,8 +6184,8 @@ mod winconsoleio { let name_str = nameobj.str(vm)?; let wide: Vec = name_str - .as_str() - .encode_utf16() + .as_wtf8() + .encode_wide() .chain(core::iter::once(0)) .collect(); @@ -6417,8 +6414,8 @@ mod winconsoleio { if zelf.fd.load() >= 0 && zelf.closefd.load() { let repr = source .repr(vm) - .map(|s| s.as_str().to_owned()) - .unwrap_or_else(|_| "".to_owned()); + .map(|s| s.as_wtf8().to_owned()) + .unwrap_or_else(|_| Wtf8Buf::from("")); if let Err(e) = crate::stdlib::warnings::warn( vm.ctx.exceptions.resource_warning, format!("unclosed file {repr}"), diff --git a/crates/vm/src/stdlib/itertools.rs b/crates/vm/src/stdlib/itertools.rs index d1c188b8892..2140f44e56e 100644 --- a/crates/vm/src/stdlib/itertools.rs +++ b/crates/vm/src/stdlib/itertools.rs @@ -20,6 +20,7 @@ mod decl { use crossbeam_utils::atomic::AtomicCell; use malachite_bigint::BigInt; use num_traits::One; + use rustpython_common::wtf8::Wtf8Buf; use alloc::fmt; use num_traits::{Signed, ToPrimitive}; @@ -220,13 +221,17 @@ mod decl { impl Representable for PyItertoolsCount { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let cur = format!("{}", zelf.cur.read().clone().repr(vm)?); + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let cur_repr = zelf.cur.read().clone().repr(vm)?; let step = &zelf.step; - if vm.bool_eq(step, vm.ctx.new_int(1).as_object())? { - return Ok(format!("count({cur})")); + let mut result = Wtf8Buf::from("count("); + result.push_wtf8(cur_repr.as_wtf8()); + if !vm.bool_eq(step, vm.ctx.new_int(1).as_object())? { + result.push_str(", "); + result.push_wtf8(step.repr(vm)?.as_wtf8()); } - Ok(format!("count({}, {})", cur, step.repr(vm)?)) + result.push_char(')'); + Ok(result) } } @@ -345,13 +350,15 @@ mod decl { impl Representable for PyItertoolsRepeat { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let mut fmt = format!("{}", &zelf.object.repr(vm)?); + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let mut result = Wtf8Buf::from("repeat("); + result.push_wtf8(zelf.object.repr(vm)?.as_wtf8()); if let Some(ref times) = zelf.times { - fmt.push_str(", "); - fmt.push_str(×.read().to_string()); + result.push_str(", "); + result.push_str(×.read().to_string()); } - Ok(format!("repeat({fmt})")) + result.push_char(')'); + Ok(result) } } diff --git a/crates/vm/src/stdlib/marshal.rs b/crates/vm/src/stdlib/marshal.rs index cf7abe65194..412d71f49e2 100644 --- a/crates/vm/src/stdlib/marshal.rs +++ b/crates/vm/src/stdlib/marshal.rs @@ -226,7 +226,7 @@ mod decl { marshal::deserialize_value(&mut &buf[..], PyMarshalBag(vm)).map_err(|e| match e { marshal::MarshalError::Eof => vm.new_exception_msg( vm.ctx.exceptions.eof_error.to_owned(), - "marshal data too short".to_owned(), + "marshal data too short".into(), ), marshal::MarshalError::InvalidBytecode => { vm.new_value_error("Couldn't deserialize python bytecode") diff --git a/crates/vm/src/stdlib/msvcrt.rs b/crates/vm/src/stdlib/msvcrt.rs index cf194a3f7ba..93364ea3596 100644 --- a/crates/vm/src/stdlib/msvcrt.rs +++ b/crates/vm/src/stdlib/msvcrt.rs @@ -82,7 +82,7 @@ mod msvcrt { #[pyfunction] fn putwch(s: PyStrRef, vm: &VirtualMachine) -> PyResult<()> { let c = s - .as_str() + .expect_str() .chars() .exactly_one() .map_err(|_| vm.new_type_error("putch() argument must be a string of length 1"))?; @@ -107,7 +107,7 @@ mod msvcrt { #[pyfunction] fn ungetwch(s: PyStrRef, vm: &VirtualMachine) -> PyResult<()> { let c = - s.as_str().chars().exactly_one().map_err(|_| { + s.expect_str().chars().exactly_one().map_err(|_| { vm.new_type_error("ungetwch() argument must be a string of length 1") })?; let ret = unsafe { suppress_iph!(_ungetwch(c as u32)) }; diff --git a/crates/vm/src/stdlib/nt.rs b/crates/vm/src/stdlib/nt.rs index 5b2cf3b92f5..f9fad0a2033 100644 --- a/crates/vm/src/stdlib/nt.rs +++ b/crates/vm/src/stdlib/nt.rs @@ -7,7 +7,9 @@ pub use module::raw_set_handle_inheritable; pub(crate) mod module { use crate::{ Py, PyResult, TryFromObject, VirtualMachine, - builtins::{PyBaseExceptionRef, PyDictRef, PyListRef, PyStrRef, PyTupleRef}, + builtins::{ + PyBaseExceptionRef, PyBytes, PyDictRef, PyListRef, PyStr, PyStrRef, PyTupleRef, + }, common::{crt_fd, suppress_iph, windows::ToWideString}, convert::ToPyException, exceptions::OSErrorBuilder, @@ -15,9 +17,9 @@ pub(crate) mod module { ospath::{OsPath, OsPathOrFd}, stdlib::os::{_os, DirFd, SupportFunc, TargetIsDirectory}, }; - use core::mem::MaybeUninit; use libc::intptr_t; + use rustpython_common::wtf8::Wtf8Buf; use std::os::windows::io::AsRawHandle; use std::{env, io, os::windows::ffi::OsStringExt}; use windows_sys::Win32::{ @@ -1188,7 +1190,7 @@ pub(crate) mod module { let argv = vm.extract_elements_with(argv.as_ref(), |obj| { let arg = PyStrRef::try_from_object(vm, obj)?; - make_widestring(arg.as_str()) + make_widestring(arg.expect_str()) })?; let first = argv @@ -1229,7 +1231,7 @@ pub(crate) mod module { let argv = vm.extract_elements_with(argv.as_ref(), |obj| { let arg = PyStrRef::try_from_object(vm, obj)?; - make_widestring(arg.as_str()) + make_widestring(arg.expect_str()) })?; let first = argv @@ -1252,8 +1254,8 @@ pub(crate) mod module { for (key, value) in env.into_iter() { let key = PyStrRef::try_from_object(vm, key)?; let value = PyStrRef::try_from_object(vm, value)?; - let key_str = key.as_str(); - let value_str = value.as_str(); + let key_str = key.expect_str(); + let value_str = value.expect_str(); // Validate: no null characters in key or value if key_str.contains('\0') || value_str.contains('\0') { @@ -1463,9 +1465,6 @@ pub(crate) mod module { #[pyfunction] fn _path_splitroot_ex(path: crate::PyObjectRef, vm: &VirtualMachine) -> PyResult { - use crate::builtins::{PyBytes, PyStr}; - use rustpython_common::wtf8::Wtf8Buf; - // Handle path-like objects via os.fspath, but without null check (non_strict=True) let path = if let Some(fspath) = vm.get_method(path.clone(), identifier!(vm, __fspath__)) { fspath?.call((), vm)? @@ -1487,7 +1486,8 @@ pub(crate) mod module { "'utf-8' codec can't decode byte {:#x} in position {}: invalid start byte", b.as_bytes().get(e.valid_up_to()).copied().unwrap_or(0), e.valid_up_to() - ), + ) + .into(), ) })?; let wide: Vec = s.encode_utf16().collect(); @@ -1535,15 +1535,7 @@ pub(crate) mod module { } #[pyfunction] - fn _path_splitroot( - path: OsPath, - _vm: &VirtualMachine, - ) -> ( - rustpython_common::wtf8::Wtf8Buf, - rustpython_common::wtf8::Wtf8Buf, - ) { - use rustpython_common::wtf8::Wtf8Buf; - + fn _path_splitroot(path: OsPath, _vm: &VirtualMachine) -> (Wtf8Buf, Wtf8Buf) { let orig: Vec<_> = path.path.to_wide(); if orig.is_empty() { return (Wtf8Buf::new(), Wtf8Buf::new()); @@ -1706,9 +1698,6 @@ pub(crate) mod module { #[pyfunction] fn _path_normpath(path: crate::PyObjectRef, vm: &VirtualMachine) -> PyResult { - use crate::builtins::{PyBytes, PyStr}; - use rustpython_common::wtf8::Wtf8Buf; - // Handle path-like objects via os.fspath let path = if let Some(fspath) = vm.get_method(path.clone(), identifier!(vm, __fspath__)) { fspath?.call((), vm)? @@ -1727,7 +1716,8 @@ pub(crate) mod module { "'utf-8' codec can't decode byte {:#x} in position {}: invalid start byte", b.as_bytes().get(e.valid_up_to()).copied().unwrap_or(0), e.valid_up_to() - ), + ) + .into(), ) })?; let wide: Vec = s.encode_utf16().collect(); diff --git a/crates/vm/src/stdlib/operator.rs b/crates/vm/src/stdlib/operator.rs index fb0d652361e..38b0f715d31 100644 --- a/crates/vm/src/stdlib/operator.rs +++ b/crates/vm/src/stdlib/operator.rs @@ -4,8 +4,8 @@ pub(crate) use _operator::module_def; mod _operator { use crate::{ AsObject, Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, - builtins::{PyInt, PyIntRef, PyStr, PyStrRef, PyTupleRef, PyType, PyTypeRef}, - common::wtf8::Wtf8, + builtins::{PyInt, PyIntRef, PyStr, PyStrRef, PyTupleRef, PyType, PyTypeRef, PyUtf8StrRef}, + common::wtf8::{Wtf8, Wtf8Buf}, function::{ArgBytesLike, Either, FuncArgs, KwArgs, OptionalArg}, protocol::PyIter, recursion::ReprGuard, @@ -432,17 +432,22 @@ mod _operator { impl Representable for PyAttrGetter { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let fmt = if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { - let mut parts = Vec::with_capacity(zelf.attrs.len()); + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let mut result = Wtf8Buf::from("operator.attrgetter("); + if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { + let mut first = true; for part in &zelf.attrs { - parts.push(part.as_object().repr(vm)?.as_str().to_owned()); + if !first { + result.push_str(", "); + } + first = false; + result.push_wtf8(part.as_object().repr(vm)?.as_wtf8()); } - parts.join(", ") } else { - "...".to_owned() - }; - Ok(format!("operator.attrgetter({fmt})")) + result.push_str("..."); + } + result.push_char(')'); + Ok(result) } } @@ -505,17 +510,22 @@ mod _operator { impl Representable for PyItemGetter { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let fmt = if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { - let mut items = Vec::with_capacity(zelf.items.len()); + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let mut result = Wtf8Buf::from("operator.itemgetter("); + if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { + let mut first = true; for item in &zelf.items { - items.push(item.repr(vm)?.as_str().to_owned()); + if !first { + result.push_str(", "); + } + first = false; + result.push_wtf8(item.repr(vm)?.as_wtf8()); } - items.join(", ") } else { - "...".to_owned() - }; - Ok(format!("operator.itemgetter({fmt})")) + result.push_str("..."); + } + result.push_char(')'); + Ok(result) } } @@ -530,7 +540,7 @@ mod _operator { #[pyclass(name = "methodcaller")] #[derive(Debug, PyPayload)] struct PyMethodCaller { - name: PyStrRef, + name: PyUtf8StrRef, args: FuncArgs, } @@ -570,11 +580,10 @@ mod _operator { (name, args): Self::Args, vm: &VirtualMachine, ) -> PyResult { - if let Ok(name) = name.try_into_value(vm) { - Ok(Self { name, args }) - } else { - Err(vm.new_type_error("method name must be a string")) - } + let name = name + .try_into_value(vm) + .map_err(|_| vm.new_type_error("method name must be a string"))?; + Ok(Self { name, args }) } } @@ -589,32 +598,27 @@ mod _operator { impl Representable for PyMethodCaller { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let fmt = if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let mut result = Wtf8Buf::from("operator.methodcaller("); + if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { let args = &zelf.args.args; let kwargs = &zelf.args.kwargs; - let mut fmt = vec![zelf.name.as_object().repr(vm)?.as_str().to_owned()]; - if !args.is_empty() { - let mut parts = Vec::with_capacity(args.len()); - for v in args { - parts.push(v.repr(vm)?.as_str().to_owned()); - } - fmt.push(parts.join(", ")); + result.push_wtf8(zelf.name.as_object().repr(vm)?.as_wtf8()); + for v in args { + result.push_str(", "); + result.push_wtf8(v.repr(vm)?.as_wtf8()); } - // build name=value pairs from KwArgs. - if !kwargs.is_empty() { - let mut parts = Vec::with_capacity(kwargs.len()); - for (key, value) in kwargs { - let value_repr = value.repr(vm)?; - parts.push(format!("{key}={value_repr}")); - } - fmt.push(parts.join(", ")); + for (key, value) in kwargs { + result.push_str(", "); + result.push_str(key); + result.push_char('='); + result.push_wtf8(value.repr(vm)?.as_wtf8()); } - fmt.join(", ") } else { - "...".to_owned() - }; - Ok(format!("operator.methodcaller({fmt})")) + result.push_str("..."); + } + result.push_char(')'); + Ok(result) } } } diff --git a/crates/vm/src/stdlib/os.rs b/crates/vm/src/stdlib/os.rs index d67e97874f4..fd03be9175e 100644 --- a/crates/vm/src/stdlib/os.rs +++ b/crates/vm/src/stdlib/os.rs @@ -179,6 +179,7 @@ pub(super) mod _os { use core::time::Duration; use crossbeam_utils::atomic::AtomicCell; use itertools::Itertools; + use rustpython_common::wtf8::Wtf8Buf; use std::{env, fs, fs::OpenOptions, io, path::PathBuf, time::SystemTime}; const OPEN_DIR_FD: bool = cfg!(not(any(windows, target_os = "redox"))); @@ -357,7 +358,8 @@ pub(super) mod _os { #[pyfunction] fn mkdirs(path: PyStrRef, vm: &VirtualMachine) -> PyResult<()> { - fs::create_dir_all(path.as_str()).map_err(|err| err.into_pyexception(vm)) + let os_path = vm.fsencode(&path)?; + fs::create_dir_all(&*os_path).map_err(|err| err.into_pyexception(vm)) } #[cfg(not(windows))] @@ -491,8 +493,8 @@ pub(super) mod _os { #[cfg(windows)] #[pyfunction] fn putenv(key: PyStrRef, value: PyStrRef, vm: &VirtualMachine) -> PyResult<()> { - let key_str = key.as_str(); - let value_str = value.as_str(); + let key_str = key.expect_str(); + let value_str = value.expect_str(); // Search from index 1 because on Windows starting '=' is allowed for // defining hidden environment variables. if key_str.is_empty() @@ -539,7 +541,7 @@ pub(super) mod _os { #[cfg(windows)] #[pyfunction] fn unsetenv(key: PyStrRef, vm: &VirtualMachine) -> PyResult<()> { - let key_str = key.as_str(); + let key_str = key.expect_str(); // Search from index 1 because on Windows starting '=' is allowed for // defining hidden environment variables. if key_str.is_empty() @@ -868,7 +870,7 @@ pub(super) mod _os { impl Representable for DirEntry { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { let name = match zelf.as_object().get_attr("name", vm) { Ok(name) => Some(name), Err(e) @@ -882,7 +884,10 @@ pub(super) mod _os { if let Some(name) = name { if let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) { let repr = name.repr(vm)?; - Ok(format!("<{} {}>", zelf.class(), repr)) + let mut result = Wtf8Buf::from(format!("<{} ", zelf.class())); + result.push_wtf8(repr.as_wtf8()); + result.push_char('>'); + Ok(result) } else { Err(vm.new_runtime_error(format!( "reentrant call inside {}.__repr__", @@ -890,7 +895,7 @@ pub(super) mod _os { ))) } } else { - Ok(format!("<{}>", zelf.class())) + Ok(Wtf8Buf::from(format!("<{}>", zelf.class()))) } } } diff --git a/crates/vm/src/stdlib/posix.rs b/crates/vm/src/stdlib/posix.rs index 35ab3d48f97..961f0c6fe0d 100644 --- a/crates/vm/src/stdlib/posix.rs +++ b/crates/vm/src/stdlib/posix.rs @@ -28,7 +28,7 @@ pub fn set_inheritable(fd: BorrowedFd<'_>, inheritable: bool) -> nix::Result<()> pub mod module { use crate::{ AsObject, Py, PyObjectRef, PyResult, VirtualMachine, - builtins::{PyDictRef, PyInt, PyListRef, PyStr, PyTupleRef}, + builtins::{PyDictRef, PyInt, PyListRef, PyTupleRef, PyUtf8Str}, convert::{IntoPyException, ToPyObject, TryFromObject}, exceptions::OSErrorBuilder, function::{ArgMapping, Either, KwArgs, OptionalArg}, @@ -44,18 +44,22 @@ pub mod module { target_os = "linux", target_os = "openbsd" ))] - use crate::{builtins::PyStrRef, utils::ToCString}; + use crate::{builtins::PyUtf8StrRef, utils::ToCString}; use alloc::ffi::CString; use bitflags::bitflags; use core::ffi::CStr; use nix::{ + errno::Errno, fcntl, + sys::signal, unistd::{self, Gid, Pid, Uid}, }; + use rustpython_common::os::ffi::OsStringExt; use std::{ env, fs, io, os::fd::{AsFd, BorrowedFd, FromRawFd, IntoRawFd, OwnedFd}, }; + use strum::IntoEnumIterator; use strum_macros::{EnumIter, EnumString}; #[cfg(any(target_os = "android", target_os = "linux"))] @@ -379,7 +383,7 @@ pub mod module { fn getgroups_impl() -> nix::Result> { use core::ptr; use libc::{c_int, gid_t}; - use nix::errno::Errno; + let ret = unsafe { libc::getgroups(0, ptr::null_mut()) }; let mut groups = Vec::::with_capacity(Errno::result(ret)? as usize); let ret = unsafe { @@ -449,8 +453,6 @@ pub mod module { #[pyattr] fn environ(vm: &VirtualMachine) -> PyDictRef { - use rustpython_common::os::ffi::OsStringExt; - let environ = vm.ctx.new_dict(); for (key, value) in env::vars_os() { let key: PyObjectRef = vm.ctx.new_bytes(key.into_vec()).into(); @@ -463,8 +465,6 @@ pub mod module { #[pyfunction] fn _create_environ(vm: &VirtualMachine) -> PyDictRef { - use rustpython_common::os::ffi::OsStringExt; - let environ = vm.ctx.new_dict(); for (key, value) in env::vars_os() { let key: PyObjectRef = vm.ctx.new_bytes(key.into_vec()).into(); @@ -909,7 +909,6 @@ pub mod module { #[cfg(not(target_os = "redox"))] #[pyfunction] fn nice(increment: i32, vm: &VirtualMachine) -> PyResult { - use nix::errno::Errno; Errno::clear(); let res = unsafe { libc::nice(increment) }; if res == -1 && Errno::last_raw() != 0 { @@ -1428,7 +1427,7 @@ pub mod module { target_os = "openbsd" ))] #[pyfunction] - fn initgroups(user_name: PyStrRef, gid: Gid, vm: &VirtualMachine) -> PyResult<()> { + fn initgroups(user_name: PyUtf8StrRef, gid: Gid, vm: &VirtualMachine) -> PyResult<()> { let user = user_name.to_cstring(vm)?; unistd::initgroups(&user, gid).map_err(|err| err.into_pyexception(vm)) } @@ -1584,7 +1583,6 @@ pub mod module { let mut flags = nix::spawn::PosixSpawnFlags::empty(); if let Some(sigs) = self.setsigdef { - use nix::sys::signal; let mut set = signal::SigSet::empty(); for sig in sigs.iter(vm)? { let sig = sig?; @@ -1630,7 +1628,6 @@ pub mod module { } if let Some(sigs) = self.setsigmask { - use nix::sys::signal; let mut set = signal::SigSet::empty(); for sig in sigs.iter(vm)? { let sig = sig?; @@ -1669,7 +1666,7 @@ pub mod module { envp_from_dict(env_dict, vm)? } else { // env=None means use the current environment - use rustpython_common::os::ffi::OsStringExt; + env::vars_os() .map(|(k, v)| { let mut entry = k.into_vec(); @@ -1920,8 +1917,12 @@ pub mod module { target_os = "openbsd" ))] #[pyfunction] - fn getgrouplist(user: PyStrRef, group: u32, vm: &VirtualMachine) -> PyResult> { - let user = CString::new(user.as_str()).unwrap(); + fn getgrouplist( + user: PyUtf8StrRef, + group: u32, + vm: &VirtualMachine, + ) -> PyResult> { + let user = user.to_cstring(vm)?; let gid = Gid::from_raw(group); let group_ids = unistd::getgrouplist(&user, gid).map_err(|err| err.into_pyexception(vm))?; Ok(group_ids @@ -1954,7 +1955,6 @@ pub mod module { who: PriorityWhoType, vm: &VirtualMachine, ) -> PyResult { - use nix::errno::Errno; Errno::clear(); let retval = unsafe { libc::getpriority(which, who) }; if Errno::last_raw() != 0 { @@ -1987,7 +1987,7 @@ pub mod module { let i = match obj.downcast::() { Ok(int) => int.try_to_primitive(vm)?, Err(obj) => { - let s = obj.downcast::().map_err(|_| { + let s = obj.downcast::().map_err(|_| { vm.new_type_error( "configuration names must be strings or integers".to_owned(), ) @@ -2176,8 +2176,6 @@ pub mod module { PathconfName(name): PathconfName, vm: &VirtualMachine, ) -> PyResult> { - use nix::errno::Errno; - Errno::clear(); debug_assert_eq!(Errno::last_raw(), 0); let raw = match &path { @@ -2214,7 +2212,6 @@ pub mod module { #[pyattr] fn pathconf_names(vm: &VirtualMachine) -> PyDictRef { - use strum::IntoEnumIterator; let pathname = vm.ctx.new_dict(); for variant in PathconfVar::iter() { // get the name of variant as a string to use as the dictionary key @@ -2385,18 +2382,21 @@ pub mod module { let i = match obj.downcast::() { Ok(int) => int.try_to_primitive(vm)?, Err(obj) => { - let s = obj.downcast::().map_err(|_| { + let s = obj.downcast::().map_err(|_| { vm.new_type_error( "configuration names must be strings or integers".to_owned(), ) })?; - s.as_str().parse::().or_else(|_| { - if s.as_str() == "SC_PAGESIZE" { - Ok(SysconfVar::SC_PAGESIZE) - } else { - Err(vm.new_value_error("unrecognized configuration name")) - } - })? as i32 + { + let name = s.as_str(); + name.parse::().or_else(|_| { + if name == "SC_PAGESIZE" { + Ok(SysconfVar::SC_PAGESIZE) + } else { + Err(vm.new_value_error("unrecognized configuration name")) + } + })? as i32 + } } }; Ok(Self(i)) @@ -2415,7 +2415,6 @@ pub mod module { #[pyattr] fn sysconf_names(vm: &VirtualMachine) -> PyDictRef { - use strum::IntoEnumIterator; let names = vm.ctx.new_dict(); for variant in SysconfVar::iter() { // get the name of variant as a string to use as the dictionary key diff --git a/crates/vm/src/stdlib/signal.rs b/crates/vm/src/stdlib/signal.rs index 9e51cd3b425..e6ad7b53348 100644 --- a/crates/vm/src/stdlib/signal.rs +++ b/crates/vm/src/stdlib/signal.rs @@ -323,7 +323,7 @@ pub(crate) mod _signal { if ret != 0 { let err = std::io::Error::last_os_error(); let itimer_error = itimer_error(vm); - return Err(vm.new_exception_msg(itimer_error, err.to_string())); + return Err(vm.new_exception_msg(itimer_error, err.to_string().into())); } let old = unsafe { old.assume_init() }; Ok(itimerval_to_tuple(&old)) @@ -340,7 +340,7 @@ pub(crate) mod _signal { if ret != 0 { let err = std::io::Error::last_os_error(); let itimer_error = itimer_error(vm); - return Err(vm.new_exception_msg(itimer_error, err.to_string())); + return Err(vm.new_exception_msg(itimer_error, err.to_string().into())); } let old = unsafe { old.assume_init() }; Ok(itimerval_to_tuple(&old)) diff --git a/crates/vm/src/stdlib/sre.rs b/crates/vm/src/stdlib/sre.rs index eb0cb05eb7d..2c18bab4ba1 100644 --- a/crates/vm/src/stdlib/sre.rs +++ b/crates/vm/src/stdlib/sre.rs @@ -9,7 +9,7 @@ mod _sre { PyCallableIterator, PyDictRef, PyGenericAlias, PyInt, PyList, PyListRef, PyStr, PyStrRef, PyTuple, PyTupleRef, PyTypeRef, }, - common::wtf8::{Wtf8, Wtf8Buf}, + common::wtf8::{Wtf8, Wtf8Buf, wtf8_concat}, common::{ascii, hash::PyHash}, convert::ToPyObject, function::{ArgCallable, OptionalArg, PosArgs, PyComparisonValue}, @@ -552,7 +552,7 @@ mod _sre { impl Representable for Pattern { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { let flag_names = [ ("re.IGNORECASE", SreFlag::IGNORECASE), ("re.LOCALE", SreFlag::LOCALE), @@ -580,19 +580,19 @@ mod _sre { .join("|"); let pattern = zelf.pattern.repr(vm)?; - let truncated: String; - let s = if pattern.char_len() > 200 { - truncated = pattern.as_str().chars().take(200).collect(); - &truncated + let mut result = Wtf8Buf::from("re.compile("); + let pat = if pattern.char_len() > 200 { + pattern.as_wtf8().code_points().take(200).collect() } else { - pattern.as_str() + pattern.as_wtf8().to_owned() }; - - if flags.is_empty() { - Ok(format!("re.compile({s})")) - } else { - Ok(format!("re.compile({s}, {flags})")) + result.push_wtf8(&pat); + if !flags.is_empty() { + result.push_str(", "); + result.push_str(&flags); } + result.push_char(')'); + Ok(result) } } @@ -851,13 +851,17 @@ mod _sre { impl Representable for Match { #[inline] - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { with_sre_str!(zelf.pattern, &zelf.string, vm, |str_drive| { - Ok(format!( - "", + let match_repr = zelf.get_slice(0, str_drive, vm).unwrap().repr(vm)?; + Ok(wtf8_concat!( + "() { - write!(stderr, "{}", qualname_str.as_str()); + write!(stderr, "{}", qualname_str.as_wtf8()); } else { write!(stderr, "{}", unraisable.exc_type.name()); } @@ -1270,7 +1270,7 @@ mod sys { if !vm.is_none(&unraisable.exc_value) { write!(stderr, ": "); if let Ok(str) = unraisable.exc_value.str(vm) { - write!(stderr, "{}", str.to_str().unwrap_or("")); + write!(stderr, "{}", str.as_wtf8()); } else { write!(stderr, ""); } diff --git a/crates/vm/src/stdlib/thread.rs b/crates/vm/src/stdlib/thread.rs index ff6f0fd8b8f..21b19fb7560 100644 --- a/crates/vm/src/stdlib/thread.rs +++ b/crates/vm/src/stdlib/thread.rs @@ -12,7 +12,8 @@ pub(crate) use _thread::{ pub(crate) mod _thread { use crate::{ AsObject, Py, PyPayload, PyRef, PyResult, VirtualMachine, - builtins::{PyDictRef, PyStr, PyStrRef, PyTupleRef, PyType, PyTypeRef}, + builtins::{PyDictRef, PyStr, PyTupleRef, PyType, PyTypeRef, PyUtf8StrRef}, + common::wtf8::Wtf8Buf, frame::FrameRef, function::{ArgCallable, Either, FuncArgs, KwArgs, OptionalArg, PySetterValue}, types::{Constructor, GetAttr, Representable, SetAttr}, @@ -318,7 +319,7 @@ pub(crate) mod _thread { /// Set the name of the current thread #[pyfunction] - fn set_name(name: PyStrRef) { + fn set_name(name: PyUtf8StrRef) { #[cfg(target_os = "linux")] { use alloc::ffi::CString; @@ -710,11 +711,11 @@ pub(crate) mod _thread { .get_attr("name", vm) .ok() .and_then(|n| n.str(vm).ok()) - .map(|s| s.as_str().to_owned()) + .map(|s| s.as_wtf8().to_owned()) } else { None }; - let name = thread_name.unwrap_or_else(|| format!("{}", get_ident())); + let name = thread_name.unwrap_or_else(|| Wtf8Buf::from(format!("{}", get_ident()))); let _ = vm.call_method( &file, @@ -836,7 +837,7 @@ pub(crate) mod _thread { impl GetAttr for Local { fn getattro(zelf: &Py, attr: &Py, vm: &VirtualMachine) -> PyResult { let l_dict = zelf.l_dict(vm); - if attr.as_str() == "__dict__" { + if attr.as_bytes() == b"__dict__" { Ok(l_dict.into()) } else { zelf.as_object() @@ -859,7 +860,7 @@ pub(crate) mod _thread { value: PySetterValue, vm: &VirtualMachine, ) -> PyResult<()> { - if attr.as_str() == "__dict__" { + if attr.as_bytes() == b"__dict__" { Err(vm.new_attribute_error(format!( "{} attribute '__dict__' is read-only", zelf.class().name() diff --git a/crates/vm/src/stdlib/time.rs b/crates/vm/src/stdlib/time.rs index 05790fe332b..ee19fce7dac 100644 --- a/crates/vm/src/stdlib/time.rs +++ b/crates/vm/src/stdlib/time.rs @@ -101,7 +101,7 @@ mod decl { let dur = seconds.try_into_value::(vm).map_err(|e| { if e.class().is(vm.ctx.exceptions.value_error) && let Some(s) = e.args().first().and_then(|arg| arg.str(vm).ok()) - && s.as_str() == "negative duration" + && s.as_bytes() == b"negative duration" { return vm.new_value_error("sleep length must be non-negative"); } @@ -434,10 +434,11 @@ mod decl { #[cfg(unix)] { + use crate::builtins::PyUtf8StrRef; let zone = if t.tm_zone.is(&vm.ctx.none) { None } else { - let zone: PyStrRef = t + let zone: PyUtf8StrRef = t .tm_zone .clone() .try_into_value(vm) @@ -1047,7 +1048,7 @@ mod platform { #[cfg_attr(target_os = "macos", allow(unused_imports))] use crate::{ PyObject, PyRef, PyResult, TryFromBorrowedObject, VirtualMachine, - builtins::{PyNamespace, PyStrRef}, + builtins::{PyNamespace, PyUtf8StrRef}, convert::IntoPyException, }; use core::time::Duration; @@ -1216,8 +1217,8 @@ mod platform { target_os = "linux", ))] #[pyfunction] - fn get_clock_info(name: PyStrRef, vm: &VirtualMachine) -> PyResult> { - let (adj, imp, mono, res) = match name.as_ref() { + fn get_clock_info(name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult> { + let (adj, imp, mono, res) = match name.as_str() { "monotonic" | "perf_counter" => ( false, "time.clock_gettime(CLOCK_MONOTONIC)", @@ -1263,7 +1264,7 @@ mod platform { target_os = "linux", )))] #[pyfunction] - fn get_clock_info(_name: PyStrRef, vm: &VirtualMachine) -> PyResult> { + fn get_clock_info(_name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult> { Err(vm.new_not_implemented_error("get_clock_info unsupported on this system")) } @@ -1330,7 +1331,7 @@ mod platform { use super::decl::{MS_TO_NS, SEC_TO_NS, StructTimeData, get_tz_info, time_muldiv}; use crate::{ PyRef, PyResult, VirtualMachine, - builtins::{PyNamespace, PyStrRef}, + builtins::{PyNamespace, PyUtf8StrRef}, }; use core::time::Duration; use windows_sys::Win32::{ @@ -1500,8 +1501,8 @@ mod platform { } #[pyfunction] - fn get_clock_info(name: PyStrRef, vm: &VirtualMachine) -> PyResult> { - let (adj, imp, mono, res) = match name.as_ref() { + fn get_clock_info(name: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult> { + let (adj, imp, mono, res) = match name.as_str() { "monotonic" => ( false, "GetTickCount64()", diff --git a/crates/vm/src/stdlib/typevar.rs b/crates/vm/src/stdlib/typevar.rs index 1b3b5fecdd4..d0bd3f5666d 100644 --- a/crates/vm/src/stdlib/typevar.rs +++ b/crates/vm/src/stdlib/typevar.rs @@ -30,7 +30,7 @@ pub(crate) mod typevar { contravariant: bool, ) -> String { if infer_variance { - return name.to_string(); + return name.to_owned(); } let prefix = if covariant { '+' @@ -62,23 +62,19 @@ pub(crate) mod typevar { fn set_module_from_caller(obj: &PyObject, vm: &VirtualMachine) -> PyResult<()> { // Note: CPython gets module from frame->f_funcobj, but RustPython's Frame // architecture is different - we use globals['__name__'] instead - if let Some(module_name) = caller(vm) { + let module_value: PyObjectRef = if let Some(module_name) = caller(vm) { // Special handling for certain module names - if let Ok(name_str) = module_name.str(vm) { - let name = name_str.as_str(); - // CPython sets __module__ to None for builtins and <...> modules - // Also set to None for exec contexts (no __name__ in globals means exec) - if name == "builtins" || name.starts_with('<') { - // Don't set __module__ attribute at all (CPython behavior) - // This allows the typing module to handle it - return Ok(()); - } + if let Ok(name_str) = module_name.str(vm) + && let Some(name) = name_str.to_str() + && (name == "builtins" || name.starts_with('<')) + { + return Ok(()); } - obj.set_attr("__module__", module_name, vm)?; + module_name } else { - // If no module name is found (e.g., in exec context), set __module__ to None - obj.set_attr("__module__", vm.ctx.none(), vm)?; - } + vm.ctx.none() + }; + obj.set_attr("__module__", module_value, vm)?; Ok(()) } @@ -288,7 +284,7 @@ pub(crate) mod typevar { impl Representable for TypeVar { #[inline(always)] fn repr_str(zelf: &crate::Py, vm: &VirtualMachine) -> PyResult { - let name = zelf.name.str(vm)?; + let name = zelf.name.str_utf8(vm)?; Ok(variance_repr( name.as_str(), zelf.infer_variance, @@ -684,7 +680,7 @@ pub(crate) mod typevar { impl Representable for ParamSpec { #[inline(always)] fn repr_str(zelf: &crate::Py, vm: &VirtualMachine) -> PyResult { - let name = zelf.__name__().str(vm)?; + let name = zelf.__name__().str_utf8(vm)?; Ok(variance_repr( name.as_str(), zelf.infer_variance, diff --git a/crates/vm/src/stdlib/typing.rs b/crates/vm/src/stdlib/typing.rs index bafc02e5764..2a1ed8e91a2 100644 --- a/crates/vm/src/stdlib/typing.rs +++ b/crates/vm/src/stdlib/typing.rs @@ -32,6 +32,7 @@ pub(crate) mod decl { use crate::{ AsObject, Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, atomic_func, builtins::{PyGenericAlias, PyStrRef, PyTuple, PyTupleRef, PyType, PyTypeRef, type_}, + common::wtf8::Wtf8Buf, function::FuncArgs, protocol::{PyMappingMethods, PyNumberMethods}, types::{AsMapping, AsNumber, Callable, Constructor, Iterable, Representable}, @@ -145,10 +146,14 @@ pub(crate) mod decl { && !vm.is_none(&module) && let Some(module_str) = module.downcast_ref::() { - if module_str.as_str() == "builtins" { - return Ok(qualname.str(vm)?.to_string()); + if module_str.as_bytes() == b"builtins" { + return Ok(qualname.str_utf8(vm)?.as_str().to_owned()); } - return Ok(format!("{}.{}", module_str.as_str(), qualname.str(vm)?)); + return Ok(format!( + "{}.{}", + module_str.as_wtf8(), + qualname.str_utf8(vm)?.as_str() + )); } // Fallback to repr Ok(obj.repr(vm)?.to_string()) @@ -426,8 +431,8 @@ pub(crate) mod decl { } impl Representable for TypeAliasType { - fn repr_str(zelf: &Py, _vm: &VirtualMachine) -> PyResult { - Ok(zelf.name.as_str().to_owned()) + fn repr_wtf8(zelf: &Py, _vm: &VirtualMachine) -> PyResult { + Ok(zelf.name.as_wtf8().to_owned()) } } diff --git a/crates/vm/src/stdlib/winreg.rs b/crates/vm/src/stdlib/winreg.rs index 1b165f911d0..026d8d38c63 100644 --- a/crates/vm/src/stdlib/winreg.rs +++ b/crates/vm/src/stdlib/winreg.rs @@ -994,7 +994,7 @@ mod winreg { let s = value .downcast::() .map_err(|_| vm.new_type_error("value must be a string".to_string()))?; - let wide = s.as_str().to_wide_with_nul(); + let wide = s.as_wtf8().to_wide_with_nul(); // Convert Vec to Vec let bytes: Vec = wide.iter().flat_map(|&c| c.to_le_bytes()).collect(); Ok(Some(bytes)) @@ -1013,7 +1013,7 @@ mod winreg { let s = item.downcast_ref::().ok_or_else(|| { vm.new_type_error("list items must be strings".to_string()) })?; - let wide = s.as_str().to_wide_with_nul(); + let wide = s.as_wtf8().to_wide_with_nul(); bytes.extend(wide.iter().flat_map(|&c| c.to_le_bytes())); } // Add final null terminator (double null at end) diff --git a/crates/vm/src/stdlib/winsound.rs b/crates/vm/src/stdlib/winsound.rs index 3f65abbb890..729305f879d 100644 --- a/crates/vm/src/stdlib/winsound.rs +++ b/crates/vm/src/stdlib/winsound.rs @@ -118,7 +118,7 @@ mod winsound { // os.fspath(sound) let path = match sound.downcast_ref::() { - Some(s) => s.as_str().to_owned(), + Some(s) => s.as_wtf8().to_owned(), None => { let fspath = vm.get_method_or_type_error( sound.clone(), @@ -151,12 +151,12 @@ mod winsound { )) })?; - s.as_str().to_owned() + s.as_wtf8().to_owned() } }; // Check for embedded null characters - if path.contains('\0') { + if path.as_bytes().contains(&0) { return Err(vm.new_value_error("embedded null character".to_owned())); } diff --git a/crates/vm/src/suggestion.rs b/crates/vm/src/suggestion.rs index 2d732160f07..b48b78af755 100644 --- a/crates/vm/src/suggestion.rs +++ b/crates/vm/src/suggestion.rs @@ -89,7 +89,7 @@ pub fn offer_suggestions(exc: &Py, vm: &VirtualMachine) -> Opti // Look up the module in sys.modules let sys_modules = vm.sys_module.get_attr("modules", vm).ok()?; - let module = sys_modules.get_item(mod_name_str.as_str(), vm).ok()?; + let module = sys_modules.get_item(mod_name_str, vm).ok()?; calculate_suggestions(vm.dir(Some(module)).ok()?.borrow_vec().iter(), &wrong_name) } else { diff --git a/crates/vm/src/types/slot.rs b/crates/vm/src/types/slot.rs index ae00158aeb4..84eaa817ac4 100644 --- a/crates/vm/src/types/slot.rs +++ b/crates/vm/src/types/slot.rs @@ -20,6 +20,7 @@ use crate::{ use core::{any::Any, any::TypeId, borrow::Borrow, cmp::Ordering, ops::Deref}; use crossbeam_utils::atomic::AtomicCell; use num_traits::{Signed, ToPrimitive}; +use rustpython_common::wtf8::Wtf8Buf; /// Type-erased storage for extension module data attached to heap types. pub struct TypeDataSlot { @@ -1548,8 +1549,11 @@ pub trait Initializer: PyPayload { #[cfg(debug_assertions)] { if let Ok(msg) = err.as_object().repr(vm) { - let double_appearance = - msg.as_str().matches(&class_name_for_debug as &str).count() == 2; + let double_appearance = msg + .to_string_lossy() + .matches(&class_name_for_debug as &str) + .count() + == 2; if double_appearance { panic!( "This type `{}` doesn't seem to support `init`. Override `slot_init` instead: {}", @@ -1596,12 +1600,14 @@ pub trait Callable: PyPayload { fn slot_call(zelf: &PyObject, args: FuncArgs, vm: &VirtualMachine) -> PyResult { let zelf = zelf.downcast_ref().ok_or_else(|| { let repr = zelf.repr(vm); - let help = if let Ok(repr) = repr.as_ref() { - repr.as_str().to_owned() + let help: Wtf8Buf = if let Ok(repr) = repr.as_ref() { + repr.as_wtf8().to_owned() } else { - zelf.class().name().to_owned() + zelf.class().name().to_owned().into() }; - vm.new_type_error(format!("unexpected payload for __call__ of {help}")) + let mut msg = Wtf8Buf::from("unexpected payload for __call__ of "); + msg.push_wtf8(&help); + vm.new_type_error(msg) })?; let args = args.bind(vm)?; Self::call(zelf, args, vm) @@ -1706,11 +1712,16 @@ pub trait Representable: PyPayload { #[inline] fn repr(zelf: &Py, vm: &VirtualMachine) -> PyResult> { - let repr = Self::repr_str(zelf, vm)?; + let repr = Self::repr_wtf8(zelf, vm)?; Ok(vm.ctx.new_str(repr)) } - fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult; + fn repr_wtf8(zelf: &Py, vm: &VirtualMachine) -> PyResult { + Self::repr_str(zelf, vm).map(|utf8| utf8.into()) + } + fn repr_str(_zelf: &Py, _vm: &VirtualMachine) -> PyResult { + unreachable!("Representable requires overriding either repr_str or repr_wtf8") + } } #[pyclass] @@ -2063,7 +2074,7 @@ where #[cold] fn slot_iter(zelf: PyObjectRef, vm: &VirtualMachine) -> PyResult { let repr = zelf.repr(vm)?; - unreachable!("slot must be overridden for {}", repr.as_str()); + unreachable!("slot must be overridden for {}", repr.as_wtf8()); } #[cold] diff --git a/crates/vm/src/types/structseq.rs b/crates/vm/src/types/structseq.rs index 3b89a62b9ad..0ac73c0fc19 100644 --- a/crates/vm/src/types/structseq.rs +++ b/crates/vm/src/types/structseq.rs @@ -234,7 +234,7 @@ pub trait PyStructSequence: StaticType + PyClassImpl + Sized + 'static { match typ.get_attr(identifier!(vm.ctx, __module__)) { Some(module) if module.downcastable::() => { let module_str = module.downcast_ref::().unwrap(); - alloc::borrow::Cow::Owned(format!("{}.{}", module_str.as_str(), Self::NAME)) + alloc::borrow::Cow::Owned(format!("{}.{}", module_str.as_wtf8(), Self::NAME)) } _ => alloc::borrow::Cow::Borrowed(Self::TP_NAME), } diff --git a/crates/vm/src/utils.rs b/crates/vm/src/utils.rs index db232e81949..b5117ddd8d1 100644 --- a/crates/vm/src/utils.rs +++ b/crates/vm/src/utils.rs @@ -1,4 +1,4 @@ -use rustpython_common::wtf8::Wtf8; +use rustpython_common::wtf8::{Wtf8, Wtf8Buf}; use crate::{ PyObjectRef, PyResult, VirtualMachine, @@ -43,33 +43,31 @@ pub(crate) fn collection_repr<'a, I>( suffix: &str, iter: I, vm: &VirtualMachine, -) -> PyResult +) -> PyResult where I: core::iter::Iterator, { - let mut repr = String::new(); + let mut repr = Wtf8Buf::new(); if let Some(name) = class_name { repr.push_str(name); - repr.push('('); + repr.push_char('('); } repr.push_str(prefix); { let mut parts_iter = iter.map(|o| o.repr(vm)); - repr.push_str( - parts_iter - .next() - .transpose()? - .expect("this is not called for empty collection") - .as_str(), - ); + let first = parts_iter + .next() + .transpose()? + .expect("this is not called for empty collection"); + repr.push_wtf8(first.as_wtf8()); for part in parts_iter { repr.push_str(", "); - repr.push_str(part?.as_str()); + repr.push_wtf8(part?.as_wtf8()); } } repr.push_str(suffix); if class_name.is_some() { - repr.push(')'); + repr.push_char(')'); } Ok(repr) diff --git a/crates/vm/src/vm/context.rs b/crates/vm/src/vm/context.rs index db5406f045f..41d47ccfe0a 100644 --- a/crates/vm/src/vm/context.rs +++ b/crates/vm/src/vm/context.rs @@ -3,7 +3,7 @@ use crate::{ builtins::{ PyByteArray, PyBytes, PyComplex, PyDict, PyDictRef, PyEllipsis, PyFloat, PyFrozenSet, PyInt, PyIntRef, PyList, PyListRef, PyNone, PyNotImplemented, PyStr, PyStrInterned, - PyTuple, PyTupleRef, PyType, PyTypeRef, + PyTuple, PyTupleRef, PyType, PyTypeRef, PyUtf8Str, bool_::PyBool, code::{self, PyCode}, descriptor::{ @@ -447,6 +447,11 @@ impl Context { s.into().into_ref(self) } + #[inline] + pub fn new_utf8_str(&self, s: impl Into) -> PyRef { + s.into().into_ref(self) + } + pub fn interned_or_new_str(&self, s: S) -> PyRef where S: Into + AsRef, diff --git a/crates/vm/src/vm/interpreter.rs b/crates/vm/src/vm/interpreter.rs index f6ce3448c03..f6d3ee4bc0a 100644 --- a/crates/vm/src/vm/interpreter.rs +++ b/crates/vm/src/vm/interpreter.rs @@ -562,7 +562,7 @@ mod tests { let b = vm.new_pyobj(4_i32); let res = vm._mul(&a, &b).unwrap(); let value = res.downcast_ref::().unwrap(); - assert_eq!(value.as_str(), "Hello Hello Hello Hello ") + assert_eq!(value.as_wtf8(), "Hello Hello Hello Hello ") }) } } diff --git a/crates/vm/src/vm/mod.rs b/crates/vm/src/vm/mod.rs index 3285885eea2..9802832ceab 100644 --- a/crates/vm/src/vm/mod.rs +++ b/crates/vm/src/vm/mod.rs @@ -20,7 +20,7 @@ use crate::{ AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, builtins::{ self, PyBaseExceptionRef, PyDict, PyDictRef, PyInt, PyList, PyModule, PyStr, PyStrInterned, - PyStrRef, PyTypeRef, PyWeak, + PyStrRef, PyTypeRef, PyUtf8Str, PyUtf8StrInterned, PyWeak, code::PyCode, dict::{PyDictItems, PyDictKeys, PyDictValues}, pystr::AsPyStr, @@ -34,7 +34,7 @@ use crate::{ frozen::FrozenModule, function::{ArgMapping, FuncArgs, PySetterValue}, import, - protocol::PyIterIter, + protocol::{PyIterIter, TraceEvent}, scope::Scope, signal, stdlib, warn::WarningsState, @@ -635,31 +635,28 @@ impl VirtualMachine { } }; - // Format: "Exception ignored {msg} {object_repr}\n" - if let Some(msg) = msg { - write_to_stderr(&format!("Exception ignored {}", msg), &stderr, self); + let msg_str = if let Some(msg) = msg { + format!("{msg}: ") } else { - write_to_stderr("Exception ignored in: ", &stderr, self); - } + "Exception ignored in: ".to_owned() + }; + write_to_stderr(&msg_str, &stderr, self); - if let Ok(repr) = object.repr(self) { - write_to_stderr(&format!("{}\n", repr.as_str()), &stderr, self); - } else { - write_to_stderr("\n", &stderr, self); - } + let repr_result = object.repr(self); + let repr_wtf8 = repr_result + .as_ref() + .map_or("".as_ref(), |s| s.as_wtf8()); + write_to_stderr(&format!("{repr_wtf8}\n"), &stderr, self); // Write exception type and message let exc_type_name = e.class().name(); - if let Ok(exc_str) = e.as_object().str(self) { - let exc_str = exc_str.as_str(); - if exc_str.is_empty() { - write_to_stderr(&format!("{}\n", exc_type_name), &stderr, self); - } else { - write_to_stderr(&format!("{}: {}\n", exc_type_name, exc_str), &stderr, self); + let msg = match e.as_object().str(self) { + Ok(exc_str) if !exc_str.as_wtf8().is_empty() => { + format!("{}: {}\n", exc_type_name, exc_str.as_wtf8()) } - } else { - write_to_stderr(&format!("{}\n", exc_type_name), &stderr, self); - } + _ => format!("{}\n", exc_type_name), + }; + write_to_stderr(&msg, &stderr, self); // Flush stderr to ensure output is visible if let Some(ref stderr) = stderr { @@ -810,7 +807,7 @@ impl VirtualMachine { for (key, value) in items { let name = key - .downcast_ref::() + .downcast_ref::() .map(|s| s.as_str().to_owned()) .unwrap_or_default(); @@ -873,9 +870,9 @@ impl VirtualMachine { continue; } if let Some(key_str) = key.downcast_ref::() { - let name = key_str.as_str(); - if name.starts_with('_') && name != "__builtins__" { - let _ = dict.set_item(name, none.clone(), vm); + let name = key_str.as_wtf8(); + if name.starts_with("_") && name != "__builtins__" { + let _ = dict.set_item(key_str, none.clone(), vm); } } } @@ -886,9 +883,9 @@ impl VirtualMachine { continue; } if let Some(key_str) = key.downcast_ref::() - && key_str.as_str() != "__builtins__" + && key_str.as_bytes() != b"__builtins__" { - let _ = dict.set_item(key_str.as_str(), none.clone(), vm); + let _ = dict.set_item(key_str.as_wtf8(), none.clone(), vm); } } } @@ -1078,7 +1075,6 @@ impl VirtualMachine { crate::vm::thread::pop_thread_frame(); } - use crate::protocol::TraceEvent; // Fire 'call' trace event after pushing frame // (current_frame() now returns the callee's frame) match self.trace_event(TraceEvent::Call, None) { @@ -1624,11 +1620,11 @@ impl VirtualMachine { identifier!(self, utf_8) } - pub fn fs_encode_errors(&self) -> &'static PyStrInterned { + pub fn fs_encode_errors(&self) -> &'static PyUtf8StrInterned { if cfg!(windows) { - identifier!(self, surrogatepass) + identifier_utf8!(self, surrogatepass) } else { - identifier!(self, surrogateescape) + identifier_utf8!(self, surrogateescape) } } @@ -1730,7 +1726,7 @@ fn frozen_origname_matches() { .unwrap() .try_into_value(vm) .unwrap(); - assert_eq!(origname.as_str(), expected); + assert_eq!(origname.as_wtf8(), expected); }; check("_frozen_importlib", "importlib._bootstrap"); diff --git a/crates/vm/src/vm/vm_new.rs b/crates/vm/src/vm/vm_new.rs index 9e33f430945..2c9320b4d87 100644 --- a/crates/vm/src/vm/vm_new.rs +++ b/crates/vm/src/vm/vm_new.rs @@ -13,6 +13,7 @@ use crate::{ scope::Scope, vm::VirtualMachine, }; +use rustpython_common::wtf8::Wtf8Buf; use rustpython_compiler_core::SourceLocation; macro_rules! define_exception_fn { @@ -24,7 +25,7 @@ macro_rules! define_exception_fn { stringify!($python_repr), " object.\nUseful for raising errors from python functions implemented in rust." )] - pub fn $fn_name(&self, msg: impl Into) -> PyBaseExceptionRef + pub fn $fn_name(&self, msg: impl Into) -> PyBaseExceptionRef { let err = self.ctx.exceptions.$attr.to_owned(); self.new_exception_msg(err, msg.into()) @@ -150,7 +151,7 @@ impl VirtualMachine { /// type is passed in, it may not be fully initialized; try using /// [`vm.invoke_exception()`][Self::invoke_exception] or /// [`exceptions::ExceptionCtor`][crate::exceptions::ExceptionCtor] instead. - pub fn new_exception_msg(&self, exc_type: PyTypeRef, msg: String) -> PyBaseExceptionRef { + pub fn new_exception_msg(&self, exc_type: PyTypeRef, msg: Wtf8Buf) -> PyBaseExceptionRef { self.new_exception(exc_type, vec![self.ctx.new_str(msg).into()]) } @@ -162,7 +163,7 @@ impl VirtualMachine { pub fn new_exception_msg_dict( &self, exc_type: PyTypeRef, - msg: String, + msg: Wtf8Buf, dict: PyDictRef, ) -> PyBaseExceptionRef { PyRef::new_ref( @@ -189,7 +190,7 @@ impl VirtualMachine { attribute_error } - pub fn new_name_error(&self, msg: impl Into, name: PyStrRef) -> PyBaseExceptionRef { + pub fn new_name_error(&self, msg: impl Into, name: PyStrRef) -> PyBaseExceptionRef { let name_error_type = self.ctx.exceptions.name_error.to_owned(); let name_error = self.new_exception_msg(name_error_type, msg.into()); name_error.as_object().set_attr("name", name, self).unwrap(); @@ -619,7 +620,7 @@ impl VirtualMachine { if syntax_error_type.is(self.ctx.exceptions.tab_error) { msg = "inconsistent use of tabs and spaces in indentation".to_owned(); } - let syntax_error = self.new_exception_msg(syntax_error_type, msg); + let syntax_error = self.new_exception_msg(syntax_error_type, msg.into()); let (lineno, offset) = error.python_location(); let lineno = self.ctx.new_int(lineno); let offset = self.ctx.new_int(offset); @@ -689,10 +690,14 @@ impl VirtualMachine { self.new_syntax_error_maybe_incomplete(error, source, false) } - pub fn new_import_error(&self, msg: impl Into, name: PyStrRef) -> PyBaseExceptionRef { + pub fn new_import_error( + &self, + msg: impl Into, + name: impl Into, + ) -> PyBaseExceptionRef { let import_error = self.ctx.exceptions.import_error.to_owned(); let exc = self.new_exception_msg(import_error, msg.into()); - exc.as_object().set_attr("name", name, self).unwrap(); + exc.as_object().set_attr("name", name.into(), self).unwrap(); exc } @@ -733,7 +738,7 @@ impl VirtualMachine { } else { msg }; - self.new_exception_msg(error_type.to_owned(), msg) + self.new_exception_msg(error_type.to_owned(), msg.into()) } pub(crate) fn new_downcast_runtime_error( diff --git a/crates/vm/src/warn.rs b/crates/vm/src/warn.rs index 684630e6af0..7cefed3b1ae 100644 --- a/crates/vm/src/warn.rs +++ b/crates/vm/src/warn.rs @@ -392,23 +392,24 @@ pub(crate) fn warn_explicit( let action_str = PyStrRef::try_from_object(vm, action) .map_err(|_| vm.new_type_error("action must be a string".to_owned()))?; - if action_str.as_str() == "error" { + if action_str.as_bytes() == b"error" { let exc = PyBaseExceptionRef::try_from_object(vm, message)?; return Err(exc); } - if action_str.as_str() == "ignore" { + if action_str.as_bytes() == b"ignore" { return Ok(()); } // For everything except "always"/"all", record in registry then // check per-action registries. - let already = if action_str.as_str() != "always" && action_str.as_str() != "all" { + let already = if action_str.as_wtf8() != "always" && action_str.as_wtf8() != "all" { if !vm.is_none(®istry) { registry.set_item(&*key, vm.ctx.true_value.clone().into(), vm)?; } - match action_str.as_str() { - "once" => { + let action_s = action_str.to_str(); + match action_s { + Some("once") => { let reg = if vm.is_none(®istry) { get_once_registry(vm)? } else { @@ -416,17 +417,17 @@ pub(crate) fn warn_explicit( }; update_registry(®, text.as_ref(), category.as_object(), false, vm)? } - "module" => { + Some("module") => { if !vm.is_none(®istry) { update_registry(®istry, text.as_ref(), category.as_object(), false, vm)? } else { false } } - "default" => false, - other => { + Some("default") => false, + _ => { return Err(vm.new_runtime_error(format!( - "Unrecognized action ({other}) in warnings.filters:\n {other}" + "Unrecognized action ({action_str}) in warnings.filters:\n {action_str}" ))); } } @@ -517,11 +518,11 @@ fn show_warning( /// Check if a frame's filename starts with any of the given prefixes. fn is_filename_to_skip(frame: &crate::frame::Frame, prefixes: &PyTupleRef) -> bool { let filename = frame.f_code().co_filename(); - let filename_s = filename.as_str(); + let filename_bytes = filename.as_bytes(); prefixes.iter().any(|prefix| { prefix .downcast_ref::() - .is_some_and(|s| filename_s.starts_with(s.as_str())) + .is_some_and(|s| filename_bytes.starts_with(s.as_bytes())) }) } diff --git a/crates/wasm/src/browser_module.rs b/crates/wasm/src/browser_module.rs index d1eecce28a9..3ec319e10d0 100644 --- a/crates/wasm/src/browser_module.rs +++ b/crates/wasm/src/browser_module.rs @@ -64,22 +64,19 @@ mod _browser { } = args; let response_format = match response_format { - Some(s) => FetchResponseFormat::from_str(vm, s.as_str())?, + Some(s) => FetchResponseFormat::from_str(vm, s.expect_str())?, None => FetchResponseFormat::Text, }; let opts = web_sys::RequestInit::new(); - match method { - Some(s) => opts.set_method(s.as_str()), - None => opts.set_method("GET"), - }; + opts.set_method(method.as_ref().map_or("GET", |s| s.expect_str())); if let Some(body) = body { opts.set_body(&convert::py_to_js(vm, body)); } - let request = web_sys::Request::new_with_str_and_init(url.as_str(), &opts) + let request = web_sys::Request::new_with_str_and_init(url.expect_str(), &opts) .map_err(|err| convert::js_py_typeerror(vm, err))?; if let Some(headers) = headers { @@ -87,7 +84,7 @@ mod _browser { for (key, value) in headers { let key = key.str(vm)?; let value = value.str(vm)?; - h.set(key.as_str(), value.as_str()) + h.set(key.expect_str(), value.expect_str()) .map_err(|err| convert::js_py_typeerror(vm, err))?; } } @@ -95,7 +92,7 @@ mod _browser { if let Some(content_type) = content_type { request .headers() - .set("Content-Type", content_type.as_str()) + .set("Content-Type", content_type.expect_str()) .map_err(|err| convert::js_py_typeerror(vm, err))?; } @@ -171,7 +168,7 @@ mod _browser { fn query(&self, query: PyStrRef, vm: &VirtualMachine) -> PyResult { let elem = self .doc - .query_selector(query.as_str()) + .query_selector(query.expect_str()) .map_err(|err| convert::js_py_typeerror(vm, err))? .map(|elem| Element { elem }) .to_pyobject(vm); @@ -206,7 +203,7 @@ mod _browser { default: OptionalArg, vm: &VirtualMachine, ) -> PyObjectRef { - match self.elem.get_attribute(attr.as_str()) { + match self.elem.get_attribute(attr.expect_str()) { Some(s) => vm.ctx.new_str(s).into(), None => default.unwrap_or_none(vm), } @@ -215,7 +212,7 @@ mod _browser { #[pymethod] fn set_attr(&self, attr: PyStrRef, value: PyStrRef, vm: &VirtualMachine) -> PyResult<()> { self.elem - .set_attribute(attr.as_str(), value.as_str()) + .set_attribute(attr.expect_str(), value.expect_str()) .map_err(|err| convert::js_py_typeerror(vm, err)) } } @@ -227,7 +224,7 @@ mod _browser { let opts = web_sys::RequestInit::new(); opts.set_method("GET"); - let request = web_sys::Request::new_with_str_and_init(path.as_str(), &opts) + let request = web_sys::Request::new_with_str_and_init(path.expect_str(), &opts) .map_err(|err| convert::js_py_typeerror(vm, err))?; let window = window(); @@ -244,7 +241,7 @@ mod _browser { .expect("that the vm is valid when the promise resolves"); stored_vm.interp.enter(move |vm| { let resp_text = text.as_string().unwrap(); - let res = import_source(vm, module.as_str(), &resp_text); + let res = import_source(vm, module.expect_str(), &resp_text); match res { Ok(_) => Ok(JsValue::null()), Err(err) => Err(convert::py_err_to_js_err(vm, &err)), diff --git a/crates/wasm/src/convert.rs b/crates/wasm/src/convert.rs index f84b0d46239..a0186ce2834 100644 --- a/crates/wasm/src/convert.rs +++ b/crates/wasm/src/convert.rs @@ -56,7 +56,7 @@ pub fn py_err_to_js_err(vm: &VirtualMachine, py_err: &Py) -> Js } pub fn js_py_typeerror(vm: &VirtualMachine, js_err: JsValue) -> PyBaseExceptionRef { - let msg = js_err.unchecked_into::().to_string(); + let msg: String = js_err.unchecked_into::().to_string().into(); vm.new_type_error(msg) } @@ -70,11 +70,11 @@ pub fn js_err_to_py_err(vm: &VirtualMachine, js_err: &JsValue) -> PyBaseExceptio _ => vm.ctx.exceptions.exception_type, } .to_owned(); - vm.new_exception_msg(exc_type, err.message().into()) + vm.new_exception_msg(exc_type, String::from(err.message()).into()) } None => vm.new_exception_msg( vm.ctx.exceptions.exception_type.to_owned(), - format!("{js_err:?}"), + format!("{js_err:?}").into(), ), } } diff --git a/crates/wasm/src/js_module.rs b/crates/wasm/src/js_module.rs index 750e85994a1..314e2adbee0 100644 --- a/crates/wasm/src/js_module.rs +++ b/crates/wasm/src/js_module.rs @@ -83,7 +83,7 @@ mod _js { impl JsProperty { fn into_js_value(self) -> JsValue { match self { - JsProperty::Str(s) => s.as_str().into(), + JsProperty::Str(s) => s.expect_str().into(), JsProperty::Js(value) => value.value.clone(), } } @@ -109,7 +109,7 @@ mod _js { #[pymethod] fn new_from_str(&self, s: PyStrRef) -> PyJsValue { - PyJsValue::new(s.as_str()) + PyJsValue::new(s.expect_str()) } #[pymethod] diff --git a/crates/wasm/src/wasm_builtins.rs b/crates/wasm/src/wasm_builtins.rs index 79423fc250e..efbc03c39ce 100644 --- a/crates/wasm/src/wasm_builtins.rs +++ b/crates/wasm/src/wasm_builtins.rs @@ -33,7 +33,7 @@ pub fn make_stdout_object( "write", cls, move |_self: PyObjectRef, data: PyStrRef, vm: &VirtualMachine| -> PyResult<()> { - write_f(data.as_str(), vm) + write_f(data.expect_str(), vm) }, ); let flush_method = vm.new_method("flush", cls, |_self: PyObjectRef| {}); diff --git a/crates/wtf8/src/concat.rs b/crates/wtf8/src/concat.rs new file mode 100644 index 00000000000..055a401d69f --- /dev/null +++ b/crates/wtf8/src/concat.rs @@ -0,0 +1,143 @@ +use alloc::borrow::{Cow, ToOwned}; +use alloc::boxed::Box; +use alloc::string::String; +use core::fmt; +use fmt::Write; + +use crate::{CodePoint, Wtf8, Wtf8Buf}; + +impl fmt::Write for Wtf8Buf { + #[inline] + fn write_str(&mut self, s: &str) -> fmt::Result { + self.push_str(s); + Ok(()) + } +} + +/// Trait for types that can be appended to a [`Wtf8Buf`], preserving surrogates. +pub trait Wtf8Concat { + fn fmt_wtf8(&self, buf: &mut Wtf8Buf); +} + +impl Wtf8Concat for Wtf8 { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + buf.push_wtf8(self); + } +} + +impl Wtf8Concat for Wtf8Buf { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + buf.push_wtf8(self); + } +} + +impl Wtf8Concat for str { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + buf.push_str(self); + } +} + +impl Wtf8Concat for String { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + buf.push_str(self); + } +} + +impl Wtf8Concat for char { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + buf.push_char(*self); + } +} + +impl Wtf8Concat for CodePoint { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + buf.push(*self); + } +} + +/// Wrapper that appends a [`fmt::Display`] value to a [`Wtf8Buf`]. +/// +/// Note: This goes through UTF-8 formatting, so lone surrogates in the +/// display output will be replaced with U+FFFD. Use direct [`Wtf8Concat`] +/// impls for surrogate-preserving concatenation. +#[allow(dead_code)] +pub struct DisplayAsWtf8(pub T); + +impl Wtf8Concat for DisplayAsWtf8 { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + write!(buf, "{}", self.0).unwrap(); + } +} + +macro_rules! impl_wtf8_concat_for_int { + ($($t:ty),*) => { + $(impl Wtf8Concat for $t { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + write!(buf, "{}", self).unwrap(); + } + })* + }; +} + +impl_wtf8_concat_for_int!( + u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize, f32, f64 +); + +impl Wtf8Concat for &T { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + (**self).fmt_wtf8(buf); + } +} + +impl Wtf8Concat for &mut T { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + (**self).fmt_wtf8(buf); + } +} + +impl Wtf8Concat for Box { + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + (**self).fmt_wtf8(buf); + } +} + +impl Wtf8Concat for Cow<'_, T> +where + T: ToOwned, +{ + #[inline] + fn fmt_wtf8(&self, buf: &mut Wtf8Buf) { + (**self).fmt_wtf8(buf); + } +} + +/// Concatenate values into a [`Wtf8Buf`], preserving surrogates. +/// +/// Each argument must implement [`Wtf8Concat`]. String literals (`&str`), +/// [`Wtf8`], [`Wtf8Buf`], [`char`], and [`CodePoint`] are all supported. +/// +/// ``` +/// use rustpython_wtf8::Wtf8Buf; +/// let name = "world"; +/// let result = rustpython_wtf8::wtf8_concat!("hello, ", name, "!"); +/// assert_eq!(result, Wtf8Buf::from("hello, world!")); +/// ``` +#[macro_export] +macro_rules! wtf8_concat { + ($($arg:expr),* $(,)?) => {{ + let mut buf = $crate::Wtf8Buf::new(); + $($crate::Wtf8Concat::fmt_wtf8(&$arg, &mut buf);)* + buf + }}; +} diff --git a/crates/wtf8/src/lib.rs b/crates/wtf8/src/lib.rs index 6614ca83572..772a2879944 100644 --- a/crates/wtf8/src/lib.rs +++ b/crates/wtf8/src/lib.rs @@ -330,6 +330,24 @@ impl Wtf8Buf { } } + pub fn join(sep: impl AsRef, iter: I) -> Wtf8Buf + where + I: IntoIterator, + S: AsRef, + { + let sep = sep.as_ref(); + let mut iter = iter.into_iter(); + let mut buf = match iter.next() { + Some(first) => first.as_ref().to_owned(), + None => return Wtf8Buf::new(), + }; + for part in iter { + buf.push_wtf8(sep); + buf.push_wtf8(part.as_ref()); + } + buf + } + pub fn clear(&mut self) { self.bytes.clear(); } @@ -979,6 +997,28 @@ impl Wtf8 { } } + pub fn to_lowercase(&self) -> Wtf8Buf { + let mut buf = Wtf8Buf::with_capacity(self.len()); + for chunk in self.chunks() { + match chunk { + Wtf8Chunk::Utf8(s) => buf.push_str(&s.to_lowercase()), + Wtf8Chunk::Surrogate(c) => buf.push(c), + } + } + buf + } + + pub fn to_uppercase(&self) -> Wtf8Buf { + let mut buf = Wtf8Buf::with_capacity(self.len()); + for chunk in self.chunks() { + match chunk { + Wtf8Chunk::Utf8(s) => buf.push_str(&s.to_uppercase()), + Wtf8Chunk::Surrogate(c) => buf.push(c), + } + } + buf + } + #[inline] pub const fn is_ascii(&self) -> bool { self.bytes.is_ascii() @@ -1114,23 +1154,23 @@ impl Wtf8 { } } - pub fn ends_with(&self, w: &Wtf8) -> bool { - self.bytes.ends_with_str(w) + pub fn ends_with(&self, w: impl AsRef) -> bool { + self.bytes.ends_with_str(w.as_ref()) } - pub fn starts_with(&self, w: &Wtf8) -> bool { - self.bytes.starts_with_str(w) + pub fn starts_with(&self, w: impl AsRef) -> bool { + self.bytes.starts_with_str(w.as_ref()) } - pub fn strip_prefix(&self, w: &Wtf8) -> Option<&Self> { + pub fn strip_prefix(&self, w: impl AsRef) -> Option<&Self> { self.bytes - .strip_prefix(w.as_bytes()) + .strip_prefix(w.as_ref().as_bytes()) .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) }) } - pub fn strip_suffix(&self, w: &Wtf8) -> Option<&Self> { + pub fn strip_suffix(&self, w: impl AsRef) -> Option<&Self> { self.bytes - .strip_suffix(w.as_bytes()) + .strip_suffix(w.as_ref().as_bytes()) .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) }) } @@ -1520,6 +1560,14 @@ impl From<&Wtf8> for Box { } } +impl<'a> From<&'a str> for &'a Wtf8 { + #[inline] + fn from(s: &'a str) -> &'a Wtf8 { + // Valid UTF-8 is always valid WTF-8 + unsafe { Wtf8::from_bytes_unchecked(s.as_bytes()) } + } +} + impl From<&str> for Box { fn from(s: &str) -> Self { Box::::from(s).into() @@ -1561,3 +1609,6 @@ impl From for Box { s.into_boxed_str().into() } } + +mod concat; +pub use concat::Wtf8Concat; diff --git a/example_projects/wasm32_without_js/rustpython-without-js/src/lib.rs b/example_projects/wasm32_without_js/rustpython-without-js/src/lib.rs index 0a8695fd7fb..a152ebe1c4a 100644 --- a/example_projects/wasm32_without_js/rustpython-without-js/src/lib.rs +++ b/example_projects/wasm32_without_js/rustpython-without-js/src/lib.rs @@ -25,7 +25,7 @@ pub unsafe extern "C" fn eval(s: *const u8, l: usize) -> i32 { Err(_) => return Err(-1), // Python execution error }; let repr_str = match res.repr(vm) { - Ok(repr) => repr.as_str().to_string(), + Ok(repr) => repr.to_string(), Err(_) => return Err(-1), // Failed to get string representation }; Ok(repr_str) diff --git a/src/lib.rs b/src/lib.rs index 60b66d83b3d..07bfc45e59f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,7 +33,7 @@ //! #[pyfunction] //! fn other_thing(s: PyStrRef) -> (String, usize) { //! let new_string = format!("hello from rust, {}!", s); -//! let prev_len = s.as_str().len(); +//! let prev_len = s.byte_len(); //! (new_string, prev_len) //! } //! } @@ -135,7 +135,7 @@ __import__("io").TextIOWrapper( .downcast() .expect("TextIOWrapper.read() should return str"); eprintln!("running get-pip.py..."); - vm.run_string(scope, getpip_code.as_str(), "get-pip.py".to_owned())?; + vm.run_string(scope, getpip_code.expect_str(), "get-pip.py".to_owned())?; Ok(()) } @@ -143,7 +143,7 @@ fn install_pip(installer: InstallPipMode, scope: Scope, vm: &VirtualMachine) -> if !cfg!(feature = "ssl") { return Err(vm.new_exception_msg( vm.ctx.exceptions.system_error.to_owned(), - "install-pip requires rustpython be build with '--features=ssl'".to_owned(), + "install-pip requires rustpython be build with '--features=ssl'".into(), )); } diff --git a/src/shell.rs b/src/shell.rs index d2e54c490a8..6c75e94572c 100644 --- a/src/shell.rs +++ b/src/shell.rs @@ -142,7 +142,7 @@ pub fn run_shell(vm: &VirtualMachine, scope: Scope) -> PyResult<()> { .get_attr(prompt_name, vm) .and_then(|prompt| prompt.str(vm)); let prompt = match prompt { - Ok(ref s) => s.as_str(), + Ok(ref s) => s.expect_str(), Err(_) => "", }; @@ -211,8 +211,10 @@ pub fn run_shell(vm: &VirtualMachine, scope: Scope) -> PyResult<()> { } #[cfg(unix)] ReadlineResult::OsError(num) => { - let os_error = - vm.new_exception_msg(vm.ctx.exceptions.os_error.to_owned(), format!("{num:?}")); + let os_error = vm.new_exception_msg( + vm.ctx.exceptions.os_error.to_owned(), + format!("{num:?}").into(), + ); vm.print_exception(os_error); break; } diff --git a/src/shell/helper.rs b/src/shell/helper.rs index 493dc578469..944e936397d 100644 --- a/src/shell/helper.rs +++ b/src/shell/helper.rs @@ -108,7 +108,7 @@ impl<'vm> ShellHelper<'vm> { .filter(|res| { res.as_ref() .ok() - .is_none_or(|s| s.as_str().starts_with(word_start)) + .is_none_or(|s| s.as_bytes().starts_with(word_start.as_bytes())) }) .collect::, _>>() .ok()?; @@ -120,7 +120,7 @@ impl<'vm> ShellHelper<'vm> { // only the completions that don't start with a '_' let no_underscore = all_completions .iter() - .filter(|&s| !s.as_str().starts_with('_')) + .filter(|&s| !s.as_bytes().starts_with(b"_")) .cloned() .collect::>(); @@ -134,13 +134,13 @@ impl<'vm> ShellHelper<'vm> { }; // sort the completions alphabetically - completions.sort_by(|a, b| std::cmp::Ord::cmp(a.as_str(), b.as_str())); + completions.sort_by(|a, b| a.as_wtf8().cmp(b.as_wtf8())); Some(( startpos, completions .into_iter() - .map(|s| s.as_str().to_owned()) + .map(|s| s.expect_str().to_owned()) .collect(), )) }