# Review notes (patch preamble: this text precedes the first `diff --git` header and belongs to no hunk).
# Summary of what the hunks below show:
#   * Lib/test/test_importlib/source/test_source_encoding.py, Lib/test/test_runpy.py and
#     Lib/test/test_utf8source.py drop `@unittest.expectedFailure` markers from source-encoding
#     tests; test_utf8source.py also drops its CVS header comment and imports
#     `test.tokenizedata.badsyntax_pep3120` instead of `test.badsyntax_pep3120`.
#   * crates/vm/src/stdlib/builtins.rs gains `detect_source_encoding`, `is_utf8_encoding` and
#     `decode_source_bytes`; `compile` calls `decode_source_bytes` instead of `core::str::from_utf8`.
# Doc fix in this revision: the "Decode source bytes to a string ..." doc comment was attached to
#   `is_utf8_encoding` (which only returns a bool); it now sits directly on `decode_source_bytes`.
# NOTE(review): generic arguments look stripped in this copy (`Option`, `PyResult`, `OptionalArg`,
#   `collect::>()`) -- presumably an extraction artifact; confirm against the real patch.
# NOTE(review): `compile` feeds `borrow_bytes()` of a str-or-bytes argument to `decode_source_bytes`,
#   so a coding cookie inside a `str` source appears to be honored as well -- TODO confirm intended.
# NOTE(review): `find_encoding_in_line` inspects only the first "coding" occurrence after '#', and
#   accepts '\r' before '#'; verify both against the PEP 263 cookie regex.
diff --git a/Lib/test/test_importlib/source/test_source_encoding.py b/Lib/test/test_importlib/source/test_source_encoding.py index d65d51d0cca..c09c9aa12b8 100644 --- a/Lib/test/test_importlib/source/test_source_encoding.py +++ b/Lib/test/test_importlib/source/test_source_encoding.py @@ -62,14 +62,12 @@ def test_default_encoding(self): self.run_test(self.source_line.encode('utf-8')) # [encoding first line] - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 17 def test_encoding_on_first_line(self): encoding = 'Latin-1' source = self.create_source(encoding) self.run_test(source) # [encoding second line] - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 34 def test_encoding_on_second_line(self): source = b"#/usr/bin/python\n" + self.create_source('Latin-1') self.run_test(source) @@ -84,7 +82,6 @@ def test_bom_and_utf_8(self): self.run_test(source) # [BOM conflict] - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 20 def test_bom_conflict(self): source = codecs.BOM_UTF8 + self.create_source('latin-1') with self.assertRaises(SyntaxError): diff --git a/Lib/test/test_runpy.py b/Lib/test/test_runpy.py index 1b77b102577..cf7fd581ec3 100644 --- a/Lib/test/test_runpy.py +++ b/Lib/test/test_runpy.py @@ -752,7 +752,6 @@ def test_main_recursion_error(self): with infinite_recursion(25): self.assertRaises(RecursionError, run_path, zip_name) - @unittest.expectedFailure # TODO: RUSTPYTHON; detect encoding comments in files def test_encoding(self): with temp_dir() as script_dir: filename = os.path.join(script_dir, 'script.py') diff --git a/Lib/test/test_utf8source.py b/Lib/test/test_utf8source.py index 602b4b69aac..7336cf00a71 100644 --- a/Lib/test/test_utf8source.py +++ b/Lib/test/test_utf8source.py @@ -1,5 +1,3 @@ -# This file is marked as binary in the CVS, to prevent MacCVS from recoding it. 
- import unittest class PEP3120Test(unittest.TestCase): @@ -14,11 +12,9 @@ def test_pep3120(self): b'\\\xd0\x9f' ) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_badsyntax(self): try: - import test.badsyntax_pep3120 + import test.tokenizedata.badsyntax_pep3120 # noqa: F401 except SyntaxError as msg: msg = str(msg).lower() self.assertTrue('utf-8' in msg) @@ -28,8 +24,6 @@ def test_badsyntax(self): class BuiltinCompileTests(unittest.TestCase): - # TODO: RUSTPYTHON - @unittest.expectedFailure # Issue 3574. def test_latin1(self): # Allow compile() to read Latin-1 source. diff --git a/crates/vm/src/stdlib/builtins.rs b/crates/vm/src/stdlib/builtins.rs index c09e025a44a..2d81f01f305 100644 --- a/crates/vm/src/stdlib/builtins.rs +++ b/crates/vm/src/stdlib/builtins.rs @@ -111,6 +111,145 @@ mod builtins { _feature_version: OptionalArg, } + /// Detect PEP 263 encoding cookie from source bytes. + /// Checks first two lines for `# coding[:=] ` pattern. + /// Returns the encoding name if found, or None for default (UTF-8). + #[cfg(feature = "parser")] + fn detect_source_encoding(source: &[u8]) -> Option { + fn find_encoding_in_line(line: &[u8]) -> Option { + // PEP 263: '#' must be preceded only by whitespace/formfeed + let hash_pos = line.iter().position(|&b| b == b'#')?; + if !line[..hash_pos] + .iter() + .all(|&b| b == b' ' || b == b'\t' || b == b'\x0c' || b == b'\r') + { + return None; + } + let after_hash = &line[hash_pos..]; + + // Find "coding" after the # + let coding_pos = after_hash.windows(6).position(|w| w == b"coding")?; + let after_coding = &after_hash[coding_pos + 6..]; + + // Next char must be ':' or '=' + let rest = if after_coding.first() == Some(&b':') || after_coding.first() == Some(&b'=') + { + &after_coding[1..] 
+ } else { + return None; + }; + + // Skip whitespace + let rest = rest + .iter() + .copied() + .skip_while(|&b| b == b' ' || b == b'\t') + .collect::>(); + + // Read encoding name: [-\w.]+ + let name: String = rest + .iter() + .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b'.') + .map(|&b| b as char) + .collect(); + + if name.is_empty() { None } else { Some(name) } + } + + // Split into lines (first two only) + let mut lines = source.splitn(3, |&b| b == b'\n'); + + if let Some(first) = lines.next() { + // Strip BOM if present + let first = first.strip_prefix(b"\xef\xbb\xbf").unwrap_or(first); + if let Some(enc) = find_encoding_in_line(first) { + return Some(enc); + } + // Only check second line if first line is blank or a comment + let trimmed = first + .iter() + .skip_while(|&&b| b == b' ' || b == b'\t' || b == b'\x0c' || b == b'\r') + .copied() + .collect::>(); + if !trimmed.is_empty() && trimmed[0] != b'#' { + return None; + } + } + + lines.next().and_then(find_encoding_in_line) + } + + /// Check if an encoding name is a UTF-8 variant after normalization. + /// Matches: utf-8, utf_8, utf8, UTF-8, etc. 
+ #[cfg(feature = "parser")] + fn is_utf8_encoding(name: &str) -> bool { + let normalized: String = name.chars().filter(|&c| c != '-' && c != '_').collect(); + normalized.eq_ignore_ascii_case("utf8") + } + + /// Decode source bytes to a string, handling PEP 263 encoding declarations + /// and BOM. Raises SyntaxError for invalid UTF-8 without an encoding + /// declaration (matching CPython behavior). + #[cfg(feature = "parser")] + fn decode_source_bytes(source: &[u8], filename: &str, vm: &VirtualMachine) -> PyResult { + let has_bom = source.starts_with(b"\xef\xbb\xbf"); + let encoding = detect_source_encoding(source); + + let is_utf8 = encoding.as_deref().is_none_or(is_utf8_encoding); + + // Validate BOM + encoding combination + if has_bom && !is_utf8 { + return Err(vm.new_exception_msg( + vm.ctx.exceptions.syntax_error.to_owned(), + format!("encoding problem for '{filename}': utf-8").into(), + )); + } + + if is_utf8 { + let src = if has_bom { &source[3..] } else { source }; + match core::str::from_utf8(src) { + Ok(s) => Ok(s.to_owned()), + Err(e) => { + let bad_byte = src[e.valid_up_to()]; + let line = src[..e.valid_up_to()] + .iter() + .filter(|&&b| b == b'\n') + .count() + + 1; + Err(vm.new_exception_msg( + vm.ctx.exceptions.syntax_error.to_owned(), + format!( + "Non-UTF-8 code starting with '\\x{bad_byte:02x}' \ + on line {line}, but no encoding declared; \ + see https://peps.python.org/pep-0263/ for details \ + ({filename}, line {line})" + ) + .into(), + )) + } + } + } else { + // Use codec registry for non-UTF-8 encodings + let enc = encoding.as_deref().unwrap(); + let bytes_obj = vm.ctx.new_bytes(source.to_vec()); + let decoded = vm + .state + .codec_registry + .decode_text(bytes_obj.into(), enc, None, vm) + .map_err(|exc| { + if exc.fast_isinstance(vm.ctx.exceptions.lookup_error) { + vm.new_exception_msg( + vm.ctx.exceptions.syntax_error.to_owned(), + format!("unknown encoding for '{filename}': {enc}").into(), + ) + } else { + exc + } + })?; + Ok(decoded.to_string_lossy().into_owned()) + } + } + #[cfg(any(feature = "parser", feature = "compiler"))] #[pyfunction] fn compile(args: CompileArgs, vm: &VirtualMachine) -> PyResult { @@ -203,9 +342,8 @@ mod 
builtins { let source = ArgStrOrBytesLike::try_from_object(vm, args.source)?; let source = source.borrow_bytes(); - // TODO: compiler::compile should probably get bytes - let source = core::str::from_utf8(&source) - .map_err(|e| vm.new_unicode_decode_error(e.to_string()))?; + let source = decode_source_bytes(&source, &args.filename.to_string_lossy(), vm)?; + let source = source.as_str(); let flags = args.flags.map_or(Ok(0), |v| v.try_to_primitive(vm))?;