diff --git a/Lib/_pycodecs.py b/Lib/_pycodecs.py index 933d0e2ac71..4068bd56693 100644 --- a/Lib/_pycodecs.py +++ b/Lib/_pycodecs.py @@ -54,7 +54,8 @@ 'unicode_internal_encode', 'unicode_internal_decode', 'utf_16_ex_decode', 'escape_decode', 'charmap_decode', 'utf_7_encode', 'mbcs_encode', 'ascii_encode', 'utf_16_encode', 'raw_unicode_escape_encode', 'utf_8_encode', - 'utf_16_le_encode', 'utf_16_be_encode', 'utf_16_le_decode', 'utf_16_be_decode',] + 'utf_16_le_encode', 'utf_16_be_encode', 'utf_16_le_decode', 'utf_16_be_decode', + 'utf_32_ex_decode',] import sys import warnings @@ -100,12 +101,12 @@ def raw_unicode_escape_decode( data, errors='strict', final=False): res = ''.join(res) return res, len(data) -def utf_7_decode( data, errors='strict'): +def utf_7_decode( data, errors='strict', final=False): """None """ - res = PyUnicode_DecodeUTF7(data, len(data), errors) + res, consumed = PyUnicode_DecodeUTF7(data, len(data), errors, final) res = ''.join(res) - return res, len(data) + return res, consumed def unicode_escape_encode( obj, errors='strict'): """None @@ -225,6 +226,45 @@ def utf_16_ex_decode( data, errors='strict', byteorder=0, final=0): res = ''.join(res) return res, consumed, byteorder +def utf_32_ex_decode( data, errors='strict', byteorder=0, final=0): + """None + """ + if byteorder == 0: + if len(data) < 4: + if final and len(data): + if sys.byteorder == 'little': + bm = 'little' + else: + bm = 'big' + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, bm, final + ) + return ''.join(res), consumed, 0 + return '', 0, 0 + if data[0:4] == b'\xff\xfe\x00\x00': + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data[4:], len(data) - 4, errors, 'little', final + ) + return ''.join(res), consumed + 4, -1 + if data[0:4] == b'\x00\x00\xfe\xff': + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data[4:], len(data) - 4, errors, 'big', final + ) + return ''.join(res), consumed + 4, 1 + if sys.byteorder == 'little': + bm = 'little' + 
else: + bm = 'big' + res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, bm, final) + return ''.join(res), consumed, 0 + + if byteorder == -1: + res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, 'little', final) + return ''.join(res), consumed, -1 + + res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, 'big', final) + return ''.join(res), consumed, 1 + # XXX needs error messages when the input is invalid def escape_decode(data, errors='strict'): """None @@ -336,22 +376,12 @@ def utf_16_be_encode( obj, errors='strict'): res = bytes(res) return res, len(obj) -def utf_16_le_decode( data, errors='strict', byteorder=0, final = 0): - """None - """ - consumed = len(data) - if final: - consumed = 0 +def utf_16_le_decode(data, errors='strict', final=0): res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'little', final) res = ''.join(res) return res, consumed -def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0): - """None - """ - consumed = len(data) - if final: - consumed = 0 +def utf_16_be_decode(data, errors='strict', final=0): res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'big', final) res = ''.join(res) return res, consumed @@ -379,34 +409,41 @@ def PyUnicode_EncodeUTF32(s, size, errors, byteorder='little'): # Add BOM for native encoding p += STORECHAR32(0xFEFF, bom) - if size == 0: - return [] - if byteorder == 'little': bom = 'little' elif byteorder == 'big': bom = 'big' - for c in s: - ch = ord(c) - # UTF-32 doesn't need surrogate pairs, each character is encoded directly - p += STORECHAR32(ch, bom) + pos = 0 + while pos < len(s): + ch = ord(s[pos]) + if 0xD800 <= ch <= 0xDFFF: + if errors == 'surrogatepass': + p += STORECHAR32(ch, bom) + pos += 1 + else: + res, pos = unicode_call_errorhandler( + errors, 'utf-32', 'surrogates not allowed', + s, pos, pos + 1, False) + for c in res: + p += STORECHAR32(ord(c), bom) + 
else: + p += STORECHAR32(ch, bom) + pos += 1 return p def utf_32_encode(obj, errors='strict'): """UTF-32 encoding with BOM.""" - res = PyUnicode_EncodeUTF32(obj, len(obj), errors, 'native') - res = bytes(res) - return res, len(obj) + encoded = PyUnicode_EncodeUTF32(obj, len(obj), errors, 'native') + return bytes(encoded), len(obj) def utf_32_le_encode(obj, errors='strict'): """UTF-32 little-endian encoding without BOM.""" - res = PyUnicode_EncodeUTF32(obj, len(obj), errors, 'little') - res = bytes(res) - return res, len(obj) + encoded = PyUnicode_EncodeUTF32(obj, len(obj), errors, 'little') + return bytes(encoded), len(obj) def utf_32_be_encode(obj, errors='strict'): @@ -421,26 +458,11 @@ def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder='little', final= if size == 0: return [], 0, 0 - if size % 4 != 0: - if not final: - # Incomplete data, return what we can decode - size = (size // 4) * 4 - if size == 0: - return [], 0, 0 - else: - # Final data must be complete - if errors == 'strict': - raise UnicodeDecodeError('utf-32', bytes(data), size - (size % 4), size, - 'truncated data') - elif errors == 'ignore': - size = (size // 4) * 4 - elif errors == 'replace': - size = (size // 4) * 4 - result = [] pos = 0 + aligned_size = (size // 4) * 4 - while pos + 3 < size: + while pos + 3 < aligned_size: if byteorder == 'little': ch = data[pos] | (data[pos+1] << 8) | (data[pos+2] << 16) | (data[pos+3] << 24) else: # big-endian @@ -454,10 +476,28 @@ def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder='little', final= elif errors == 'replace': result.append('\ufffd') # 'ignore' - skip this character + pos += 4 + elif 0xD800 <= ch <= 0xDFFF: + if errors == 'surrogatepass': + result.append(chr(ch)) + pos += 4 + else: + msg = 'code point in surrogate code point range(0xd800, 0xe000)' + res, pos = unicode_call_errorhandler( + errors, 'utf-32', msg, data, pos, pos + 4, True) + result.append(res) else: result.append(chr(ch)) + pos += 4 - pos += 4 + # Handle 
trailing incomplete bytes + if pos < size: + if final: + res, pos = unicode_call_errorhandler( + errors, 'utf-32', 'truncated data', + data, pos, size, True) + if res: + result.append(res) return result, pos, 0 @@ -519,7 +559,7 @@ def utf_32_be_decode(data, errors='strict', final=0): utf7_special = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, + 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, @@ -557,162 +597,214 @@ def ENCODE( ch, bits) : bits -= 6 return out, bits -def PyUnicode_DecodeUTF7(s, size, errors): +def _IS_BASE64(ch): + return (ord('A') <= ch <= ord('Z')) or (ord('a') <= ch <= ord('z')) or \ + (ord('0') <= ch <= ord('9')) or ch == ord('+') or ch == ord('/') - starts = s - errmsg = "" - inShift = 0 - bitsleft = 0 - charsleft = 0 - surrogate = 0 - p = [] - errorHandler = None - exc = None +def _FROM_BASE64(ch): + if ch == ord('+'): return 62 + if ch == ord('/'): return 63 + if ch >= ord('a'): return ch - 71 + if ch >= ord('A'): return ch - 65 + if ch >= ord('0'): return ch - ord('0') + 52 + return -1 - if (size == 0): - return '' +def _DECODE_DIRECT(ch): + return ch <= 127 and ch != ord('+') + +def PyUnicode_DecodeUTF7(s, size, errors, final=False): + if size == 0: + return [], 0 + + p = [] + inShift = False + base64bits = 0 + base64buffer = 0 + surrogate = 0 + startinpos = 0 + shiftOutStart = 0 i = 0 + while i < size: - - ch = bytes([s[i]]) - if (inShift): - if ((ch == b'-') or not B64CHAR(ch)): - inShift = 0 + ch = s[i] + if inShift: + if _IS_BASE64(ch): + base64buffer = (base64buffer << 6) | _FROM_BASE64(ch) + base64bits += 6 i += 1 - - while (bitsleft >= 16): - outCh = ((charsleft) >> (bitsleft-16)) & 0xffff - bitsleft -= 16 - - if (surrogate): - ## We have already generated an error for the high surrogate - 
## so let's not bother seeing if the low surrogate is correct or not - surrogate = 0 - elif (0xDC00 <= (outCh) and (outCh) <= 0xDFFF): - ## This is a surrogate pair. Unfortunately we can't represent - ## it in a 16-bit character - surrogate = 1 - msg = "code pairs are not supported" - out, x = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i) - p.append(out) - bitsleft = 0 - break + if base64bits >= 16: + outCh = (base64buffer >> (base64bits - 16)) & 0xffff + base64bits -= 16 + base64buffer &= (1 << base64bits) - 1 + if surrogate: + if 0xDC00 <= outCh <= 0xDFFF: + ch2 = 0x10000 + ((surrogate - 0xD800) << 10) + (outCh - 0xDC00) + p.append(chr(ch2)) + surrogate = 0 + continue + else: + p.append(chr(surrogate)) + surrogate = 0 + if 0xD800 <= outCh <= 0xDBFF: + surrogate = outCh else: - p.append(chr(outCh )) - #p += out - if (bitsleft >= 6): -## /* The shift sequence has a partial character in it. If -## bitsleft < 6 then we could just classify it as padding -## but that is not the case here */ - msg = "partial character in shift sequence" - out, x = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i) - -## /* According to RFC2152 the remaining bits should be zero. We -## choose to signal an error/insert a replacement character -## here so indicate the potential of a misencoded character. 
*/ - -## /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ -## if (bitsleft and (charsleft << (sizeof(charsleft) * 8 - bitsleft))): -## raise UnicodeDecodeError, "non-zero padding bits in shift sequence" - if (ch == b'-') : - if ((i < size) and (s[i] == '-')) : - p += '-' - inShift = 1 - - elif SPECIAL(ch, 0, 0) : - raise UnicodeDecodeError("unexpected special character") - - else: - p.append(chr(ord(ch))) + p.append(chr(outCh)) else: - charsleft = (charsleft << 6) | UB64(ch) - bitsleft += 6 - i += 1 -## /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); - elif ( ch == b'+' ): + inShift = False + if base64bits > 0: + if base64bits >= 6: + i += 1 + errmsg = "partial character in shift sequence" + out, i = unicode_call_errorhandler( + errors, 'utf-7', errmsg, s, startinpos, i) + p.append(out) + continue + else: + if base64buffer != 0: + i += 1 + errmsg = "non-zero padding bits in shift sequence" + out, i = unicode_call_errorhandler( + errors, 'utf-7', errmsg, s, startinpos, i) + p.append(out) + continue + if surrogate and _DECODE_DIRECT(ch): + p.append(chr(surrogate)) + surrogate = 0 + if ch == ord('-'): + i += 1 + elif ch == ord('+'): startinpos = i i += 1 - if (i= 6 or (base64bits > 0 and base64buffer != 0): + errmsg = "unterminated shift sequence" + out, i = unicode_call_errorhandler( + errors, 'utf-7', errmsg, s, startinpos, size) + p.append(out) + + return p, size + +def _ENCODE_DIRECT(ch, encodeSetO, encodeWhiteSpace): + c = ord(ch) if isinstance(ch, str) else ch + if c > 127: + return False + if utf7_special[c] == 0: + return True + if utf7_special[c] == 2: + return not encodeWhiteSpace + if utf7_special[c] == 3: + return not encodeSetO + return False def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors): - -# /* It might be possible to tighten this worst case */ inShift = False - i = 0 - bitsleft = 0 - charsleft = 0 + base64bits = 0 + base64buffer = 0 out = [] - for ch in s: - if (not 
inShift) : - if (ch == '+'): - out.append(b'+-') - elif (SPECIAL(ch, encodeSetO, encodeWhiteSpace)): - charsleft = ord(ch) - bitsleft = 16 - out.append(b'+') - p, bitsleft = ENCODE( charsleft, bitsleft) - out.append(p) - inShift = bitsleft > 0 + + for i, ch in enumerate(s): + ch_ord = ord(ch) + if inShift: + if _ENCODE_DIRECT(ch, encodeSetO, encodeWhiteSpace): + # shifting out + if base64bits: + out.append(B64(base64buffer << (6 - base64bits))) + base64buffer = 0 + base64bits = 0 + inShift = False + if B64CHAR(ch) or ch == '-': + out.append(b'-') + out.append(bytes([ch_ord])) else: - out.append(bytes([ord(ch)])) + # encode character in base64 + if ch_ord >= 0x10000: + # split into surrogate pair + hi = 0xD800 | ((ch_ord - 0x10000) >> 10) + lo = 0xDC00 | ((ch_ord - 0x10000) & 0x3FF) + base64bits += 16 + base64buffer = (base64buffer << 16) | hi + while base64bits >= 6: + out.append(B64(base64buffer >> (base64bits - 6))) + base64bits -= 6 + base64buffer &= (1 << base64bits) - 1 if base64bits else 0 + ch_ord = lo + + base64bits += 16 + base64buffer = (base64buffer << 16) | ch_ord + while base64bits >= 6: + out.append(B64(base64buffer >> (base64bits - 6))) + base64bits -= 6 + base64buffer &= (1 << base64bits) - 1 if base64bits else 0 else: - if (not SPECIAL(ch, encodeSetO, encodeWhiteSpace)): - out.append(B64((charsleft) << (6-bitsleft))) - charsleft = 0 - bitsleft = 0 -## /* Characters not in the BASE64 set implicitly unshift the sequence -## so no '-' is required, except if the character is itself a '-' */ - if (B64CHAR(ch) or ch == '-'): - out.append(b'-') - inShift = False - out.append(bytes([ord(ch)])) + if ch == '+': + out.append(b'+-') + elif _ENCODE_DIRECT(ch, encodeSetO, encodeWhiteSpace): + out.append(bytes([ch_ord])) else: - bitsleft += 16 - charsleft = (((charsleft) << 16) | ord(ch)) - p, bitsleft = ENCODE(charsleft, bitsleft) - out.append(p) -## /* If the next character is special then we dont' need to terminate -## the shift sequence. 
If the next character is not a BASE64 character -## or '-' then the shift sequence will be terminated implicitly and we -## don't have to insert a '-'. */ - - if (bitsleft == 0): - if (i + 1 < size): - ch2 = s[i+1] - - if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)): - pass - elif (B64CHAR(ch2) or ch2 == '-'): - out.append(b'-') - inShift = False - else: + out.append(b'+') + inShift = True + # encode character in base64 + if ch_ord >= 0x10000: + hi = 0xD800 | ((ch_ord - 0x10000) >> 10) + lo = 0xDC00 | ((ch_ord - 0x10000) & 0x3FF) + base64bits += 16 + base64buffer = (base64buffer << 16) | hi + while base64bits >= 6: + out.append(B64(base64buffer >> (base64bits - 6))) + base64bits -= 6 + base64buffer &= (1 << base64bits) - 1 if base64bits else 0 + ch_ord = lo + + base64bits += 16 + base64buffer = (base64buffer << 16) | ch_ord + while base64bits >= 6: + out.append(B64(base64buffer >> (base64bits - 6))) + base64bits -= 6 + base64buffer &= (1 << base64bits) - 1 if base64bits else 0 + + if base64bits == 0: + if i + 1 < size: + ch2 = s[i + 1] + if _ENCODE_DIRECT(ch2, encodeSetO, encodeWhiteSpace): + if B64CHAR(ch2) or ch2 == '-': + out.append(b'-') inShift = False else: out.append(b'-') inShift = False - i += 1 - - if (bitsleft): - out.append(B64(charsleft << (6-bitsleft) ) ) + + if base64bits: + out.append(B64(base64buffer << (6 - base64bits))) + if inShift: out.append(b'-') return out @@ -879,55 +971,66 @@ def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=Tru ilo = 1 while (q < len(s)): - + #/* remaining bytes at the end? 
(size should be even) */ - if (len(s)-q<2): + if (len(s) - q < 2): if not final: break - errmsg = "truncated data" - startinpos = q - endinpos = len(s) - unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True) -# /* The remaining input chars are ignored if the callback -## chooses to skip the input */ - + res, q = unicode_call_errorhandler( + errors, 'utf-16', "truncated data", + s, q, len(s), True) + p.append(res) + break + ch = (s[q+ihi] << 8) | s[q+ilo] - q += 2 - + if (ch < 0xD800 or ch > 0xDFFF): p.append(chr(ch)) - continue - - #/* UTF-16 code pair: */ - if (q >= len(s)): - errmsg = "unexpected end of data" - startinpos = q-2 - endinpos = len(s) - unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True) - - if (0xD800 <= ch and ch <= 0xDBFF): - ch2 = (s[q+ihi] << 8) | s[q+ilo] q += 2 - if (0xDC00 <= ch2 and ch2 <= 0xDFFF): - #ifndef Py_UNICODE_WIDE - if sys.maxunicode < 65536: - p += [chr(ch), chr(ch2)] + continue + + #/* UTF-16 code pair: high surrogate */ + if (0xD800 <= ch <= 0xDBFF): + if (q + 4 <= len(s)): + ch2 = (s[q+2+ihi] << 8) | s[q+2+ilo] + if (0xDC00 <= ch2 <= 0xDFFF): + # Valid surrogate pair - always assemble + p.append(chr((((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000)) + q += 4 + continue else: - p.append(chr((((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000)) - #endif + # High surrogate followed by non-low-surrogate + if errors == 'surrogatepass': + p.append(chr(ch)) + q += 2 + continue + res, q = unicode_call_errorhandler( + errors, 'utf-16', "illegal UTF-16 surrogate", + s, q, q + 2, True) + p.append(res) + else: + # High surrogate at end of data + if not final: + break + if errors == 'surrogatepass': + p.append(chr(ch)) + q += 2 + continue + res, q = unicode_call_errorhandler( + errors, 'utf-16', "unexpected end of data", + s, q, len(s), True) + p.append(res) + else: + # Low surrogate without preceding high surrogate + if errors == 'surrogatepass': + p.append(chr(ch)) + q += 2 continue + 
res, q = unicode_call_errorhandler( + errors, 'utf-16', "illegal encoding", + s, q, q + 2, True) + p.append(res) - else: - errmsg = "illegal UTF-16 surrogate" - startinpos = q-4 - endinpos = startinpos+2 - unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True) - - errmsg = "illegal encoding" - startinpos = q-2 - endinpos = startinpos+2 - unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True) - return p, q, bo # moved out of local scope, especially because it didn't @@ -953,25 +1056,40 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'): bom = sys.byteorder p += STORECHAR(0xFEFF, bom) - if (size == 0): - return [] - if (byteorder == 'little' ): bom = 'little' elif (byteorder == 'big'): bom = 'big' - - for c in s: - ch = ord(c) - ch2 = 0 - if (ch >= 0x10000) : - ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF) - ch = 0xD800 | ((ch-0x10000) >> 10) - - p += STORECHAR(ch, bom) - if (ch2): - p += STORECHAR(ch2, bom) + pos = 0 + while pos < len(s): + ch = ord(s[pos]) + if 0xD800 <= ch <= 0xDFFF: + if errors == 'surrogatepass': + p += STORECHAR(ch, bom) + pos += 1 + else: + res, pos = unicode_call_errorhandler( + errors, 'utf-16', 'surrogates not allowed', + s, pos, pos + 1, False) + for c in res: + cp = ord(c) + cp2 = 0 + if cp >= 0x10000: + cp2 = 0xDC00 | ((cp - 0x10000) & 0x3FF) + cp = 0xD800 | ((cp - 0x10000) >> 10) + p += STORECHAR(cp, bom) + if cp2: + p += STORECHAR(cp2, bom) + else: + ch2 = 0 + if ch >= 0x10000: + ch2 = 0xDC00 | ((ch - 0x10000) & 0x3FF) + ch = 0xD800 | ((ch - 0x10000) >> 10) + p += STORECHAR(ch, bom) + if ch2: + p += STORECHAR(ch2, bom) + pos += 1 return p @@ -991,7 +1109,7 @@ def unicode_call_errorhandler(errors, encoding, else: exceptionObject = UnicodeEncodeError(encoding, input, startinpos, endinpos, reason) res = errorHandler(exceptionObject) - if isinstance(res, tuple) and isinstance(res[0], str) and isinstance(res[1], int): + if isinstance(res, tuple) and isinstance(res[0], (str, 
bytes)) and isinstance(res[1], int): newpos = res[1] if (newpos < 0): newpos = len(input) + newpos @@ -1041,7 +1159,11 @@ def unicode_encode_ucs1(p, size, errors, limit): while collend < len(p) and ord(p[collend]) >= limit: collend += 1 x = unicode_call_errorhandler(errors, encoding, reason, p, collstart, collend, False) - res += x[0].encode() + replacement = x[0] + if isinstance(replacement, bytes): + res += replacement + else: + res += replacement.encode() pos = x[1] return res @@ -1258,12 +1380,16 @@ def PyUnicode_EncodeCharmap(p, size, mapping='latin-1', errors='strict'): except KeyError: x = unicode_call_errorhandler(errors, "charmap", "character maps to ", p, inpos, inpos+1, False) - try: - for y in x[0]: - res += charmapencode_output(ord(y), mapping) - except KeyError: - raise UnicodeEncodeError("charmap", p, inpos, inpos+1, - "character maps to ") + replacement = x[0] + if isinstance(replacement, bytes): + res += list(replacement) + else: + try: + for y in replacement: + res += charmapencode_output(ord(y), mapping) + except KeyError: + raise UnicodeEncodeError("charmap", p, inpos, inpos+1, + "character maps to ") inpos += 1 return res diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index f9075b8f0d9..21c4ce14852 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -172,3 +172,23 @@ def _alias_mbcs(encoding): pass codecs.register(_alias_mbcs) + + from ._win_cp_codecs import create_win32_code_page_codec + + def win32_code_page_search_function(encoding): + encoding = encoding.lower() + if not encoding.startswith('cp'): + return None + try: + cp = int(encoding[2:]) + except ValueError: + return None + # Test if the code page is supported + try: + codecs.code_page_encode(cp, 'x') + except (OverflowError, OSError): + return None + + return create_win32_code_page_codec(cp) + + codecs.register(win32_code_page_search_function) diff --git a/Lib/encodings/_win_cp_codecs.py b/Lib/encodings/_win_cp_codecs.py new file mode 100644 
index 00000000000..4f8eb886794 --- /dev/null +++ b/Lib/encodings/_win_cp_codecs.py @@ -0,0 +1,36 @@ +import codecs + +def create_win32_code_page_codec(cp): + from codecs import code_page_encode, code_page_decode + + def encode(input, errors='strict'): + return code_page_encode(cp, input, errors) + + def decode(input, errors='strict'): + return code_page_decode(cp, input, errors, True) + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return code_page_encode(cp, input, self.errors)[0] + + class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + return code_page_decode(cp, input, errors, final) + + class StreamWriter(codecs.StreamWriter): + def encode(self, input, errors='strict'): + return code_page_encode(cp, input, errors) + + class StreamReader(codecs.StreamReader): + def decode(self, input, errors, final): + return code_page_decode(cp, input, errors, final) + + return codecs.CodecInfo( + name=f'cp{cp}', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_bz2.py b/Lib/test/test_bz2.py index 26b5e79d337..148d8f98c79 100644 --- a/Lib/test/test_bz2.py +++ b/Lib/test/test_bz2.py @@ -730,7 +730,7 @@ def testOpenBytesFilename(self): self.assertEqual(f.read(), self.DATA) self.assertEqual(f.name, str_filename) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: != 'Z:\\TEMP\\tmphoipjcen' def testOpenPathLikeFilename(self): filename = FakePath(self.filename) with BZ2File(filename, "wb") as f: @@ -1189,7 +1189,6 @@ def test_encoding_error_handler(self): as f: self.assertEqual(f.read(), "foobar") - @unittest.expectedFailure # TODO: RUSTPYTHON def test_newline(self): # Test with explicit newline (universal newline mode disabled). 
text = self.TEXT.decode("ascii") diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 3d64c97bd16..232121b6210 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -465,7 +465,6 @@ class UTF32Test(ReadTest, unittest.TestCase): b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_only_one_bom(self): _,_,reader,writer = codecs.lookup(self.encoding) # encode some stream @@ -481,7 +480,6 @@ def test_only_one_bom(self): f = reader(s) self.assertEqual(f.read(), "spamspam") - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_badbom(self): s = io.BytesIO(4*b"\xff") f = codecs.getreader(self.encoding)(s) @@ -491,7 +489,6 @@ def test_badbom(self): f = codecs.getreader(self.encoding)(s) self.assertRaises(UnicodeDecodeError, f.read) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_partial(self): self.check_partial( "\x00\xff\u0100\uffff\U00010000", @@ -523,7 +520,6 @@ def test_partial(self): ] ) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_handlers(self): self.assertEqual(('\ufffd', 1), codecs.utf_32_decode(b'\x01', 'replace', True)) @@ -534,7 +530,6 @@ def test_errors(self): self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, b"\xff", "strict", True) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? 
def test_decoder_state(self): self.check_state_handling_decode(self.encoding, "spamspam", self.spamle) @@ -551,35 +546,24 @@ def test_issue8941(self): self.assertEqual('\U00010000' * 1024, codecs.utf_32_decode(encoded_be)[0]) - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_lone_surrogates(self): - return super().test_lone_surrogates() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_bug1098990_a(self): return super().test_bug1098990_a() - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_bug1098990_b(self): return super().test_bug1098990_b() - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_bug1175396(self): return super().test_bug1175396() - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_incremental_surrogatepass(self): return super().test_incremental_surrogatepass() - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_mixed_readline_and_read(self): return super().test_mixed_readline_and_read() - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_readline(self): return super().test_readline() - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? 
def test_readlinequeue(self): return super().test_readlinequeue() @@ -636,10 +620,6 @@ def test_issue8941(self): self.assertEqual('\U00010000' * 1024, codecs.utf_32_le_decode(encoded)[0]) - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_lone_surrogates(self): - return super().test_lone_surrogates() - @@ -693,10 +673,6 @@ def test_issue8941(self): self.assertEqual('\U00010000' * 1024, codecs.utf_32_be_decode(encoded)[0]) - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_lone_surrogates(self): - return super().test_lone_surrogates() - @@ -739,7 +715,6 @@ def test_badbom(self): f = codecs.getreader(self.encoding)(s) self.assertRaises(UnicodeDecodeError, f.read) - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: 'utf-16' codec can't decode bytes in position 0-1: unexpected end of data def test_partial(self): self.check_partial( "\x00\xff\u0100\uffff\U00010000", @@ -761,7 +736,6 @@ def test_partial(self): ] ) - @unittest.expectedFailure # TODO: RUSTPYTHON; IndexError: index out of range def test_handlers(self): self.assertEqual(('\ufffd', 1), codecs.utf_16_decode(b'\x01', 'replace', True)) @@ -805,11 +779,6 @@ def test_invalid_modes(self): self.assertIn("can't have text and binary mode at once", str(cm.exception)) - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_lone_surrogates(self): - return super().test_lone_surrogates() - - @unittest.expectedFailure # TODO: RUSTPYTHON; IndexError: index out of range def test_incremental_surrogatepass(self): return super().test_incremental_surrogatepass() @@ -819,7 +788,6 @@ class UTF16LETest(ReadTest, unittest.TestCase): encoding = "utf-16-le" ill_formed_sequence = b"\x80\xdc" - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: 'utf-16' codec can't decode bytes in position 0-1: unexpected end of data def test_partial(self): self.check_partial( "\x00\xff\u0100\uffff\U00010000", @@ -839,7 +807,6 @@ def test_partial(self): ] ) - @unittest.expectedFailure # TODO: RUSTPYTHON def 
test_errors(self): tests = [ (b'\xff', '\ufffd'), @@ -861,11 +828,6 @@ def test_nonbmp(self): self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding), "\U00010203") - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_lone_surrogates(self): - return super().test_lone_surrogates() - - @unittest.expectedFailure # TODO: RUSTPYTHON; IndexError: index out of range def test_incremental_surrogatepass(self): return super().test_incremental_surrogatepass() @@ -874,7 +836,6 @@ class UTF16BETest(ReadTest, unittest.TestCase): encoding = "utf-16-be" ill_formed_sequence = b"\xdc\x80" - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: 'utf-16' codec can't decode bytes in position 0-1: unexpected end of data def test_partial(self): self.check_partial( "\x00\xff\u0100\uffff\U00010000", @@ -894,7 +855,6 @@ def test_partial(self): ] ) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_errors(self): tests = [ (b'\xff', '\ufffd'), @@ -916,11 +876,6 @@ def test_nonbmp(self): self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), "\U00010203") - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_lone_surrogates(self): - return super().test_lone_surrogates() - - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: 'utf-16' codec can't decode bytes in position 0-1: unexpected end of data def test_incremental_surrogatepass(self): return super().test_incremental_surrogatepass() @@ -970,7 +925,6 @@ def test_decode_error(self): self.assertEqual(data.decode(self.encoding, error_handler), expected) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_lone_surrogates(self): super().test_lone_surrogates() # not sure if this is making sense for @@ -1023,7 +977,6 @@ def test_incremental_errors(self): class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7" - @unittest.expectedFailure # TODO: RUSTPYTHON def test_ascii(self): # Set D (directly encoded characters) set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' @@ -1050,7 +1003,6 @@ def 
test_ascii(self): b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') - @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: expected at least 5 arguments, got 1 def test_partial(self): self.check_partial( 'a+-b\x00c\x80d\u0100e\U00010000f', @@ -1090,7 +1042,6 @@ def test_partial(self): ] ) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_errors(self): tests = [ (b'\xffb', '\ufffdb'), @@ -1121,7 +1072,6 @@ def test_errors(self): raw, 'strict', True) self.assertEqual(raw.decode('utf-7', 'replace'), expected) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_nonbmp(self): self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') @@ -1137,7 +1087,6 @@ def test_nonbmp(self): self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding), '\u20ac\u20ac\U000104A0') - @unittest.expectedFailure # TODO: RUSTPYTHON def test_lone_surrogates(self): tests = [ (b'a+2AE-b', 'a\ud801b'), @@ -1158,15 +1107,9 @@ def test_lone_surrogates(self): with self.subTest(raw=raw): self.assertEqual(raw.decode('utf-7', 'replace'), expected) - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_bug1175396(self): - return super().test_bug1175396() - - @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: expected at least 5 arguments, got 1 def test_readline(self): return super().test_readline() - @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: utf_7_decode() takes from 1 to 2 positional arguments but 3 were given def test_incremental_surrogatepass(self): return super().test_incremental_surrogatepass() @@ -3062,7 +3005,6 @@ def test_latin1(self): class BomTest(unittest.TestCase): - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? 
def test_seek0(self): data = "1234567890" tests = ("utf-16", @@ -3457,7 +3399,6 @@ def test_invalid_code_page(self): self.assertRaises(OSError, codecs.code_page_encode, 123, 'a') self.assertRaises(OSError, codecs.code_page_decode, 123, b'a') - @unittest.expectedFailure # TODO: RUSTPYTHON def test_code_page_name(self): self.assertRaisesRegex(UnicodeEncodeError, 'cp932', codecs.code_page_encode, 932, '\xff') @@ -3524,7 +3465,7 @@ def check_encode(self, cp, tests): self.assertRaises(UnicodeEncodeError, text.encode, f'cp{cp}', errors) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON def test_cp932(self): self.check_encode(932, ( ('abc', 'strict', b'abc'), @@ -3559,7 +3500,6 @@ def test_cp932(self): (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'), )) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_cp1252(self): self.check_encode(1252, ( ('abc', 'strict', b'abc'), @@ -3578,7 +3518,6 @@ def test_cp1252(self): (b'\xff', 'strict', '\xff'), )) - @unittest.expectedFailureIfWindows("TODO: RUSTPYTHON") def test_cp708(self): self.check_encode(708, ( ('abc2%', 'strict', b'abc2%'), @@ -3608,7 +3547,6 @@ def test_cp708(self): (b'[\xa0]', 'surrogatepass', None), )) - @unittest.expectedFailureIfWindows("TODO: RUSTPYTHON") def test_cp20106(self): self.check_encode(20106, ( ('abc', 'strict', b'abc'), @@ -3633,7 +3571,7 @@ def test_cp20106(self): (b'(\xbf)', 'surrogatepass', None), )) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON # TODO: RUSTPYTHON def test_cp_utf7(self): cp = 65000 self.check_encode(cp, ( @@ -3654,7 +3592,6 @@ def test_cp_utf7(self): (b'[\xff]', 'strict', '[\xff]'), )) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_multibyte_encoding(self): self.check_decode(932, ( (b'\x84\xe9\x80', 'ignore', '\u9a3e'), @@ -3688,7 +3625,6 @@ def test_code_page_decode_flags(self): self.assertEqual(codecs.code_page_decode(42, b'abc'), ('\uf061\uf062\uf063', 3)) - 
@unittest.expectedFailure # TODO: RUSTPYTHON def test_incremental(self): decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) self.assertEqual(decoded, ('', 0)) diff --git a/Lib/test/test_fileinput.py b/Lib/test/test_fileinput.py index 1a6ef3cd275..b340ef7ed16 100644 --- a/Lib/test/test_fileinput.py +++ b/Lib/test/test_fileinput.py @@ -980,8 +980,6 @@ def check(errors, expected_lines): check('replace', ['\ufffdabc']) check('backslashreplace', ['\\x80abc']) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_modes(self): with open(TESTFN, 'wb') as f: # UTF-7 is a convenient, seldom used encoding diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py index 4a8813c4da1..ccbacc7c19b 100644 --- a/Lib/test/test_gzip.py +++ b/Lib/test/test_gzip.py @@ -1036,7 +1036,6 @@ def test_encoding_error_handler(self): as f: self.assertEqual(f.read(), "foobar") - @unittest.expectedFailure # TODO: RUSTPYTHON def test_newline(self): # Test with explicit newline (universal newline mode disabled). uncompressed = data1.decode("ascii") * 50 diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 5fd011360f0..ba54349f41d 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -445,9 +445,25 @@ def test_invalid_operations(self): self.assertRaises(exc, fp.seek, 1, self.SEEK_CUR) self.assertRaises(exc, fp.seek, -1, self.SEEK_END) - @unittest.skipIf( - support.is_emscripten, "fstat() of a pipe fd is not supported" - ) + @support.cpython_only + def test_startup_optimization(self): + # gh-132952: Test that `io` is not imported at startup and that the + # __module__ of UnsupportedOperation is set to "io". 
+ assert_python_ok("-S", "-c", textwrap.dedent( + """ + import sys + assert "io" not in sys.modules + try: + sys.stdin.truncate() + except Exception as e: + typ = type(e) + assert typ.__module__ == "io", (typ, typ.__module__) + assert typ.__name__ == "UnsupportedOperation", (typ, typ.__name__) + else: + raise AssertionError("Expected UnsupportedOperation") + """ + )) + @unittest.skipUnless(hasattr(os, "pipe"), "requires os.pipe()") def test_optional_abilities(self): # Test for OSError when optional APIs are not supported @@ -501,57 +517,65 @@ class UnseekableWriter(self.MockUnseekableIO): (text_reader, "r"), (text_writer, "w"), (self.BytesIO, "rws"), (self.StringIO, "rws"), ) - for [test, abilities] in tests: - with self.subTest(test), test() as obj: - readable = "r" in abilities - self.assertEqual(obj.readable(), readable) - writable = "w" in abilities - self.assertEqual(obj.writable(), writable) - - if isinstance(obj, self.TextIOBase): - data = "3" - elif isinstance(obj, (self.BufferedIOBase, self.RawIOBase)): - data = b"3" - else: - self.fail("Unknown base class") - if "f" in abilities: - obj.fileno() - else: - self.assertRaises(OSError, obj.fileno) + def do_test(test, obj, abilities): + readable = "r" in abilities + self.assertEqual(obj.readable(), readable) + writable = "w" in abilities + self.assertEqual(obj.writable(), writable) - if readable: - obj.read(1) - obj.read() - else: - self.assertRaises(OSError, obj.read, 1) - self.assertRaises(OSError, obj.read) + if isinstance(obj, self.TextIOBase): + data = "3" + elif isinstance(obj, (self.BufferedIOBase, self.RawIOBase)): + data = b"3" + else: + self.fail("Unknown base class") - if writable: - obj.write(data) - else: - self.assertRaises(OSError, obj.write, data) - - if sys.platform.startswith("win") and test in ( - pipe_reader, pipe_writer): - # Pipes seem to appear as seekable on Windows - continue - seekable = "s" in abilities - self.assertEqual(obj.seekable(), seekable) - - if seekable: - obj.tell() - 
obj.seek(0) - else: - self.assertRaises(OSError, obj.tell) - self.assertRaises(OSError, obj.seek, 0) + if "f" in abilities: + obj.fileno() + else: + self.assertRaises(OSError, obj.fileno) + + if readable: + obj.read(1) + obj.read() + else: + self.assertRaises(OSError, obj.read, 1) + self.assertRaises(OSError, obj.read) + + if writable: + obj.write(data) + else: + self.assertRaises(OSError, obj.write, data) + + if sys.platform.startswith("win") and test in ( + pipe_reader, pipe_writer): + # Pipes seem to appear as seekable on Windows + return + seekable = "s" in abilities + self.assertEqual(obj.seekable(), seekable) + + if seekable: + obj.tell() + obj.seek(0) + else: + self.assertRaises(OSError, obj.tell) + self.assertRaises(OSError, obj.seek, 0) + + if writable and seekable: + obj.truncate() + obj.truncate(0) + else: + self.assertRaises(OSError, obj.truncate) + self.assertRaises(OSError, obj.truncate, 0) + + for [test, abilities] in tests: + with self.subTest(test): + if test == pipe_writer and not threading_helper.can_start_thread: + self.skipTest("Need threads") + with test() as obj: + do_test(test, obj, abilities) - if writable and seekable: - obj.truncate() - obj.truncate(0) - else: - self.assertRaises(OSError, obj.truncate) - self.assertRaises(OSError, obj.truncate, 0) def test_open_handles_NUL_chars(self): fn_with_NUL = 'foo\0bar' @@ -781,7 +805,7 @@ def test_closefd_attr(self): self.assertEqual(file.buffer.raw.closefd, False) @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: filter ('', ResourceWarning) did not catch any warning - @unittest.skipIf(sys.platform == 'win32', 'TODO: RUSTPYTHON; cyclic GC not supported, causes file locking') + @unittest.skipIf(sys.platform == "win32", "TODO: RUSTPYTHON; cyclic GC not supported, causes file locking") def test_garbage_collection(self): # FileIO objects are collected, and collecting them flushes # all data to disk. 
@@ -896,7 +920,7 @@ def test_types_have_dict(self): self.BytesIO() ) for obj in test: - self.assertTrue(hasattr(obj, "__dict__")) + self.assertHasAttr(obj, "__dict__") def test_opener(self): with self.open(os_helper.TESTFN, "w", encoding="utf-8") as f: @@ -1090,7 +1114,7 @@ def reader(file, barrier): class CIOTest(IOTest): - @unittest.expectedFailure # TODO: RUSTPYTHON; cyclic gc + @unittest.expectedFailure # TODO: RUSTPYTHON; cyclic gc def test_IOBase_finalize(self): # Issue #12149: segmentation fault on _PyIOBase_finalize when both a # class which inherits IOBase and an object of this class are caught @@ -1109,10 +1133,6 @@ def close(self): support.gc_collect() self.assertIsNone(wr(), wr) - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: filter ('', ResourceWarning) did not catch any warning - def test_destructor(self): - return super().test_destructor() - @support.cpython_only class TestIOCTypes(unittest.TestCase): def setUp(self): @@ -1147,7 +1167,7 @@ def test_class_hierarchy(self): def check_subs(types, base): for tp in types: with self.subTest(tp=tp, base=base): - self.assertTrue(issubclass(tp, base)) + self.assertIsSubclass(tp, base) def recursive_check(d): for k, v in d.items(): @@ -1804,7 +1824,7 @@ def test_misbehaved_io_read(self): self.assertRaises(OSError, bufio.read, 10) @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: filter ('', ResourceWarning) did not catch any warning - @unittest.skipIf(sys.platform == 'win32', 'TODO: RUSTPYTHON; cyclic GC not supported, causes file locking') + @unittest.skipIf(sys.platform == "win32", "TODO: RUSTPYTHON; cyclic GC not supported, causes file locking") def test_garbage_collection(self): # C BufferedReader objects are collected. 
# The Python version has __del__, so it ends into gc.garbage instead @@ -1839,14 +1859,6 @@ def test_bad_readinto_type(self): bufio.readline() self.assertIsInstance(cm.exception.__cause__, TypeError) - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_pickling_subclass(self): - return super().test_pickling_subclass() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'NoneType' object has no attribute 'exc_type' - def test_error_through_destructor(self): - return super().test_error_through_destructor() - class PyBufferedReaderTest(BufferedReaderTest): tp = pyio.BufferedReader @@ -1910,7 +1922,7 @@ def test_write_overflow(self): flushed = b"".join(writer._write_stack) # At least (total - 8) bytes were implicitly flushed, perhaps more # depending on the implementation. - self.assertTrue(flushed.startswith(contents[:-8]), flushed) + self.assertStartsWith(flushed, contents[:-8]) def check_writes(self, intermediate_func): # Lots of writes, test the flushed output is as expected. @@ -1980,7 +1992,7 @@ def test_write_non_blocking(self): self.assertEqual(bufio.write(b"ABCDEFGHI"), 9) s = raw.pop_written() # Previously buffered bytes were flushed - self.assertTrue(s.startswith(b"01234567A"), s) + self.assertStartsWith(s, b"01234567A") def test_write_and_rewind(self): raw = self.BytesIO() @@ -2162,7 +2174,7 @@ def test_initialization(self): self.assertRaises(ValueError, bufio.write, b"def") @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: filter ('', ResourceWarning) did not catch any warning - @unittest.skipIf(sys.platform == 'win32', 'TODO: RUSTPYTHON; cyclic GC not supported, causes file locking') + @unittest.skipIf(sys.platform == "win32", "TODO: RUSTPYTHON; cyclic GC not supported, causes file locking") def test_garbage_collection(self): # C BufferedWriter objects are collected, and collecting them flushes # all data to disk. 
@@ -2185,13 +2197,6 @@ def test_args_error(self): with self.assertRaisesRegex(TypeError, "BufferedWriter"): self.tp(self.BytesIO(), 1024, 1024, 1024) - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_pickling_subclass(self): - return super().test_pickling_subclass() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'NoneType' object has no attribute 'exc_type' - def test_error_through_destructor(self): - return super().test_error_through_destructor() class PyBufferedWriterTest(BufferedWriterTest): tp = pyio.BufferedWriter @@ -2285,7 +2290,7 @@ def test_write(self): def test_peek(self): pair = self.tp(self.BytesIO(b"abcdef"), self.MockRawIO()) - self.assertTrue(pair.peek(3).startswith(b"abc")) + self.assertStartsWith(pair.peek(3), b"abc") self.assertEqual(pair.read(3), b"abc") def test_readable(self): @@ -2670,7 +2675,7 @@ class CBufferedRandomTest(BufferedRandomTest, SizeofTest): tp = io.BufferedRandom @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: filter ('', ResourceWarning) did not catch any warning - @unittest.skipIf(sys.platform == 'win32', 'TODO: RUSTPYTHON; cyclic GC not supported, causes file locking') + @unittest.skipIf(sys.platform == "win32", "TODO: RUSTPYTHON; cyclic GC not supported, causes file locking") def test_garbage_collection(self): CBufferedReaderTest.test_garbage_collection(self) CBufferedWriterTest.test_garbage_collection(self) @@ -2680,14 +2685,6 @@ def test_args_error(self): with self.assertRaisesRegex(TypeError, "BufferedRandom"): self.tp(self.BytesIO(), 1024, 1024, 1024) - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_pickling_subclass(self): - return super().test_pickling_subclass() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'NoneType' object has no attribute 'exc_type' - def test_error_through_destructor(self): - return super().test_error_through_destructor() - class PyBufferedRandomTest(BufferedRandomTest): tp = pyio.BufferedRandom @@ -2847,7 +2844,6 @@ def 
setUp(self): def tearDown(self): os_helper.unlink(os_helper.TESTFN) - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: UnicodeEncodeError not raised def test_constructor(self): r = self.BytesIO(b"\xc3\xa9\n\n") b = self.BufferedReader(r, 1000) @@ -2998,14 +2994,11 @@ def test_reconfigure_line_buffering(self): @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled") def test_default_encoding(self): - old_environ = dict(os.environ) - try: + with os_helper.EnvironmentVarGuard() as env: # try to get a user preferred encoding different than the current # locale encoding to check that TextIOWrapper() uses the current # locale encoding and not the user preferred encoding - for key in ('LC_ALL', 'LANG', 'LC_CTYPE'): - if key in os.environ: - del os.environ[key] + env.unset('LC_ALL', 'LANG', 'LC_CTYPE') current_locale_encoding = locale.getencoding() b = self.BytesIO() @@ -3013,9 +3006,6 @@ def test_default_encoding(self): warnings.simplefilter("ignore", EncodingWarning) t = self.TextIOWrapper(b) self.assertEqual(t.encoding, current_locale_encoding) - finally: - os.environ.clear() - os.environ.update(old_environ) def test_encoding(self): # Check the encoding attribute is always set, and valid @@ -3070,7 +3060,6 @@ def test_encoding_errors_writing(self): t.flush() self.assertEqual(b.getvalue(), b"abc?def\n") - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_newlines(self): input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ] @@ -3389,7 +3378,6 @@ def test_seek_with_encoder_state(self): self.assertEqual(f.readline(), "\u00e6\u0300\u0300") f.close() - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? 
def test_encoded_writes(self): data = "1234567890" tests = ("utf-16", @@ -3825,7 +3813,7 @@ def __del__(self): """.format(iomod=iomod, kwargs=kwargs) return assert_python_ok("-c", code) - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 'LookupError: unknown encoding: ascii' not found in "Exception ignored in: \nAttributeError: 'NoneType' object has no attribute 'TextIOWrapper'\n" + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError during module teardown in __del__ def test_create_at_shutdown_without_encoding(self): rc, out, err = self._check_create_at_shutdown() if err: @@ -3835,7 +3823,7 @@ def test_create_at_shutdown_without_encoding(self): else: self.assertEqual("ok", out.decode().strip()) - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: b"Exception ignored in: \nAttributeError: 'NoneType' object has no attribute 'TextIOWrapper'\n" is not false + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError during module teardown in __del__ def test_create_at_shutdown_with_encoding(self): rc, out, err = self._check_create_at_shutdown(encoding='utf-8', errors='strict') @@ -4083,6 +4071,24 @@ def __setstate__(slf, state): self.assertEqual(newtxt.tag, 'ham') del MyTextIO + # TODO: RUSTPYTHON; TypeError: a bytes-like object is required, not 'NoneType' + @unittest.expectedFailure + @unittest.skipUnless(hasattr(os, "pipe"), "requires os.pipe()") + def test_read_non_blocking(self): + import os + r, w = os.pipe() + try: + os.set_blocking(r, False) + with self.io.open(r, 'rt') as textfile: + r = None + # Nothing has been written so a non-blocking read raises a BlockingIOError exception. 
+ with self.assertRaises(BlockingIOError): + textfile.read() + finally: + if r is not None: + os.close(r) + os.close(w) + class MemviewBytesIO(io.BytesIO): '''A BytesIO object whose read method returns memoryviews @@ -4107,7 +4113,6 @@ class CTextIOWrapperTest(TextIOWrapperTest): io = io shutdown_error = "LookupError: unknown encoding: ascii" - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: ValueError not raised by read def test_initialization(self): r = self.BytesIO(b"\xc3\xa9\n\n") b = self.BufferedReader(r, 1000) @@ -4119,7 +4124,7 @@ def test_initialization(self): self.assertRaises(Exception, repr, t) @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: filter ('', ResourceWarning) did not catch any warning - @unittest.skipIf(sys.platform == 'win32', 'TODO: RUSTPYTHON; cyclic GC not supported, causes file locking') + @unittest.skipIf(sys.platform == "win32", "TODO: RUSTPYTHON; cyclic GC not supported, causes file locking") def test_garbage_collection(self): # C TextIOWrapper objects are collected, and collecting them flushes # all data to disk. 
@@ -4183,7 +4188,6 @@ def write(self, data): t.write("x"*chunk_size) self.assertEqual([b"abcdef", b"ghi", b"x"*chunk_size], buf._write_stack) - @unittest.expectedFailure # TODO: RUSTPYTHON; RuntimeError: reentrant call inside textio def test_issue119506(self): chunk_size = 8192 @@ -4206,86 +4210,33 @@ def write(self, data): self.assertEqual([b"abcdef", b"middle", b"g"*chunk_size], buf._write_stack) - # TODO: RUSTPYTHON; euc_jis_2004 encoding not supported - @unittest.expectedFailure - def test_seek_with_encoder_state(self): - return super().test_seek_with_encoder_state() - - @unittest.expectedFailure # TODO: RUSTPYTHON - def test_pickling_subclass(self): - return super().test_pickling_subclass() - - @unittest.expectedFailure # TODO: RUSTPYTHON; + - def test_reconfigure_newline(self): - return super().test_reconfigure_newline() - - @unittest.expectedFailure # TODO: RUSTPYTHON; + ['AAA\nBB\x00B\nCCC\r', 'DDD\r', 'EEE\r', '\nFFF\r', '\nGGG'] - def test_newlines_input(self): - return super().test_newlines_input() - - @unittest.expectedFailure # TODO: RUSTPYTHON; + strict - def test_reconfigure_defaults(self): - return super().test_reconfigure_defaults() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: LookupError not raised - def test_non_text_encoding_codecs_are_rejected(self): - return super().test_non_text_encoding_codecs_are_rejected() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: Regex didn't match: "<(_io\\.)?TextIOWrapper name='dummy' mode='r' encoding='utf-8'>" not found in "<_io.TextIOWrapper name='dummy' encoding='utf-8'>" - def test_repr(self): - return super().test_repr() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: RuntimeError not raised - def test_recursive_repr(self): - return super().test_recursive_repr() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: UnicodeEncodeError not raised - def test_reconfigure_errors(self): - return super().test_reconfigure_errors() - - 
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: UnsupportedOperation not raised - def test_reconfigure_encoding_read(self): - return super().test_reconfigure_encoding_read() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: b'' != b'1' - def test_reconfigure_write_through(self): - return super().test_reconfigure_write_through() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: b'' != b'AB\nC' - def test_reconfigure_line_buffering(self): - return super().test_reconfigure_line_buffering() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: b'' != b'abc\xe9\n' - def test_reconfigure_write(self): - return super().test_reconfigure_write() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: b'\xef\xbb\xbfaaa\xef\xbb\xbfxxx' != b'\xef\xbb\xbfaaaxxx' - def test_append_bom(self): - return super().test_append_bom() - - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: b'foo\n\xef\xbb\xbf\xc3\xa9\n' != b'foo\n\xc3\xa9\n' - def test_reconfigure_write_fromascii(self): - return super().test_reconfigure_write_fromascii() + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'NoneType' object has no attribute 'closed' + def test_issue142594(self): + wrapper = None + detached = False + class ReentrantRawIO(self.RawIOBase): + @property + def closed(self): + nonlocal detached + if wrapper is not None and not detached: + detached = True + wrapper.detach() + return False - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'NoneType' object has no attribute 'exc_type' - def test_error_through_destructor(self): - return super().test_error_through_destructor() + raw = ReentrantRawIO() + wrapper = self.TextIOWrapper(raw) + wrapper.close() # should not crash - @unittest.expectedFailure # TODO: RUSTPYTHON; LookupError: unknown encoding: locale - def test_reconfigure_locale(self): - return super().test_reconfigure_locale() + @unittest.expectedFailure # TODO: RUSTPYTHON; LookupError: 
unknown encoding: euc_jis_2004 + def test_seek_with_encoder_state(self): + return super().test_seek_with_encoder_state() class PyTextIOWrapperTest(TextIOWrapperTest): io = pyio shutdown_error = "LookupError: unknown encoding: ascii" - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: ValueError not raised - def test_constructor(self): - return super().test_constructor() - - # TODO: RUSTPYTHON; euc_jis_2004 encoding not supported - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; LookupError: unknown encoding: euc_jis_2004 def test_seek_with_encoder_state(self): return super().test_seek_with_encoder_state() @@ -4367,7 +4318,6 @@ def _decode_bytewise(s): self.assertEqual(decoder.decode(input), "abc") self.assertEqual(decoder.newlines, None) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'codecs' has no attribute 'utf_32_ex_decode'. Did you mean: 'utf_16_ex_decode'? def test_newline_decoder(self): encodings = ( # None meaning the IncrementalNewlineDecoder takes unicode input @@ -4465,9 +4415,6 @@ def test_removed_u_mode(self): self.open(os_helper.TESTFN, mode) self.assertIn('invalid mode', str(cm.exception)) - @unittest.skipIf( - support.is_emscripten, "fstat() of a pipe fd is not supported" - ) @unittest.skipUnless(hasattr(os, "pipe"), "requires os.pipe()") def test_open_pipe_with_append(self): # bpo-27805: Ignore ESPIPE from lseek() in open(). 
@@ -4529,7 +4476,7 @@ def test_io_after_close(self): self.assertRaises(ValueError, f.writelines, []) self.assertRaises(ValueError, next, f) - @unittest.expectedFailure # TODO: RUSTPYTHON; cyclic gc + @unittest.expectedFailure # TODO: RUSTPYTHON; cyclic gc def test_blockingioerror(self): # Various BlockingIOError issues class C(str): @@ -4637,15 +4584,11 @@ def test_pickling(self): with self.assertRaisesRegex(TypeError, msg): pickle.dumps(f, protocol) - @unittest.skipIf( - support.is_emscripten, "fstat() of a pipe fd is not supported" - ) + @unittest.skipIf(support.is_emscripten, "Emscripten corrupts memory when writing to nonblocking fd") def test_nonblock_pipe_write_bigbuf(self): self._test_nonblock_pipe_write(16*1024) - @unittest.skipIf( - support.is_emscripten, "fstat() of a pipe fd is not supported" - ) + @unittest.skipIf(support.is_emscripten, "Emscripten corrupts memory when writing to nonblocking fd") def test_nonblock_pipe_write_smallbuf(self): self._test_nonblock_pipe_write(1024) @@ -4764,7 +4707,6 @@ def test_check_encoding_errors(self): proc = assert_python_failure('-X', 'dev', '-c', code) self.assertEqual(proc.rc, 10, proc) - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 0 != 2 def test_check_encoding_warning(self): # PEP 597: Raise warning when encoding is not specified # and sys.flags.warn_default_encoding is set. @@ -4783,12 +4725,9 @@ def test_check_encoding_warning(self): proc = assert_python_ok('-X', 'warn_default_encoding', '-c', code) warnings = proc.err.splitlines() self.assertEqual(len(warnings), 2) - self.assertTrue( - warnings[0].startswith(b":5: EncodingWarning: ")) - self.assertTrue( - warnings[1].startswith(b":8: EncodingWarning: ")) + self.assertStartsWith(warnings[0], b":5: EncodingWarning: ") + self.assertStartsWith(warnings[1], b":8: EncodingWarning: ") - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: b'locale' != b'utf-8' def test_text_encoding(self): # PEP 597, bpo-47000. 
io.text_encoding() returns "locale" or "utf-8" # based on sys.flags.utf8_mode @@ -4868,20 +4807,6 @@ def test_daemon_threads_shutdown_stdout_deadlock(self): def test_daemon_threads_shutdown_stderr_deadlock(self): self.check_daemon_threads_shutdown_deadlock('stderr') - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 22 != 10 : _PythonRunResult(rc=22, out=b'', err=b'') - def test_check_encoding_errors(self): - return super().test_check_encoding_errors() - - # TODO: RUSTPYTHON; ResourceWarning not triggered by _io.FileIO - @unittest.expectedFailure - def test_warn_on_dealloc(self): - return super().test_warn_on_dealloc() - - # TODO: RUSTPYTHON; ResourceWarning not triggered by _io.FileIO - @unittest.expectedFailure - def test_warn_on_dealloc_fd(self): - return super().test_warn_on_dealloc_fd() - class PyMiscIOTest(MiscIOTest): io = pyio @@ -5016,7 +4941,7 @@ def on_alarm(*args): os.read(r, len(data) * 100) exc = cm.exception if isinstance(exc, RuntimeError): - self.assertTrue(str(exc).startswith("reentrant call"), str(exc)) + self.assertStartsWith(str(exc), "reentrant call") finally: signal.alarm(0) wio.close() @@ -5134,13 +5059,13 @@ def alarm2(sig, frame): if e.errno != errno.EBADF: raise - @unittest.skip("TODO: RUSTPYTHON thread 'main' (103833) panicked at crates/vm/src/stdlib/signal.rs:233:43: RefCell already borrowed") + @unittest.skip("TODO: RUSTPYTHON; thread 'main' (103833) panicked at crates/vm/src/stdlib/signal.rs:233:43: RefCell already borrowed") @requires_alarm @support.requires_resource('walltime') def test_interrupted_write_retry_buffered(self): self.check_interrupted_write_retry(b"x", mode="wb") - @unittest.skip("TODO: RUSTPYTHON thread 'main' (103833) panicked at crates/vm/src/stdlib/signal.rs:233:43: RefCell already borrowed") + @unittest.skip("TODO: RUSTPYTHON; thread 'main' (103833) panicked at crates/vm/src/stdlib/signal.rs:233:43: RefCell already borrowed") @requires_alarm @support.requires_resource('walltime') def 
test_interrupted_write_retry_text(self): @@ -5150,9 +5075,9 @@ def test_interrupted_write_retry_text(self): class CSignalsTest(SignalsTest): io = io - @unittest.skip("TODO: RUSTPYTHON thread 'main' (103833) panicked at crates/vm/src/stdlib/signal.rs:233:43: RefCell already borrowed") - def test_interrupted_read_retry_buffered(self): # TODO: RUSTPYTHON - return super().test_interrupted_read_retry_buffered() # TODO: RUSTPYTHON + @unittest.skip("TODO: RUSTPYTHON; thread 'main' (103833) panicked at crates/vm/src/stdlib/signal.rs:233:43: RefCell already borrowed") + def test_interrupted_read_retry_buffered(self): + return super().test_interrupted_read_retry_buffered() class PySignalsTest(SignalsTest): io = pyio @@ -5163,6 +5088,26 @@ class PySignalsTest(SignalsTest): test_reentrant_write_text = None +class ProtocolsTest(unittest.TestCase): + class MyReader: + def read(self, sz=-1): + return b"" + + class MyWriter: + def write(self, b: bytes): + pass + + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'io' has no attribute 'Reader' + def test_reader_subclass(self): + self.assertIsSubclass(self.MyReader, io.Reader) + self.assertNotIsSubclass(str, io.Reader) + + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'io' has no attribute 'Writer' + def test_writer_subclass(self): + self.assertIsSubclass(self.MyWriter, io.Writer) + self.assertNotIsSubclass(str, io.Writer) + + def load_tests(loader, tests, pattern): tests = (CIOTest, PyIOTest, APIMismatchTest, CBufferedReaderTest, PyBufferedReaderTest, @@ -5174,6 +5119,7 @@ def load_tests(loader, tests, pattern): CTextIOWrapperTest, PyTextIOWrapperTest, CMiscIOTest, PyMiscIOTest, CSignalsTest, PySignalsTest, TestIOCTypes, + ProtocolsTest, ) # Put the namespaces of the IO module we are testing and some useful mock diff --git a/Lib/test/test_logging.py b/Lib/test/test_logging.py index 12b61e76423..6c0cb49f78b 100644 --- a/Lib/test/test_logging.py +++ b/Lib/test/test_logging.py @@ -5165,7 
+5165,7 @@ def __init__(self, name='MyLogger', level=logging.NOTSET): h.close() logging.setLoggerClass(logging.Logger) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError during module teardown in __del__ def test_logging_at_shutdown(self): # bpo-20037: Doing text I/O late at interpreter shutdown must not crash code = textwrap.dedent(""" @@ -5185,7 +5185,7 @@ def __del__(self): self.assertIn("exception in __del__", err) self.assertIn("ValueError: some error", err) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError during module teardown in __del__ def test_logging_at_shutdown_open(self): # bpo-26789: FileHandler keeps a reference to the builtin open() # function to be able to open or reopen the file during Python diff --git a/Lib/test/test_lzma.py b/Lib/test/test_lzma.py index 1bfc9551ce3..334cb22265f 100644 --- a/Lib/test/test_lzma.py +++ b/Lib/test/test_lzma.py @@ -22,7 +22,7 @@ class CompressorDecompressorTestCase(unittest.TestCase): # Test error cases. 
- @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Invalid format def test_simple_bad_args(self): self.assertRaises(TypeError, LZMACompressor, []) self.assertRaises(TypeError, LZMACompressor, format=3.45) @@ -63,7 +63,7 @@ def test_simple_bad_args(self): lzd.decompress(empty) self.assertRaises(EOFError, lzd.decompress, b"quux") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Failed to initialize encoder def test_bad_filter_spec(self): self.assertRaises(TypeError, LZMACompressor, filters=[b"wobsite"]) self.assertRaises(ValueError, LZMACompressor, filters=[{"xyzzy": 3}]) @@ -80,7 +80,7 @@ def test_decompressor_after_eof(self): lzd.decompress(COMPRESSED_XZ) self.assertRaises(EOFError, lzd.decompress, b"nyan") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Unexpected keyword argument memlimit def test_decompressor_memlimit(self): lzd = LZMADecompressor(memlimit=1024) self.assertRaises(LZMAError, lzd.decompress, COMPRESSED_XZ) @@ -101,7 +101,7 @@ def _test_decompressor(self, lzd, data, check, unused_data=b""): self.assertTrue(lzd.eof) self.assertEqual(lzd.unused_data, unused_data) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_decompressor_auto(self): lzd = LZMADecompressor() self._test_decompressor(lzd, COMPRESSED_XZ, lzma.CHECK_CRC64) @@ -109,37 +109,37 @@ def test_decompressor_auto(self): lzd = LZMADecompressor() self._test_decompressor(lzd, COMPRESSED_ALONE, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_decompressor_xz(self): lzd = LZMADecompressor(lzma.FORMAT_XZ) self._test_decompressor(lzd, COMPRESSED_XZ, lzma.CHECK_CRC64) - 
@unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_decompressor_alone(self): lzd = LZMADecompressor(lzma.FORMAT_ALONE) self._test_decompressor(lzd, COMPRESSED_ALONE, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. def test_decompressor_raw_1(self): lzd = LZMADecompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_1) self._test_decompressor(lzd, COMPRESSED_RAW_1, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. def test_decompressor_raw_2(self): lzd = LZMADecompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_2) self._test_decompressor(lzd, COMPRESSED_RAW_2, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. def test_decompressor_raw_3(self): lzd = LZMADecompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_3) self._test_decompressor(lzd, COMPRESSED_RAW_3, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. 
def test_decompressor_raw_4(self): lzd = LZMADecompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_4) self._test_decompressor(lzd, COMPRESSED_RAW_4, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_decompressor_chunks(self): lzd = LZMADecompressor() out = [] @@ -152,7 +152,7 @@ def test_decompressor_chunks(self): self.assertTrue(lzd.eof) self.assertEqual(lzd.unused_data, b"") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; EOFError: End of stream already reached def test_decompressor_chunks_empty(self): lzd = LZMADecompressor() out = [] @@ -168,7 +168,7 @@ def test_decompressor_chunks_empty(self): self.assertTrue(lzd.eof) self.assertEqual(lzd.unused_data, b"") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_decompressor_chunks_maxsize(self): lzd = LZMADecompressor() max_length = 100 @@ -260,14 +260,14 @@ def test_decompressor_inputbuf_3(self): out.append(lzd.decompress(COMPRESSED_XZ[300:])) self.assertEqual(b''.join(out), INPUT) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_decompressor_unused_data(self): lzd = LZMADecompressor() extra = b"fooblibar" self._test_decompressor(lzd, COMPRESSED_XZ + extra, lzma.CHECK_CRC64, unused_data=extra) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: stream/file format not recognized def test_decompressor_bad_input(self): lzd = LZMADecompressor() self.assertRaises(LZMAError, lzd.decompress, COMPRESSED_RAW_1) @@ -281,7 +281,7 @@ def test_decompressor_bad_input(self): lzd = LZMADecompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_1) 
self.assertRaises(LZMAError, lzd.decompress, COMPRESSED_XZ) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: stream/file format not recognized def test_decompressor_bug_28275(self): # Test coverage for Issue 28275 lzd = LZMADecompressor() @@ -291,28 +291,28 @@ def test_decompressor_bug_28275(self): # Test that LZMACompressor->LZMADecompressor preserves the input data. - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_roundtrip_xz(self): lzc = LZMACompressor() cdata = lzc.compress(INPUT) + lzc.flush() lzd = LZMADecompressor() self._test_decompressor(lzd, cdata, lzma.CHECK_CRC64) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_roundtrip_alone(self): lzc = LZMACompressor(lzma.FORMAT_ALONE) cdata = lzc.compress(INPUT) + lzc.flush() lzd = LZMADecompressor() self._test_decompressor(lzd, cdata, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Invalid format def test_roundtrip_raw(self): lzc = LZMACompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_4) cdata = lzc.compress(INPUT) + lzc.flush() lzd = LZMADecompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_4) self._test_decompressor(lzd, cdata, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Invalid format def test_roundtrip_raw_empty(self): lzc = LZMACompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_4) cdata = lzc.compress(INPUT) @@ -323,7 +323,7 @@ def test_roundtrip_raw_empty(self): lzd = LZMADecompressor(lzma.FORMAT_RAW, filters=FILTERS_RAW_4) self._test_decompressor(lzd, cdata, lzma.CHECK_NONE) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: 
RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_roundtrip_chunks(self): lzc = LZMACompressor() cdata = [] @@ -334,7 +334,7 @@ def test_roundtrip_chunks(self): lzd = LZMADecompressor() self._test_decompressor(lzd, cdata, lzma.CHECK_CRC64) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_roundtrip_empty_chunks(self): lzc = LZMACompressor() cdata = [] @@ -350,7 +350,7 @@ def test_roundtrip_empty_chunks(self): # LZMADecompressor intentionally does not handle concatenated streams. - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'LZMADecompressor' object has no attribute 'check' def test_decompressor_multistream(self): lzd = LZMADecompressor() self._test_decompressor(lzd, COMPRESSED_XZ + COMPRESSED_ALONE, @@ -411,7 +411,7 @@ class CompressDecompressFunctionTestCase(unittest.TestCase): # Test error cases: - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Failed to initialize encoder def test_bad_args(self): self.assertRaises(TypeError, lzma.compress) self.assertRaises(TypeError, lzma.compress, []) @@ -441,7 +441,7 @@ def test_bad_args(self): lzma.decompress( b"", format=lzma.FORMAT_ALONE, filters=FILTERS_RAW_1) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: memory limit reached def test_decompress_memlimit(self): with self.assertRaises(LZMAError): lzma.decompress(COMPRESSED_XZ, memlimit=1024) @@ -454,7 +454,7 @@ def test_decompress_memlimit(self): # Test LZMADecompressor on known-good input data. - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. 
def test_decompress_good_input(self): ddata = lzma.decompress(COMPRESSED_XZ) self.assertEqual(ddata, INPUT) @@ -484,7 +484,7 @@ def test_decompress_good_input(self): COMPRESSED_RAW_4, lzma.FORMAT_RAW, filters=FILTERS_RAW_4) self.assertEqual(ddata, INPUT) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. def test_decompress_incomplete_input(self): self.assertRaises(LZMAError, lzma.decompress, COMPRESSED_XZ[:128]) self.assertRaises(LZMAError, lzma.decompress, COMPRESSED_ALONE[:128]) @@ -497,7 +497,7 @@ def test_decompress_incomplete_input(self): self.assertRaises(LZMAError, lzma.decompress, COMPRESSED_RAW_4[:128], format=lzma.FORMAT_RAW, filters=FILTERS_RAW_4) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: stream/file format not recognized def test_decompress_bad_input(self): with self.assertRaises(LZMAError): lzma.decompress(COMPRESSED_BOGUS) @@ -513,7 +513,7 @@ def test_decompress_bad_input(self): # Test that compress()->decompress() preserves the input data. - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Invalid format def test_roundtrip(self): cdata = lzma.compress(INPUT) ddata = lzma.decompress(cdata) @@ -539,12 +539,12 @@ def test_decompress_multistream(self): # Test robust handling of non-LZMA data following the compressed stream(s). 
- @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: stream/file format not recognized def test_decompress_trailing_junk(self): ddata = lzma.decompress(COMPRESSED_XZ + COMPRESSED_BOGUS) self.assertEqual(ddata, INPUT) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: stream/file format not recognized def test_decompress_multistream_trailing_junk(self): ddata = lzma.decompress(COMPRESSED_XZ * 3 + COMPRESSED_BOGUS) self.assertEqual(ddata, INPUT * 3) @@ -581,7 +581,7 @@ def test_init(self): self.assertIsInstance(f, LZMAFile) self.assertEqual(f.mode, "wb") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: != '@test_23396_tmp챈' def test_init_with_PathLike_filename(self): filename = FakePath(TESTFN) with TempFile(filename, COMPRESSED_XZ): @@ -662,7 +662,7 @@ def test_init_bad_mode(self): with self.assertRaises(ValueError): LZMAFile(BytesIO(COMPRESSED_XZ), "rw") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Invalid check value def test_init_bad_check(self): with self.assertRaises(TypeError): LZMAFile(BytesIO(), "w", check=b"asd") @@ -683,7 +683,7 @@ def test_init_bad_check(self): with self.assertRaises(ValueError): LZMAFile(BytesIO(COMPRESSED_XZ), check=lzma.CHECK_UNKNOWN) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OverflowError: Python int too large to convert to Rust u32 def test_init_bad_preset(self): with self.assertRaises(TypeError): LZMAFile(BytesIO(), "w", preset=4.39) @@ -703,7 +703,7 @@ def test_init_bad_preset(self): with self.assertRaises(ValueError): LZMAFile(BytesIO(COMPRESSED_XZ), preset=3) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Failed to initialize encoder def test_init_bad_filter_spec(self): with 
self.assertRaises(TypeError): LZMAFile(BytesIO(), "w", filters=[b"wobsite"]) @@ -721,7 +721,7 @@ def test_init_bad_filter_spec(self): LZMAFile(BytesIO(), "w", filters=[{"id": lzma.FILTER_X86, "foo": 0}]) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Invalid format def test_init_with_preset_and_filters(self): with self.assertRaises(ValueError): LZMAFile(BytesIO(), "w", format=lzma.FORMAT_RAW, @@ -840,7 +840,7 @@ def test_writable(self): f.close() self.assertRaises(ValueError, f.writable) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. def test_read(self): with LZMAFile(BytesIO(COMPRESSED_XZ)) as f: self.assertEqual(f.read(), INPUT) @@ -888,7 +888,7 @@ def test_read_10(self): chunks.append(result) self.assertEqual(b"".join(chunks), INPUT) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. 
def test_read_multistream(self): with LZMAFile(BytesIO(COMPRESSED_XZ * 5)) as f: self.assertEqual(f.read(), INPUT * 5) @@ -909,12 +909,12 @@ def test_read_multistream_buffer_size_aligned(self): finally: _streams.BUFFER_SIZE = saved_buffer_size - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: stream/file format not recognized def test_read_trailing_junk(self): with LZMAFile(BytesIO(COMPRESSED_XZ + COMPRESSED_BOGUS)) as f: self.assertEqual(f.read(), INPUT) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: stream/file format not recognized def test_read_multistream_trailing_junk(self): with LZMAFile(BytesIO(COMPRESSED_XZ * 5 + COMPRESSED_BOGUS)) as f: self.assertEqual(f.read(), INPUT * 5) @@ -1020,7 +1020,7 @@ def test_read_bad_args(self): with LZMAFile(BytesIO(COMPRESSED_XZ)) as f: self.assertRaises(TypeError, f.read, float()) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; OSError: stream/file format not recognized def test_read_bad_data(self): with LZMAFile(BytesIO(COMPRESSED_BOGUS)) as f: self.assertRaises(LZMAError, f.read) @@ -1078,7 +1078,7 @@ def test_peek_bad_args(self): with LZMAFile(BytesIO(), "w") as f: self.assertRaises(ValueError, f.peek) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. 
def test_iterator(self): with BytesIO(INPUT) as f: lines = f.readlines() @@ -1118,7 +1118,7 @@ def test_decompress_limited(self): self.assertLessEqual(decomp._buffer.raw.tell(), max_decomp, "Excessive amount of data was decompressed") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; lzma.LZMAError: Invalid format def test_write(self): with BytesIO() as dst: with LZMAFile(dst, "w") as f: @@ -1387,7 +1387,7 @@ def test_tell_bad_args(self): f.close() self.assertRaises(ValueError, f.tell) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: True is not false def test_issue21872(self): # sometimes decompress data incompletely @@ -1471,7 +1471,7 @@ def test_filename(self): with lzma.open(TESTFN, "rb") as f: self.assertEqual(f.read(), INPUT * 2) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: != '@test_23396_tmp챈' def test_with_pathlike_filename(self): filename = FakePath(TESTFN) with TempFile(filename): @@ -1498,7 +1498,7 @@ def test_bad_params(self): with self.assertRaises(ValueError): lzma.open(TESTFN, "rb", newline="\n") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: Expected type 'int' but 'list' found. def test_format_and_filters(self): # Test non-default format and filter chain. options = {"format": lzma.FORMAT_RAW, "filters": FILTERS_RAW_1} @@ -1529,7 +1529,6 @@ def test_encoding_error_handler(self): with lzma.open(bio, "rt", encoding="ascii", errors="ignore") as f: self.assertEqual(f.read(), "foobar") - @unittest.expectedFailure # TODO: RUSTPYTHON def test_newline(self): # Test with explicit newline (universal newline mode disabled). 
text = INPUT.decode("ascii") @@ -1554,7 +1553,7 @@ def test_x_mode(self): class MiscellaneousTestCase(unittest.TestCase): - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'lzma' has no attribute 'CHECK_ID_MAX' def test_is_check_supported(self): # CHECK_NONE and CHECK_CRC32 should always be supported, # regardless of the options liblzma was compiled with. @@ -1567,7 +1566,7 @@ def test_is_check_supported(self): # This value should not be a valid check ID. self.assertFalse(lzma.is_check_supported(lzma.CHECK_UNKNOWN)) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: expected at most 0 arguments, got 1 def test__encode_filter_properties(self): with self.assertRaises(TypeError): lzma._encode_filter_properties(b"not a dict") @@ -1589,7 +1588,7 @@ def test__encode_filter_properties(self): }) self.assertEqual(props, b"]\x00\x00\x80\x00") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: LZMAError not raised def test__decode_filter_properties(self): with self.assertRaises(TypeError): lzma._decode_filter_properties(lzma.FILTER_X86, {"should be": bytes}) @@ -1613,7 +1612,7 @@ def test__decode_filter_properties(self): filterspec = lzma._decode_filter_properties(f, b"") self.assertEqual(filterspec, {"id": f}) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: expected at most 0 arguments, got 1 def test_filter_properties_roundtrip(self): spec1 = lzma._decode_filter_properties( lzma.FILTER_LZMA1, b"]\x00\x00\x80\x00") diff --git a/Lib/test/test_plistlib.py b/Lib/test/test_plistlib.py index cad53c17837..389da145e6d 100644 --- a/Lib/test/test_plistlib.py +++ b/Lib/test/test_plistlib.py @@ -752,7 +752,6 @@ def test_non_bmp_characters(self): data = plistlib.dumps(pl, fmt=fmt) self.assertEqual(plistlib.loads(data), pl) - @unittest.expectedFailure 
# TODO: RUSTPYTHON def test_lone_surrogates(self): for fmt in ALL_FORMATS: with self.subTest(fmt=fmt): diff --git a/Lib/test/test_regrtest.py b/Lib/test/test_regrtest.py index ee1d479b884..82939108b12 100644 --- a/Lib/test/test_regrtest.py +++ b/Lib/test/test_regrtest.py @@ -1195,7 +1195,7 @@ def test_slowest_interrupted(self): regex = ('10 slowest tests:\n') self.check_line(output, regex) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: Regex didn't match: coverage report table ('lines cov% module (path)') not found in regrtest output def test_coverage(self): # test --coverage test = self.create_test('coverage') @@ -1870,7 +1870,7 @@ def test_sleep(self): self.assertRegex(output, re.compile('%s timed out' % testname, re.MULTILINE)) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; test_unraisable_exc (test_regrtest_noop39.Tests.test_unraisable_exc) ... ok def test_unraisable_exc(self): # --fail-env-changed must catch unraisable exception. # The exception must be displayed even if sys.stderr is redirected. 
@@ -2324,7 +2324,7 @@ def test_pass(self): self.check_executed_tests(output, testname, stats=1, parallel=True) self.assertNotIn('SPAM SPAM SPAM', output) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType' def test_xml(self): code = textwrap.dedent(r""" import unittest @@ -2362,7 +2362,6 @@ def test_failed(self): for out in testcase.iter('system-out'): self.assertEqual(out.text, r"abc \x1b def") - @unittest.expectedFailure # TODO: RUSTPYTHON def test_nonascii(self): code = textwrap.dedent(r""" import unittest diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index 6b766272a3f..78a8dc24cce 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -112,7 +112,7 @@ def test_literals(self): # raw strings should not have unicode escapes self.assertNotEqual(r"\u0020", " ") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: is not def test_ascii(self): self.assertEqual(ascii('abc'), "'abc'") self.assertEqual(ascii('ab\\c'), "'ab\\\\c'") @@ -793,7 +793,7 @@ def test_isdecimal(self): for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']: self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch)) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False != True def test_isdigit(self): super().test_isdigit() self.checkequalnofix(True, '\u2460', 'isdigit') @@ -939,7 +939,7 @@ def test_upper(self): self.assertEqual('\U0008fffe'.upper(), '\U0008fffe') self.assertEqual('\u2177'.upper(), '\u2167') - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; ? 
^ def test_capitalize(self): string_tests.StringLikeTest.test_capitalize(self) self.assertEqual('\U0001044F'.capitalize(), '\U00010427') @@ -957,7 +957,7 @@ def test_capitalize(self): self.assertEqual('finnish'.capitalize(), 'Finnish') self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; ? ^ def test_title(self): super().test_title() self.assertEqual('\U0001044F'.title(), '\U00010427') @@ -975,7 +975,7 @@ def test_title(self): self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy') self.assertEqual('A\u03a3A'.title(), 'A\u03c3a') - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; + 𐐧 def test_swapcase(self): string_tests.StringLikeTest.test_swapcase(self) self.assertEqual('\U0001044F'.swapcase(), '\U00010427') @@ -1075,7 +1075,7 @@ def test_issue18183(self): '\U00100000'.ljust(3, '\U00010000') '\U00100000'.rjust(3, '\U00010000') - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; ? 
+ def test_format(self): self.assertEqual(''.format(), '') self.assertEqual('a'.format(), 'a') @@ -1464,13 +1464,13 @@ def test_format_huge_precision(self): with self.assertRaises(ValueError): result = format(2.34, format_string) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: ValueError not raised def test_format_huge_width(self): format_string = "{}f".format(sys.maxsize + 1) with self.assertRaises(ValueError): result = format(2.34, format_string) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; IndexError: tuple index out of range def test_format_huge_item_number(self): format_string = "{{{}:.6f}}".format(sys.maxsize + 1) with self.assertRaises(ValueError): @@ -1506,7 +1506,7 @@ def __format__(self, spec): self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3') self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g') - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: %x format: an integer is required, not PseudoInt def test_formatting(self): string_tests.StringLikeTest.test_formatting(self) # Testing Unicode formatting strings... @@ -1755,7 +1755,7 @@ def __str__(self): 'character buffers are decoded to unicode' ) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; Pass various keyword argument combinations to the constructor. def test_constructor_keyword_args(self): """Pass various keyword argument combinations to the constructor.""" # The object argument can be passed as a keyword. @@ -1765,7 +1765,7 @@ def test_constructor_keyword_args(self): self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'" self.assertEqual(str(object=b'foo', errors='strict'), 'foo') - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; Check the constructor argument defaults. 
def test_constructor_defaults(self): """Check the constructor argument defaults.""" # The object argument defaults to '' or b''. @@ -1777,7 +1777,6 @@ def test_constructor_defaults(self): # The errors argument defaults to strict. self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii') - @unittest.expectedFailure # TODO: RUSTPYTHON def test_codecs_utf7(self): utfTests = [ ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example @@ -2287,7 +2286,6 @@ def test_codecs_errors(self): self.assertRaises(ValueError, complex, "\ud800") self.assertRaises(ValueError, complex, "\udf00") - @unittest.expectedFailure # TODO: RUSTPYTHON def test_codecs(self): # Encoding self.assertEqual('hello'.encode('ascii'), b'hello') @@ -2417,7 +2415,7 @@ def test_ucs4(self): else: self.fail("Should have raised UnicodeDecodeError") - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: is not def test_conversion(self): # Make sure __str__() works properly class StrWithStr(str): @@ -2476,7 +2474,7 @@ def test_expandtabs_optimization(self): s = 'abc' self.assertIs(s.expandtabs(), s) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON def test_raiseMemError(self): asciifields = "nnb" compactfields = asciifields + "nP" @@ -2616,12 +2614,12 @@ def test_compare(self): self.assertTrue(astral >= bmp2) self.assertFalse(astral >= astral2) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true def test_free_after_iterating(self): support.check_free_after_iterating(self, iter, str) support.check_free_after_iterating(self, reversed, str) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 22 != 10 : _PythonRunResult(rc=22, out=b'', err=b'') def test_check_encoding_errors(self): # bpo-37388: str(bytes) and str.decode() must check encoding and errors # arguments in dev 
mode @@ -2682,7 +2680,7 @@ def test_check_encoding_errors(self): proc = assert_python_failure('-X', 'dev', '-c', code) self.assertEqual(proc.rc, 10, proc) - @unittest.expectedFailure # TODO: RUSTPYTHON + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: "str expected at most 3 arguments, got 4" does not match "expected at most 3 arguments, got 4" def test_str_invalid_call(self): # too many args with self.assertRaisesRegex(TypeError, r"str expected at most 3 arguments, got 4"): diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 92ce7c32a00..7a921d569a7 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -1988,8 +1988,6 @@ class UnicodeTest: def test_iso8859_1_filename(self): self._test_unicode_filename("iso8859-1") - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_utf7_filename(self): self._test_unicode_filename("utf7") @@ -2416,8 +2414,7 @@ def test__all__(self): 'SubsequentHeaderError', 'ExFileObject', 'main'} support.check__all__(self, tarfile, not_exported=not_exported) - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; FileNotFoundError: [Errno 2] No such file or directory: 'crates/pylib/Lib/test/testtar.tar.xz' def test_useful_error_message_when_modules_missing(self): fname = os.path.join(os.path.dirname(__file__), 'testtar.tar.xz') with self.assertRaises(tarfile.ReadError) as excinfo: diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py index b3e3e0bb27f..176a2112718 100644 --- a/Lib/test/test_utf8_mode.py +++ b/Lib/test/test_utf8_mode.py @@ -46,8 +46,7 @@ def test_posix_locale(self): out = self.get_output('-c', code, LC_ALL=loc) self.assertEqual(out, '1') - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailureIf(MS_WINDOWS, "TODO: RUSTPYTHON") def test_xoption(self): code = 'import sys; print(sys.flags.utf8_mode)' diff --git a/crates/common/src/encodings.rs 
b/crates/common/src/encodings.rs index c2f139b6bb9..913f0521e16 100644 --- a/crates/common/src/encodings.rs +++ b/crates/common/src/encodings.rs @@ -441,13 +441,22 @@ pub mod errors { let err_str = &ctx.full_data()[range.start.bytes..range.end.bytes]; let num_chars = range.end.chars - range.start.chars; let mut out = Vec::with_capacity(num_chars); + let mut pos = range.start; for ch in err_str.code_points() { - let ch = ch.to_u32(); - if !(0xdc80..=0xdcff).contains(&ch) { - // Not a UTF-8b surrogate, fail with original exception - return Err(ctx.error_encoding(range, reason)); + let ch_u32 = ch.to_u32(); + if !(0xdc80..=0xdcff).contains(&ch_u32) { + if out.is_empty() { + // Can't handle even the first character + return Err(ctx.error_encoding(range, reason)); + } + // Return partial result, restart from this character + return Ok((EncodeReplace::Bytes(ctx.bytes(out)), pos)); } - out.push((ch - 0xdc00) as u8); + out.push((ch_u32 - 0xdc00) as u8); + pos += StrSize { + bytes: ch.len_wtf8(), + chars: 1, + }; } Ok((EncodeReplace::Bytes(ctx.bytes(out)), range.end)) } diff --git a/crates/vm/src/stdlib/codecs.rs b/crates/vm/src/stdlib/codecs.rs index 011eaca23b7..f1fdbf1bdcd 100644 --- a/crates/vm/src/stdlib/codecs.rs +++ b/crates/vm/src/stdlib/codecs.rs @@ -1,3 +1,5 @@ +// spell-checker: ignore unencodable pused + pub(crate) use _codecs::module_def; use crate::common::static_cell::StaticCell; @@ -11,6 +13,7 @@ mod _codecs { AsObject, PyObjectRef, PyResult, VirtualMachine, builtins::{PyStrRef, PyUtf8StrRef}, codecs, + exceptions::cstring_error, function::{ArgBytesLike, FuncArgs}, }; @@ -26,6 +29,9 @@ mod _codecs { #[pyfunction] fn lookup(encoding: PyUtf8StrRef, vm: &VirtualMachine) -> PyResult { + if encoding.as_str().contains('\0') { + return Err(cstring_error(vm)); + } vm.state .codec_registry .lookup(encoding.as_str(), vm) @@ -81,6 +87,14 @@ mod _codecs { #[pyfunction] fn lookup_error(name: PyStrRef, vm: &VirtualMachine) -> PyResult { + if 
name.as_wtf8().as_bytes().contains(&0) { + return Err(cstring_error(vm)); + } + if !name.as_wtf8().is_utf8() { + return Err(vm.new_unicode_encode_error( + "'utf-8' codec can't encode character: surrogates not allowed".to_owned(), + )); + } vm.state.codec_registry.lookup_error(name.as_str(), vm) } @@ -290,6 +304,10 @@ mod _codecs { delegate_pycodecs!(utf_16_ex_decode, args, vm) } #[pyfunction] + fn utf_32_ex_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { + delegate_pycodecs!(utf_32_ex_decode, args, vm) + } + #[pyfunction] fn utf_32_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { delegate_pycodecs!(utf_32_encode, args, vm) } @@ -335,7 +353,6 @@ mod _codecs_windows { use crate::{PyResult, VirtualMachine}; use crate::{builtins::PyStrRef, function::ArgBytesLike}; - #[cfg(windows)] #[derive(FromArgs)] struct MbcsEncodeArgs { #[pyarg(positional)] @@ -344,7 +361,6 @@ mod _codecs_windows { errors: Option, } - #[cfg(windows)] #[pyfunction] fn mbcs_encode(args: MbcsEncodeArgs, vm: &VirtualMachine) -> PyResult<(Vec, usize)> { use crate::common::windows::ToWideString; @@ -425,7 +441,6 @@ mod _codecs_windows { Ok((buffer, char_len)) } - #[cfg(windows)] #[derive(FromArgs)] struct MbcsDecodeArgs { #[pyarg(positional)] @@ -437,7 +452,6 @@ mod _codecs_windows { r#final: bool, } - #[cfg(windows)] #[pyfunction] fn mbcs_decode(args: MbcsDecodeArgs, vm: &VirtualMachine) -> PyResult<(String, usize)> { use windows_sys::Win32::Globalization::{ @@ -525,7 +539,6 @@ mod _codecs_windows { Ok((s, len)) } - #[cfg(windows)] #[derive(FromArgs)] struct OemEncodeArgs { #[pyarg(positional)] @@ -534,7 +547,6 @@ mod _codecs_windows { errors: Option, } - #[cfg(windows)] #[pyfunction] fn oem_encode(args: OemEncodeArgs, vm: &VirtualMachine) -> PyResult<(Vec, usize)> { use crate::common::windows::ToWideString; @@ -615,7 +627,6 @@ mod _codecs_windows { Ok((buffer, char_len)) } - #[cfg(windows)] #[derive(FromArgs)] struct OemDecodeArgs { #[pyarg(positional)] @@ -627,7 +638,6 @@ mod 
_codecs_windows { r#final: bool, } - #[cfg(windows)] #[pyfunction] fn oem_decode(args: OemDecodeArgs, vm: &VirtualMachine) -> PyResult<(String, usize)> { use windows_sys::Win32::Globalization::{ @@ -715,7 +725,6 @@ mod _codecs_windows { Ok((s, len)) } - #[cfg(windows)] #[derive(FromArgs)] struct CodePageEncodeArgs { #[pyarg(positional)] @@ -726,50 +735,48 @@ mod _codecs_windows { errors: Option, } - #[cfg(windows)] - #[pyfunction] - fn code_page_encode( - args: CodePageEncodeArgs, - vm: &VirtualMachine, - ) -> PyResult<(Vec, usize)> { - use crate::common::windows::ToWideString; - use windows_sys::Win32::Globalization::{WC_NO_BEST_FIT_CHARS, WideCharToMultiByte}; - - if args.code_page < 0 { - return Err(vm.new_value_error("invalid code page number".to_owned())); + fn code_page_encoding_name(code_page: u32) -> String { + match code_page { + 0 => "mbcs".to_string(), + cp => format!("cp{cp}"), } - let errors = args.errors.as_ref().map(|s| s.as_str()).unwrap_or("strict"); - let code_page = args.code_page as u32; - let s = match args.s.to_str() { - Some(s) => s, - None => { - return Err(vm.new_unicode_encode_error(format!( - "'cp{code_page}' codec can't encode character: surrogates not allowed" - ))); - } - }; - let char_len = args.s.char_len(); + } - if s.is_empty() { - return Ok((Vec::new(), char_len)); + /// Get WideCharToMultiByte flags for encoding. + /// Matches encode_code_page_flags() in CPython. + fn encode_code_page_flags(code_page: u32, errors: &str) -> u32 { + use windows_sys::Win32::Globalization::{WC_ERR_INVALID_CHARS, WC_NO_BEST_FIT_CHARS}; + if code_page == 65001 { + // CP_UTF8 + WC_ERR_INVALID_CHARS + } else if code_page == 65000 { + // CP_UTF7 only supports flags=0 + 0 + } else if errors == "replace" { + 0 + } else { + WC_NO_BEST_FIT_CHARS } + } - let wide: Vec = std::ffi::OsStr::new(s).to_wide(); + /// Try to encode the entire wide string at once (fast/strict path). 
+ /// Returns Ok(Some(bytes)) on success, Ok(None) if there are unencodable chars, + /// or Err on OS error. + fn try_encode_code_page_strict( + code_page: u32, + wide: &[u16], + vm: &VirtualMachine, + ) -> PyResult>> { + use windows_sys::Win32::Globalization::WideCharToMultiByte; - // Some code pages (like UTF-7/8, 50220-50222, etc.) don't support WC_NO_BEST_FIT_CHARS - let flags = if code_page == 65000 - || code_page == 65001 - || code_page == 42 - || (50220..=50222).contains(&code_page) - || code_page == 50225 - || code_page == 50227 - || code_page == 50229 - || (57002..=57011).contains(&code_page) - || code_page == 54936 - { - 0 + let flags = encode_code_page_flags(code_page, "strict"); + + let use_default_char = code_page != 65001 && code_page != 65000; + let mut used_default_char: i32 = 0; + let pused = if use_default_char { + &mut used_default_char as *mut i32 } else { - WC_NO_BEST_FIT_CHARS + std::ptr::null_mut() }; let size = unsafe { @@ -781,17 +788,31 @@ mod _codecs_windows { std::ptr::null_mut(), 0, core::ptr::null(), - std::ptr::null_mut(), + pused, ) }; - if size == 0 { + if size <= 0 { + let err_code = std::io::Error::last_os_error().raw_os_error().unwrap_or(0); + if err_code == 1113 { + // ERROR_NO_UNICODE_TRANSLATION + return Ok(None); + } let err = std::io::Error::last_os_error(); - return Err(vm.new_os_error(format!("code_page_encode failed: {err}"))); + return Err(vm.new_os_error(format!("code_page_encode: {err}"))); + } + + if use_default_char && used_default_char != 0 { + return Ok(None); } let mut buffer = vec![0u8; size as usize]; - let mut used_default_char: i32 = 0; + used_default_char = 0; + let pused = if use_default_char { + &mut used_default_char as *mut i32 + } else { + std::ptr::null_mut() + }; let result = unsafe { WideCharToMultiByte( @@ -802,30 +823,235 @@ mod _codecs_windows { buffer.as_mut_ptr().cast(), size, core::ptr::null(), - if errors == "strict" && flags != 0 { - &mut used_default_char - } else { - std::ptr::null_mut() - 
}, + pused, ) }; - if result == 0 { + if result <= 0 { + let err_code = std::io::Error::last_os_error().raw_os_error().unwrap_or(0); + if err_code == 1113 { + return Ok(None); + } let err = std::io::Error::last_os_error(); - return Err(vm.new_os_error(format!("code_page_encode failed: {err}"))); + return Err(vm.new_os_error(format!("code_page_encode: {err}"))); } - if errors == "strict" && used_default_char != 0 { - return Err(vm.new_unicode_encode_error(format!( - "'cp{code_page}' codec can't encode characters: invalid character" - ))); + if use_default_char && used_default_char != 0 { + return Ok(None); } buffer.truncate(result as usize); - Ok((buffer, char_len)) + Ok(Some(buffer)) + } + + /// Encode character by character with error handling. + fn encode_code_page_errors( + code_page: u32, + s: &PyStrRef, + errors: &str, + encoding_name: &str, + vm: &VirtualMachine, + ) -> PyResult<(Vec, usize)> { + use crate::builtins::{PyBytes, PyStr, PyTuple}; + use windows_sys::Win32::Globalization::WideCharToMultiByte; + + let char_len = s.char_len(); + let flags = encode_code_page_flags(code_page, errors); + let use_default_char = code_page != 65001 && code_page != 65000; + let encoding_str = vm.ctx.new_str(encoding_name); + let reason_str = vm.ctx.new_str("invalid character"); + + // For strict mode, find the first unencodable character and raise + if errors == "strict" { + // Find the failing position by trying each character + let mut fail_pos = 0; + for cp in s.as_wtf8().code_points() { + let ch = cp.to_u32(); + if (0xD800..=0xDFFF).contains(&ch) { + break; + } + let mut wchars = [0u16; 2]; + let wchar_len = if ch < 0x10000 { + wchars[0] = ch as u16; + 1 + } else { + wchars[0] = ((ch - 0x10000) >> 10) as u16 + 0xD800; + wchars[1] = ((ch - 0x10000) & 0x3FF) as u16 + 0xDC00; + 2 + }; + let mut used_default_char: i32 = 0; + let pused = if use_default_char { + &mut used_default_char as *mut i32 + } else { + std::ptr::null_mut() + }; + let outsize = unsafe { + 
WideCharToMultiByte( + code_page, + flags, + wchars.as_ptr(), + wchar_len, + std::ptr::null_mut(), + 0, + core::ptr::null(), + pused, + ) + }; + if outsize <= 0 || (use_default_char && used_default_char != 0) { + break; + } + fail_pos += 1; + } + return Err(vm.new_unicode_encode_error_real( + encoding_str, + s.clone(), + fail_pos, + fail_pos + 1, + reason_str, + )); + } + + let error_handler = vm.state.codec_registry.lookup_error(errors, vm)?; + let mut output = Vec::new(); + + // Collect code points for random access + let code_points: Vec = s.as_wtf8().code_points().map(|cp| cp.to_u32()).collect(); + + let mut pos = 0usize; + while pos < code_points.len() { + let ch = code_points[pos]; + + // Convert code point to UTF-16 + let mut wchars = [0u16; 2]; + let wchar_len; + let is_surrogate = (0xD800..=0xDFFF).contains(&ch); + + if is_surrogate { + wchar_len = 0; // Can't encode surrogates normally + } else if ch < 0x10000 { + wchars[0] = ch as u16; + wchar_len = 1; + } else { + wchars[0] = ((ch - 0x10000) >> 10) as u16 + 0xD800; + wchars[1] = ((ch - 0x10000) & 0x3FF) as u16 + 0xDC00; + wchar_len = 2; + } + + if !is_surrogate { + let mut used_default_char: i32 = 0; + let pused = if use_default_char { + &mut used_default_char as *mut i32 + } else { + std::ptr::null_mut() + }; + + let mut buf = [0u8; 8]; + let outsize = unsafe { + WideCharToMultiByte( + code_page, + flags, + wchars.as_ptr(), + wchar_len, + buf.as_mut_ptr().cast(), + buf.len() as i32, + core::ptr::null(), + pused, + ) + }; + + if outsize > 0 && (!use_default_char || used_default_char == 0) { + output.extend_from_slice(&buf[..outsize as usize]); + pos += 1; + continue; + } + } + + // Character can't be encoded - call error handler + let exc = vm.new_unicode_encode_error_real( + encoding_str.clone(), + s.clone(), + pos, + pos + 1, + reason_str.clone(), + ); + + let res = error_handler.call((exc,), vm)?; + let tuple_err = + || vm.new_type_error("encoding error handler must return (str/bytes, int) tuple"); + 
let tuple: &PyTuple = res.downcast_ref().ok_or_else(&tuple_err)?; + let tuple_slice = tuple.as_slice(); + if tuple_slice.len() != 2 { + return Err(tuple_err()); + } + + let replacement = &tuple_slice[0]; + let new_pos_obj = tuple_slice[1].clone(); + + if let Some(bytes) = replacement.downcast_ref::() { + output.extend_from_slice(bytes); + } else if let Some(rep_str) = replacement.downcast_ref::() { + // Replacement string - try to encode each character + for rcp in rep_str.as_wtf8().code_points() { + let rch = rcp.to_u32(); + if rch > 127 { + return Err(vm.new_unicode_encode_error_real( + encoding_str.clone(), + s.clone(), + pos, + pos + 1, + vm.ctx + .new_str("unable to encode error handler result to ASCII"), + )); + } + output.push(rch as u8); + } + } else { + return Err(tuple_err()); + } + + let new_pos: isize = new_pos_obj.try_into_value(vm).map_err(|_| tuple_err())?; + pos = if new_pos < 0 { + (code_points.len() as isize + new_pos).max(0) as usize + } else { + new_pos as usize + }; + } + + Ok((output, char_len)) + } + + #[pyfunction] + fn code_page_encode( + args: CodePageEncodeArgs, + vm: &VirtualMachine, + ) -> PyResult<(Vec, usize)> { + use crate::common::windows::ToWideString; + + if args.code_page < 0 { + return Err(vm.new_value_error("invalid code page number".to_owned())); + } + let errors = args.errors.as_ref().map(|s| s.as_str()).unwrap_or("strict"); + let code_page = args.code_page as u32; + let char_len = args.s.char_len(); + + if char_len == 0 { + return Ok((Vec::new(), 0)); + } + + let encoding_name = code_page_encoding_name(code_page); + + // Fast path: try encoding the whole string at once (only if no surrogates) + if let Some(str_data) = args.s.to_str() { + let wide: Vec = std::ffi::OsStr::new(str_data).to_wide(); + if let Some(result) = try_encode_code_page_strict(code_page, &wide, vm)? 
{ + return Ok((result, char_len)); + } + } + + // Slow path: character by character with error handling + encode_code_page_errors(code_page, &args.s, errors, &encoding_name, vm) } - #[cfg(windows)] #[derive(FromArgs)] struct CodePageDecodeArgs { #[pyarg(positional)] @@ -835,112 +1061,311 @@ mod _codecs_windows { #[pyarg(positional, optional)] errors: Option, #[pyarg(positional, default = false)] - #[allow(dead_code)] r#final: bool, } - #[cfg(windows)] - #[pyfunction] - fn code_page_decode( - args: CodePageDecodeArgs, + /// Try to decode the entire buffer with strict flags (fast path). + /// Returns Ok(Some(wide_chars)) on success, Ok(None) on decode error, + /// or Err on OS error. + fn try_decode_code_page_strict( + code_page: u32, + data: &[u8], vm: &VirtualMachine, - ) -> PyResult<(String, usize)> { + ) -> PyResult>> { use windows_sys::Win32::Globalization::{MB_ERR_INVALID_CHARS, MultiByteToWideChar}; - if args.code_page < 0 { - return Err(vm.new_value_error("invalid code page number".to_owned())); - } - let _errors = args.errors.as_ref().map(|s| s.as_str()).unwrap_or("strict"); - let code_page = args.code_page as u32; - let data = args.data.borrow_buf(); - let len = data.len(); - - if data.is_empty() { - return Ok((String::new(), 0)); - } - - // Some code pages don't support MB_ERR_INVALID_CHARS - let strict_flags = if code_page == 65000 - || code_page == 42 - || (50220..=50222).contains(&code_page) - || code_page == 50225 - || code_page == 50227 - || code_page == 50229 - || (57002..=57011).contains(&code_page) - { - 0 - } else { - MB_ERR_INVALID_CHARS - }; - - let size = unsafe { - MultiByteToWideChar( - code_page, - strict_flags, - data.as_ptr().cast(), - len as i32, - std::ptr::null_mut(), - 0, - ) - }; + let mut flags = MB_ERR_INVALID_CHARS; - if size == 0 { + loop { let size = unsafe { MultiByteToWideChar( code_page, - 0, + flags, data.as_ptr().cast(), - len as i32, + data.len() as i32, std::ptr::null_mut(), 0, ) }; - if size == 0 { - let err = 
std::io::Error::last_os_error(); - return Err(vm.new_os_error(format!("code_page_decode failed: {err}"))); + if size > 0 { + let mut buffer = vec![0u16; size as usize]; + let result = unsafe { + MultiByteToWideChar( + code_page, + flags, + data.as_ptr().cast(), + data.len() as i32, + buffer.as_mut_ptr(), + size, + ) + }; + if result > 0 { + buffer.truncate(result as usize); + return Ok(Some(buffer)); + } } - let mut buffer = vec![0u16; size as usize]; - let result = unsafe { - MultiByteToWideChar( - code_page, - 0, - data.as_ptr().cast(), - len as i32, - buffer.as_mut_ptr(), - size, - ) - }; - if result == 0 { - let err = std::io::Error::last_os_error(); - return Err(vm.new_os_error(format!("code_page_decode failed: {err}"))); + let err_code = std::io::Error::last_os_error().raw_os_error().unwrap_or(0); + // ERROR_INVALID_FLAGS = 1004 + if flags != 0 && err_code == 1004 { + flags = 0; + continue; } - buffer.truncate(result as usize); - let s = String::from_utf16(&buffer).map_err(|e| { - vm.new_unicode_decode_error(format!("code_page_decode failed: {e}")) - })?; - return Ok((s, len)); + // ERROR_NO_UNICODE_TRANSLATION = 1113 + if err_code == 1113 { + return Ok(None); + } + let err = std::io::Error::last_os_error(); + return Err(vm.new_os_error(format!("code_page_decode: {err}"))); } + } - let mut buffer = vec![0u16; size as usize]; - let result = unsafe { - MultiByteToWideChar( - code_page, - strict_flags, - data.as_ptr().cast(), - len as i32, - buffer.as_mut_ptr(), - size, - ) + /// Decode byte by byte with error handling (slow path). 
+ fn decode_code_page_errors( + code_page: u32, + data: &[u8], + errors: &str, + is_final: bool, + encoding_name: &str, + vm: &VirtualMachine, + ) -> PyResult<(PyStrRef, usize)> { + use crate::builtins::PyTuple; + use crate::common::wtf8::Wtf8Buf; + use windows_sys::Win32::Globalization::{MB_ERR_INVALID_CHARS, MultiByteToWideChar}; + + let len = data.len(); + let encoding_str = vm.ctx.new_str(encoding_name); + let reason_str = vm + .ctx + .new_str("No mapping for the Unicode character exists in the target code page."); + + // For strict+final, find the failing position and raise + if errors == "strict" && is_final { + // Find the exact failing byte position by trying byte by byte + let mut fail_pos = 0; + let mut flags_s: u32 = MB_ERR_INVALID_CHARS; + let mut buf = [0u16; 2]; + while fail_pos < len { + let mut in_size = 1; + let mut found = false; + while in_size <= 4 && fail_pos + in_size <= len { + let outsize = unsafe { + MultiByteToWideChar( + code_page, + flags_s, + data[fail_pos..].as_ptr().cast(), + in_size as i32, + buf.as_mut_ptr(), + 2, + ) + }; + if outsize > 0 { + fail_pos += in_size; + found = true; + break; + } + let err_code = std::io::Error::last_os_error().raw_os_error().unwrap_or(0); + if err_code == 1004 && flags_s != 0 { + flags_s = 0; + continue; + } + in_size += 1; + } + if !found { + break; + } + } + let object = vm.ctx.new_bytes(data.to_vec()); + return Err(vm.new_unicode_decode_error_real( + encoding_str, + object, + fail_pos, + fail_pos + 1, + reason_str, + )); + } + + let error_handler = if errors != "strict" + && errors != "ignore" + && errors != "replace" + && errors != "backslashreplace" + && errors != "surrogateescape" + { + Some(vm.state.codec_registry.lookup_error(errors, vm)?) 
+ } else { + None }; - if result == 0 { - let err = std::io::Error::last_os_error(); - return Err(vm.new_os_error(format!("code_page_decode failed: {err}"))); + + let mut wide_buf: Vec = Vec::new(); + let mut pos = 0usize; + let mut flags: u32 = MB_ERR_INVALID_CHARS; + + while pos < len { + // Try to decode with increasing byte counts (1, 2, 3, 4) + let mut in_size = 1; + let mut outsize; + let mut buffer = [0u16; 2]; + + loop { + outsize = unsafe { + MultiByteToWideChar( + code_page, + flags, + data[pos..].as_ptr().cast(), + in_size as i32, + buffer.as_mut_ptr(), + 2, + ) + }; + if outsize > 0 { + break; + } + let err_code = std::io::Error::last_os_error().raw_os_error().unwrap_or(0); + if err_code == 1004 && flags != 0 { + // ERROR_INVALID_FLAGS - retry with flags=0 + flags = 0; + continue; + } + if err_code != 1113 && err_code != 122 { + // Not ERROR_NO_UNICODE_TRANSLATION and not ERROR_INSUFFICIENT_BUFFER + let err = std::io::Error::last_os_error(); + return Err(vm.new_os_error(format!("code_page_decode: {err}"))); + } + in_size += 1; + if in_size > 4 || pos + in_size > len { + break; + } + } + + if outsize <= 0 { + // Can't decode this byte sequence + if pos + in_size >= len && !is_final { + // Incomplete sequence at end, not final - stop here + break; + } + + // Handle the error based on error mode + match errors { + "ignore" => { + pos += 1; + } + "replace" => { + wide_buf.push(0xFFFD); + pos += 1; + } + "backslashreplace" => { + let byte = data[pos]; + for ch in format!("\\x{byte:02x}").encode_utf16() { + wide_buf.push(ch); + } + pos += 1; + } + "surrogateescape" => { + let byte = data[pos]; + wide_buf.push(0xDC00 + byte as u16); + pos += 1; + } + "strict" => { + let object = vm.ctx.new_bytes(data.to_vec()); + return Err(vm.new_unicode_decode_error_real( + encoding_str, + object, + pos, + pos + 1, + reason_str, + )); + } + _ => { + // Custom error handler + let object = vm.ctx.new_bytes(data.to_vec()); + let exc = vm.new_unicode_decode_error_real( + 
encoding_str.clone(), + object, + pos, + pos + 1, + reason_str.clone(), + ); + let handler = error_handler.as_ref().unwrap(); + let res = handler.call((exc,), vm)?; + let tuple_err = || { + vm.new_type_error("decoding error handler must return (str, int) tuple") + }; + let tuple: &PyTuple = res.downcast_ref().ok_or_else(&tuple_err)?; + let tuple_slice = tuple.as_slice(); + if tuple_slice.len() != 2 { + return Err(tuple_err()); + } + + let replacement: PyStrRef = tuple_slice[0] + .clone() + .try_into_value(vm) + .map_err(|_| tuple_err())?; + let new_pos: isize = tuple_slice[1] + .clone() + .try_into_value(vm) + .map_err(|_| tuple_err())?; + + for cp in replacement.as_wtf8().code_points() { + let u = cp.to_u32(); + if u < 0x10000 { + wide_buf.push(u as u16); + } else { + wide_buf.push(((u - 0x10000) >> 10) as u16 + 0xD800); + wide_buf.push(((u - 0x10000) & 0x3FF) as u16 + 0xDC00); + } + } + + pos = if new_pos < 0 { + (len as isize + new_pos).max(0) as usize + } else { + new_pos as usize + }; + } + } + } else { + // Successfully decoded + wide_buf.extend_from_slice(&buffer[..outsize as usize]); + pos += in_size; + } } - buffer.truncate(result as usize); - let s = String::from_utf16(&buffer) - .map_err(|e| vm.new_unicode_decode_error(format!("code_page_decode failed: {e}")))?; - Ok((s, len)) + let s = Wtf8Buf::from_wide(&wide_buf); + Ok((vm.ctx.new_str(s), pos)) + } + + #[pyfunction] + fn code_page_decode( + args: CodePageDecodeArgs, + vm: &VirtualMachine, + ) -> PyResult<(PyStrRef, usize)> { + use crate::common::wtf8::Wtf8Buf; + + if args.code_page < 0 { + return Err(vm.new_value_error("invalid code page number".to_owned())); + } + let errors = args.errors.as_ref().map(|s| s.as_str()).unwrap_or("strict"); + let code_page = args.code_page as u32; + let data = args.data.borrow_buf(); + let is_final = args.r#final; + + if data.is_empty() { + return Ok((vm.ctx.empty_str.to_owned(), 0)); + } + + let encoding_name = code_page_encoding_name(code_page); + + // Fast path: try 
to decode the whole buffer with strict flags + match try_decode_code_page_strict(code_page, &data, vm)? { + Some(wide) => { + let s = Wtf8Buf::from_wide(&wide); + return Ok((vm.ctx.new_str(s), data.len())); + } + None => { + // Decode error - fall through to slow path + } + } + + // Slow path: byte by byte with error handling + decode_code_page_errors(code_page, &data, errors, is_final, &encoding_name, vm) } } diff --git a/crates/vm/src/stdlib/io.rs b/crates/vm/src/stdlib/io.rs index b270fa2529b..5409d68636f 100644 --- a/crates/vm/src/stdlib/io.rs +++ b/crates/vm/src/stdlib/io.rs @@ -21,7 +21,7 @@ cfg_if::cfg_if! { } use crate::{ - PyObjectRef, PyResult, TryFromObject, VirtualMachine, + AsObject, PyObject, PyObjectRef, PyResult, TryFromObject, VirtualMachine, builtins::{PyBaseExceptionRef, PyModule}, common::os::ErrorExt, convert::{IntoPyException, ToPyException}, @@ -91,6 +91,38 @@ impl IntoPyException for std::io::Error { } } +fn file_closed(file: &PyObject, vm: &VirtualMachine) -> PyResult { + file.get_attr("closed", vm)?.try_to_bool(vm) +} + +/// iobase_finalize in Modules/_io/iobase.c +fn iobase_finalize(zelf: &PyObject, vm: &VirtualMachine) { + // If `closed` doesn't exist or can't be evaluated as bool, then the + // object is probably in an unusable state, so ignore. + let closed = match vm.get_attribute_opt(zelf.to_owned(), "closed") { + Ok(Some(val)) => match val.try_to_bool(vm) { + Ok(b) => b, + Err(_) => return, + }, + _ => return, + }; + if !closed { + // Signal close() that it was called as part of the object + // finalization process. + let _ = zelf.set_attr("_finalizing", vm.ctx.true_value.clone(), vm); + if let Err(e) = vm.call_method(zelf, "close", ()) { + // BrokenPipeError during GC finalization is expected when pipe + // buffer objects are collected after the subprocess dies. The + // underlying fd is still properly closed by raw.close(). 
+ // Popen.__del__ catches BrokenPipeError, but our tracing GC may + // finalize pipe buffers before Popen.__del__ runs. + if !e.fast_isinstance(vm.ctx.exceptions.broken_pipe_error) { + vm.run_unraisable(e, None, zelf.to_owned()); + } + } + } +} + // not used on all platforms #[derive(Copy, Clone)] #[repr(transparent)] @@ -395,10 +427,6 @@ mod _io { } } - fn file_closed(file: &PyObject, vm: &VirtualMachine) -> PyResult { - file.get_attr("closed", vm)?.try_to_bool(vm) - } - fn check_closed(file: &PyObject, vm: &VirtualMachine) -> PyResult<()> { if file_closed(file, vm)? { Err(io_closed_error(vm)) @@ -618,6 +646,11 @@ mod _io { impl Destructor for _IOBase { fn slot_del(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<()> { + // C-level IO types (FileIO, Buffered*, TextIOWrapper) have their own + // slot_del that calls iobase_finalize with proper _finalizing flag + // and _dealloc_warn chain. This base fallback is only reached by + // Python-level subclasses, where we silently discard close() errors + // to avoid surfacing unraisable from partially initialized objects. 
let _ = vm.call_method(zelf, "close", ()); Ok(()) } @@ -1580,7 +1613,7 @@ mod _io { } #[pyclass] - trait BufferedMixin: PyPayload { + trait BufferedMixin: PyPayload + StaticType { const CLASS_NAME: &'static str; const READABLE: bool; const WRITABLE: bool; @@ -1588,6 +1621,7 @@ mod _io { fn data(&self) -> &PyThreadMutex; fn closing(&self) -> &AtomicBool; + fn finalizing(&self) -> &AtomicBool; fn lock(&self, vm: &VirtualMachine) -> PyResult> { self.data() @@ -1797,6 +1831,10 @@ mod _io { } raw.to_owned() }; + if zelf.finalizing().load(Ordering::Relaxed) { + // _dealloc_warn: delegate to raw._dealloc_warn(source) + let _ = vm.call_method(&raw, "_dealloc_warn", (zelf.as_object().to_owned(),)); + } // Set closing flag so that concurrent write() calls will fail zelf.closing().store(true, Ordering::Release); let flush_res = vm.call_method(zelf.as_object(), "flush", ()).map(drop); @@ -1818,6 +1856,34 @@ mod _io { fn __getstate__(zelf: PyObjectRef, vm: &VirtualMachine) -> PyResult { Err(vm.new_type_error(format!("cannot pickle '{}' instances", zelf.class().name()))) } + + #[pymethod] + fn __reduce_ex__(zelf: PyObjectRef, proto: usize, vm: &VirtualMachine) -> PyResult { + if zelf.class().is(Self::static_type()) { + return Err( + vm.new_type_error(format!("cannot pickle '{}' object", zelf.class().name())) + ); + } + let _ = proto; + reduce_ex_for_subclass(zelf, vm) + } + + #[pymethod] + fn _dealloc_warn( + zelf: PyRef, + source: PyObjectRef, + vm: &VirtualMachine, + ) -> PyResult<()> { + // Get raw reference and release lock before calling downstream + let raw = { + let data = zelf.lock(vm)?; + data.raw.clone() + }; + if let Some(raw) = raw { + let _ = vm.call_method(&raw, "_dealloc_warn", (source,)); + } + Ok(()) + } } #[pyclass] @@ -1931,6 +1997,7 @@ mod _io { _base: _BufferedIOBase, data: PyThreadMutex, closing: AtomicBool, + finalizing: AtomicBool, } impl BufferedMixin for BufferedReader { @@ -1945,6 +2012,10 @@ mod _io { fn closing(&self) -> &AtomicBool { &self.closing } 
+ + fn finalizing(&self) -> &AtomicBool { + &self.finalizing + } } impl BufferedReadable for BufferedReader { @@ -1963,7 +2034,10 @@ mod _io { impl Destructor for BufferedReader { fn slot_del(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<()> { - let _ = vm.call_method(zelf, "close", ()); + if let Some(buf) = zelf.downcast_ref::() { + buf.finalizing.store(true, Ordering::Relaxed); + } + iobase_finalize(zelf, vm); Ok(()) } @@ -2027,6 +2101,7 @@ mod _io { _base: _BufferedIOBase, data: PyThreadMutex, closing: AtomicBool, + finalizing: AtomicBool, } impl BufferedMixin for BufferedWriter { @@ -2041,6 +2116,10 @@ mod _io { fn closing(&self) -> &AtomicBool { &self.closing } + + fn finalizing(&self) -> &AtomicBool { + &self.finalizing + } } impl BufferedWritable for BufferedWriter { @@ -2059,7 +2138,10 @@ mod _io { impl Destructor for BufferedWriter { fn slot_del(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<()> { - let _ = vm.call_method(zelf, "close", ()); + if let Some(buf) = zelf.downcast_ref::() { + buf.finalizing.store(true, Ordering::Relaxed); + } + iobase_finalize(zelf, vm); Ok(()) } @@ -2078,6 +2160,7 @@ mod _io { _base: _BufferedIOBase, data: PyThreadMutex, closing: AtomicBool, + finalizing: AtomicBool, } impl BufferedMixin for BufferedRandom { @@ -2093,6 +2176,10 @@ mod _io { fn closing(&self) -> &AtomicBool { &self.closing } + + fn finalizing(&self) -> &AtomicBool { + &self.finalizing + } } impl BufferedReadable for BufferedRandom { @@ -2125,7 +2212,10 @@ mod _io { impl Destructor for BufferedRandom { fn slot_del(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<()> { - let _ = vm.call_method(zelf, "close", ()); + if let Some(buf) = zelf.downcast_ref::() { + buf.finalizing.store(true, Ordering::Relaxed); + } + iobase_finalize(zelf, vm); Ok(()) } @@ -2229,7 +2319,7 @@ mod _io { impl Destructor for BufferedRWPair { fn slot_del(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<()> { - let _ = vm.call_method(zelf, "close", ()); + iobase_finalize(zelf, vm); 
Ok(()) } @@ -2246,14 +2336,14 @@ mod _io { #[pyarg(any, default)] errors: Option, #[pyarg(any, default)] - newline: Option, + newline: OptionalOption, #[pyarg(any, default)] - line_buffering: Option, + line_buffering: OptionalOption, #[pyarg(any, default)] - write_through: Option, + write_through: OptionalOption, } - #[derive(Debug, Copy, Clone, Default)] + #[derive(Debug, Copy, Clone, Default, PartialEq)] enum Newlines { #[default] Universal, @@ -2284,7 +2374,7 @@ mod _io { }) .ok_or(len) } - Self::Cr => s.find("\n".as_ref()).map(|p| p + 1).ok_or(len), + Self::Cr => s.find("\r".as_ref()).map(|p| p + 1).ok_or(len), Self::Crlf => { // s[searched..] == remaining let mut searched = 0; @@ -2323,7 +2413,13 @@ mod _io { obj.class().name() )) })?; - match s.as_str() { + let wtf8 = s.as_wtf8(); + if !wtf8.is_utf8() { + let repr = s.repr(vm)?.as_str().to_owned(); + return Err(vm.new_value_error(format!("illegal newline value: {repr}"))); + } + let s_str = wtf8.as_str().expect("checked utf8"); + match s_str { "" => Self::Passthrough, "\n" => Self::Lf, "\r" => Self::Cr, @@ -2335,6 +2431,22 @@ mod _io { } } + fn reduce_ex_for_subclass(zelf: PyObjectRef, vm: &VirtualMachine) -> PyResult { + let cls = zelf.class(); + let new = vm + .get_attribute_opt(cls.to_owned().into(), "__new__")? + .ok_or_else(|| vm.new_attribute_error("type has no attribute '__new__'"))?; + let args = vm.ctx.new_tuple(vec![cls.to_owned().into()]); + let state = if let Some(getstate) = vm.get_attribute_opt(zelf.clone(), "__getstate__")? { + getstate.call((), vm)? 
+ } else if let Ok(dict) = zelf.get_attr("__dict__", vm) { + dict + } else { + vm.ctx.none() + }; + Ok(vm.ctx.new_tuple(vec![new, args.into(), state]).into()) + } + /// A length of or index into a UTF-8 string, measured in both chars and bytes #[derive(Debug, Default, Copy, Clone)] struct Utf8size { @@ -2572,6 +2684,7 @@ mod _io { struct TextIOWrapper { _base: _TextIOBase, data: PyThreadMutex>, + finalizing: AtomicBool, } impl DefaultConstructor for TextIOWrapper {} @@ -2587,41 +2700,37 @@ mod _io { let mut data = zelf.lock_opt(vm)?; *data = None; - let encoding = match args.encoding { - None if vm.state.config.settings.utf8_mode > 0 => { - identifier_utf8!(vm, utf_8).to_owned() - } - Some(enc) if enc.as_str() != "locale" => { - // Check for embedded null character - if enc.as_str().contains('\0') { - return Err(cstring_error(vm)); - } - enc - } - _ => { - // None without utf8_mode or "locale" encoding - vm.import("locale", 0)? - .get_attr("getencoding", vm)? - .call((), vm)? - .try_into_value(vm)? 
- } - }; + let encoding = Self::resolve_encoding(args.encoding, vm)?; let errors = args .errors .unwrap_or_else(|| identifier!(vm, strict).to_owned()); - - // Check for embedded null character in errors (use as_wtf8 to handle surrogates) - if errors.as_wtf8().as_bytes().contains(&0) { - return Err(cstring_error(vm)); - } + Self::validate_errors(&errors, vm)?; let has_read1 = vm.get_attribute_opt(buffer.clone(), "read1")?.is_some(); let seekable = vm.call_method(&buffer, "seekable", ())?.try_to_bool(vm)?; - let newline = args.newline.unwrap_or_default(); + let newline = match args.newline { + OptionalArg::Missing => Newlines::default(), + OptionalArg::Present(None) => Newlines::default(), + OptionalArg::Present(Some(newline)) => newline, + }; let (encoder, decoder) = Self::find_coder(&buffer, encoding.as_str(), &errors, newline, vm)?; + if let Some((encoder, _)) = &encoder { + Self::adjust_encoder_state_for_bom(encoder, encoding.as_str(), &buffer, vm)?; + } + + let line_buffering = match args.line_buffering { + OptionalArg::Missing => false, + OptionalArg::Present(None) => false, + OptionalArg::Present(Some(value)) => value.try_to_bool(vm)?, + }; + let write_through = match args.write_through { + OptionalArg::Missing => false, + OptionalArg::Present(None) => false, + OptionalArg::Present(Some(value)) => value.try_to_bool(vm)?, + }; *data = Some(TextIOData { buffer, @@ -2630,8 +2739,8 @@ mod _io { encoding, errors, newline, - line_buffering: args.line_buffering.unwrap_or_default(), - write_through: args.write_through.unwrap_or_default(), + line_buffering, + write_through, chunk_size: 8192, seekable, has_read1, @@ -2646,6 +2755,16 @@ mod _io { Ok(()) } + + fn slot_init(zelf: PyObjectRef, args: FuncArgs, vm: &VirtualMachine) -> PyResult<()> { + let zelf_ref: PyRef = zelf.try_into_value(vm)?; + { + let mut data = zelf_ref.lock_opt(vm)?; + *data = None; + } + let (buffer, text_args): (PyObjectRef, TextIOWrapperArgs) = args.bind(vm)?; + Self::init(zelf_ref, (buffer, 
text_args), vm) + } } impl TextIOWrapper { @@ -2664,6 +2783,108 @@ mod _io { .map_err(|_| vm.new_value_error("I/O operation on uninitialized object")) } + fn validate_errors(errors: &PyStrRef, vm: &VirtualMachine) -> PyResult<()> { + if errors.as_wtf8().as_bytes().contains(&0) { + return Err(cstring_error(vm)); + } + if !errors.as_wtf8().is_utf8() { + return Err(vm.new_unicode_encode_error( + "'utf-8' codec can't encode character: surrogates not allowed".to_owned(), + )); + } + vm.state + .codec_registry + .lookup_error(errors.as_str(), vm) + .map(drop) + } + + fn bool_from_index(value: PyObjectRef, vm: &VirtualMachine) -> PyResult { + let int = value.try_index(vm)?; + let value: i32 = int.try_to_primitive(vm)?; + Ok(value != 0) + } + + fn resolve_encoding( + encoding: Option, + vm: &VirtualMachine, + ) -> PyResult { + if encoding.is_none() && vm.state.config.settings.warn_default_encoding { + crate::stdlib::warnings::warn( + vm.ctx.exceptions.encoding_warning, + "'encoding' argument not specified".to_owned(), + 1, + vm, + )?; + } + let encoding = match encoding { + None if vm.state.config.settings.utf8_mode > 0 => { + identifier_utf8!(vm, utf_8).to_owned() + } + Some(enc) if enc.as_str() == "locale" => match vm.import("locale", 0) { + Ok(locale) => locale + .get_attr("getencoding", vm)? + .call((), vm)? + .try_into_value(vm)?, + Err(err) + if err.fast_isinstance(vm.ctx.exceptions.import_error) + || err.fast_isinstance(vm.ctx.exceptions.module_not_found_error) => + { + identifier_utf8!(vm, utf_8).to_owned() + } + Err(err) => return Err(err), + }, + Some(enc) => { + if enc.as_str().contains('\0') { + return Err(cstring_error(vm)); + } + enc + } + _ => match vm.import("locale", 0) { + Ok(locale) => locale + .get_attr("getencoding", vm)? + .call((), vm)? 
+ .try_into_value(vm)?, + Err(err) + if err.fast_isinstance(vm.ctx.exceptions.import_error) + || err.fast_isinstance(vm.ctx.exceptions.module_not_found_error) => + { + identifier_utf8!(vm, utf_8).to_owned() + } + Err(err) => return Err(err), + }, + }; + if encoding.as_str().contains('\0') { + return Err(cstring_error(vm)); + } + Ok(encoding) + } + + fn adjust_encoder_state_for_bom( + encoder: &PyObjectRef, + encoding: &str, + buffer: &PyObject, + vm: &VirtualMachine, + ) -> PyResult<()> { + let needs_bom = matches!(encoding, "utf-8-sig" | "utf-16" | "utf-32"); + if !needs_bom { + return Ok(()); + } + let seekable = vm.call_method(buffer, "seekable", ())?.try_to_bool(vm)?; + if !seekable { + return Ok(()); + } + let pos = vm.call_method(buffer, "tell", ())?; + if vm.bool_eq(&pos, vm.ctx.new_int(0).as_ref())? { + return Ok(()); + } + if let Err(err) = vm.call_method(encoder, "setstate", (0,)) + && !err.fast_isinstance(vm.ctx.exceptions.attribute_error) + { + return Err(err); + } + Ok(()) + } + #[allow(clippy::type_complexity)] fn find_coder( buffer: &PyObject, @@ -2676,6 +2897,11 @@ mod _io { Option, )> { let codec = vm.state.codec_registry.lookup(encoding, vm)?; + if !codec.is_text_codec(vm)? { + return Err(vm.new_lookup_error(format!( + "'{encoding}' is not a text encoding; use codecs.open() to handle arbitrary codecs" + ))); + } let encoder = if vm.call_method(buffer, "writable", ())?.try_to_bool(vm)? 
{ let incremental_encoder = @@ -2734,33 +2960,102 @@ mod _io { impl TextIOWrapper { #[pymethod] fn reconfigure(&self, args: TextIOWrapperArgs, vm: &VirtualMachine) -> PyResult<()> { - let mut data = self.data.lock().unwrap(); - if let Some(data) = data.as_mut() { - if let Some(encoding) = args.encoding { - let (encoder, decoder) = Self::find_coder( - &data.buffer, - encoding.as_str(), - &data.errors, - data.newline, - vm, - )?; - data.encoding = encoding; - data.encoder = encoder; - data.decoder = decoder; - } - if let Some(errors) = args.errors { - data.errors = errors; + let mut data = self.lock(vm)?; + data.check_closed(vm)?; + + let mut encoding = data.encoding.clone(); + let mut errors = data.errors.clone(); + let mut newline = data.newline; + let mut encoding_changed = false; + let mut errors_changed = false; + let mut newline_changed = false; + let mut line_buffering = None; + let mut write_through = None; + let mut flush_on_reconfigure = false; + + if let Some(enc) = args.encoding { + if enc.as_str().contains('\0') && enc.as_str().starts_with("locale") { + return Err(vm.new_lookup_error(format!("unknown encoding: {enc}"))); } - if let Some(newline) = args.newline { - data.newline = newline; + let resolved = Self::resolve_encoding(Some(enc), vm)?; + encoding_changed = resolved.as_str() != encoding.as_str(); + encoding = resolved; + } + + if let Some(errs) = args.errors { + Self::validate_errors(&errs, vm)?; + errors_changed = errs.as_str() != errors.as_str(); + errors = errs; + } else if encoding_changed { + errors = identifier!(vm, strict).to_owned(); + errors_changed = true; + } + + if let OptionalArg::Present(nl) = args.newline { + let nl = nl.unwrap_or_default(); + newline_changed = nl != newline; + newline = nl; + } + + if let OptionalArg::Present(Some(value)) = args.line_buffering { + flush_on_reconfigure = true; + line_buffering = Some(Self::bool_from_index(value, vm)?); + } + if let OptionalArg::Present(Some(value)) = args.write_through { + 
flush_on_reconfigure = true; + write_through = Some(Self::bool_from_index(value, vm)?); + } + + if (encoding_changed || newline_changed) + && data.decoder.is_some() + && (data.decoded_chars.is_some() + || data.snapshot.is_some() + || data.decoded_chars_used.chars != 0) + { + return Err(new_unsupported_operation( + vm, + "cannot reconfigure encoding or newline after reading from the stream" + .to_owned(), + )); + } + + if flush_on_reconfigure { + if data.pending.num_bytes > 0 { + data.write_pending(vm)?; } - if let Some(line_buffering) = args.line_buffering { - data.line_buffering = line_buffering; + vm.call_method(&data.buffer, "flush", ())?; + } + + if encoding_changed || errors_changed || newline_changed { + if data.pending.num_bytes > 0 { + data.write_pending(vm)?; } - if let Some(write_through) = args.write_through { - data.write_through = write_through; + let (encoder, decoder) = + Self::find_coder(&data.buffer, encoding.as_str(), &errors, newline, vm)?; + data.encoding = encoding; + data.errors = errors; + data.newline = newline; + data.encoder = encoder; + data.decoder = decoder; + data.set_decoded_chars(None); + data.snapshot = None; + data.decoded_chars_used = Utf8size::default(); + if let Some((encoder, _)) = &data.encoder { + Self::adjust_encoder_state_for_bom( + encoder, + data.encoding.as_str(), + &data.buffer, + vm, + )?; } } + + if let Some(line_buffering) = line_buffering { + data.line_buffering = line_buffering; + } + if let Some(write_through) = write_through { + data.write_through = write_through; + } Ok(()) } @@ -3197,12 +3492,34 @@ mod _io { } })? 
}; - if textio.pending.num_bytes + chunk.as_bytes().len() > textio.chunk_size { - textio.write_pending(vm)?; + if textio.pending.num_bytes > 0 + && textio.pending.num_bytes + chunk.as_bytes().len() > textio.chunk_size + { + let buffer = textio.buffer.clone(); + let pending = textio.pending.take(vm); + drop(textio); + vm.call_method(&buffer, "write", (pending,))?; + textio = self.lock(vm)?; + textio.check_closed(vm)?; + if textio.pending.num_bytes > 0 { + let buffer = textio.buffer.clone(); + let pending = textio.pending.take(vm); + drop(textio); + vm.call_method(&buffer, "write", (pending,))?; + textio = self.lock(vm)?; + textio.check_closed(vm)?; + } } textio.pending.push(chunk); - if flush || textio.write_through || textio.pending.num_bytes >= textio.chunk_size { - textio.write_pending(vm)?; + if textio.pending.num_bytes > 0 + && (flush || textio.write_through || textio.pending.num_bytes >= textio.chunk_size) + { + let buffer = textio.buffer.clone(); + let pending = textio.pending.take(vm); + drop(textio); + vm.call_method(&buffer, "write", (pending,))?; + textio = self.lock(vm)?; + textio.check_closed(vm)?; } if flush { let _ = vm.call_method(&textio.buffer, "flush", ()); @@ -3418,6 +3735,10 @@ mod _io { if file_closed(&buffer, vm)? 
{ return Ok(()); } + if zelf.finalizing.load(Ordering::Relaxed) { + // _dealloc_warn: delegate to buffer._dealloc_warn(source) + let _ = vm.call_method(&buffer, "_dealloc_warn", (zelf.as_object().to_owned(),)); + } let flush_res = vm.call_method(zelf.as_object(), "flush", ()).map(drop); let close_res = vm.call_method(&buffer, "close", ()).map(drop); exception_chain(flush_res, close_res) @@ -3438,6 +3759,17 @@ mod _io { fn __getstate__(zelf: PyObjectRef, vm: &VirtualMachine) -> PyResult { Err(vm.new_type_error(format!("cannot pickle '{}' instances", zelf.class().name()))) } + + #[pymethod] + fn __reduce_ex__(zelf: PyObjectRef, proto: usize, vm: &VirtualMachine) -> PyResult { + if zelf.class().is(TextIOWrapper::static_type()) { + return Err( + vm.new_type_error(format!("cannot pickle '{}' object", zelf.class().name())) + ); + } + let _ = proto; + reduce_ex_for_subclass(zelf, vm) + } } fn parse_decoder_state(state: PyObjectRef, vm: &VirtualMachine) -> PyResult<(PyBytesRef, i32)> { @@ -3595,7 +3927,10 @@ mod _io { impl Destructor for TextIOWrapper { fn slot_del(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<()> { - let _ = vm.call_method(zelf, "close", ()); + if let Some(wrapper) = zelf.downcast_ref::() { + wrapper.finalizing.store(true, Ordering::Relaxed); + } + iobase_finalize(zelf, vm); Ok(()) } @@ -3609,6 +3944,11 @@ mod _io { #[inline] fn repr_str(zelf: &Py, vm: &VirtualMachine) -> PyResult { let type_name = zelf.class().slot_name(); + let Some(_guard) = ReprGuard::enter(vm, zelf.as_object()) else { + return Err( + vm.new_runtime_error(format!("reentrant call inside {type_name}.__repr__")) + ); + }; let Some(data) = zelf.data.lock() else { // Reentrant call return Ok(format!("<{type_name}>")); @@ -3620,17 +3960,22 @@ mod _io { let mut result = format!("<{type_name}"); // Add name if present - if let Ok(Some(name)) = vm.get_attribute_opt(data.buffer.clone(), "name") - && let Ok(name_repr) = name.repr(vm) - { + if let Ok(Some(name)) = 
vm.get_attribute_opt(data.buffer.clone(), "name") { + let name_repr = name.repr(vm)?; result.push_str(" name="); result.push_str(name_repr.as_str()); } - // Add mode if present - if let Ok(Some(mode)) = vm.get_attribute_opt(data.buffer.clone(), "mode") - && let Ok(mode_repr) = mode.repr(vm) - { + // Add mode if present (prefer the wrapper's attribute) + let mode_obj = match vm.get_attribute_opt(zelf.as_object().to_owned(), "mode") { + Ok(Some(mode)) => Some(mode), + Ok(None) | Err(_) => match vm.get_attribute_opt(data.buffer.clone(), "mode") { + Ok(Some(mode)) => Some(mode), + _ => None, + }, + }; + if let Some(mode) = mode_obj { + let mode_repr = mode.repr(vm)?; result.push_str(" mode="); result.push_str(mode_repr.as_str()); } @@ -4266,10 +4611,8 @@ mod _io { } #[pymethod] - fn close(&self, vm: &VirtualMachine) -> PyResult<()> { - drop(self.try_resizable(vm)?); + fn close(&self) { self.closed.store(true); - Ok(()) } #[pymethod] @@ -4614,7 +4957,10 @@ mod _io { if buffering == 0 { let ret = match mode.encode { - EncodeMode::Text => Err(vm.new_value_error("can't have unbuffered text I/O")), + EncodeMode::Text => { + let _ = vm.call_method(&raw, "close", ()); + Err(vm.new_value_error("can't have unbuffered text I/O")) + } EncodeMode::Bytes => Ok(raw), }; return ret; @@ -4631,19 +4977,29 @@ mod _io { match mode.encode { EncodeMode::Text => { + let encoding = match opts.encoding { + Some(enc) => Some(enc), + None => { + let encoding = text_encoding(vm.ctx.none(), OptionalArg::Present(2), vm)?; + Some(PyUtf8StrRef::try_from_object(vm, encoding.into())?) 
+ } + }; let tio = TextIOWrapper::static_type(); let wrapper = PyType::call( tio, ( - buffered, - opts.encoding, + buffered.clone(), + encoding, opts.errors, opts.newline, line_buffering, ) .into_args(vm), vm, - )?; + ) + .inspect_err(|_err| { + let _ = vm.call_method(&buffered, "close", ()); + })?; wrapper.set_attr("mode", vm.new_pyobj(mode_string), vm)?; Ok(wrapper) } @@ -4677,12 +5033,35 @@ mod _io { #[pyfunction] fn text_encoding( encoding: PyObjectRef, - _stacklevel: OptionalArg, + stacklevel: OptionalArg, vm: &VirtualMachine, ) -> PyResult { if vm.is_none(&encoding) { - // TODO: This is `locale` encoding - but we don't have locale encoding yet - return Ok(vm.ctx.new_str("utf-8")); + let encoding = if vm.state.config.settings.utf8_mode > 0 { + "utf-8" + } else { + "locale" + }; + if vm.state.config.settings.warn_default_encoding { + let mut stacklevel = stacklevel.unwrap_or(2); + if stacklevel > 1 + && let Some(frame) = vm.current_frame() + && let Some(stdlib_dir) = vm.state.config.paths.stdlib_dir.as_deref() + { + let path = frame.code.source_path.as_str(); + if !path.starts_with(stdlib_dir) { + stacklevel = stacklevel.saturating_sub(1); + } + } + let stacklevel = usize::try_from(stacklevel).unwrap_or(0); + crate::stdlib::warnings::warn( + vm.ctx.exceptions.encoding_warning, + "'encoding' argument not specified".to_owned(), + stacklevel, + vm, + )?; + } + return Ok(vm.ctx.new_str(encoding)); } encoding.try_into_value(vm) } @@ -4745,7 +5124,7 @@ mod _io { #[cfg(any(not(target_arch = "wasm32"), target_os = "wasi"))] #[pymodule] mod fileio { - use super::{_io::*, Offset}; + use super::{_io::*, Offset, iobase_finalize}; use crate::{ AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromObject, VirtualMachine, @@ -4872,6 +5251,7 @@ mod fileio { mode: AtomicCell, seekable: AtomicCell>, blksize: AtomicCell, + finalizing: AtomicCell, } #[derive(FromArgs)] @@ -4895,6 +5275,7 @@ mod fileio { mode: AtomicCell::new(Mode::empty()), seekable: 
AtomicCell::new(None), blksize: AtomicCell::new(8 * 1024), // DEFAULT_BUFFER_SIZE + finalizing: AtomicCell::new(false), } } } @@ -4978,6 +5359,12 @@ mod fileio { #[cfg(windows)] { if let Err(err) = fd_fstat { + // If the fd is invalid, prevent destructor from trying to close it + if err.raw_os_error() + == Some(windows_sys::Win32::Foundation::ERROR_INVALID_HANDLE as i32) + { + zelf.fd.store(-1); + } return Err(OSErrorBuilder::with_filename(&err, filename, vm)); } } @@ -5001,10 +5388,8 @@ mod fileio { } Err(err) => { if err.raw_os_error() == Some(libc::EBADF) { - // If fd was passed by user, don't close it on error - if !fd_is_own { - zelf.fd.store(-1); - } + // fd is invalid, prevent destructor from trying to close it + zelf.fd.store(-1); return Err(OSErrorBuilder::with_filename(&err, filename, vm)); } } @@ -5267,12 +5652,26 @@ mod fileio { zelf.fd.store(-1); return res; } + let flush_exc = res.err(); + if zelf.finalizing.load() { + Self::dealloc_warn(zelf, zelf.as_object().to_owned(), vm); + } let fd = zelf.fd.swap(-1); - if fd >= 0 { + let close_err = if fd >= 0 { crt_fd::close(unsafe { crt_fd::Owned::from_raw(fd) }) - .map_err(|err| Self::io_error(zelf, err, vm))?; + .map_err(|err| Self::io_error(zelf, err, vm)) + .err() + } else { + None + }; + match (flush_exc, close_err) { + (Some(fe), Some(ce)) => { + ce.set___context__(Some(fe)); + Err(ce) + } + (Some(e), None) | (None, Some(e)) => Err(e), + (None, None) => Ok(()), } - res } #[pymethod] @@ -5326,11 +5725,45 @@ mod fileio { fn __getstate__(zelf: PyObjectRef, vm: &VirtualMachine) -> PyResult { Err(vm.new_type_error(format!("cannot pickle '{}' instances", zelf.class().name()))) } + + /// fileio_dealloc_warn in Modules/_io/fileio.c + #[pymethod(name = "_dealloc_warn")] + fn _dealloc_warn_method( + zelf: &Py, + source: PyObjectRef, + vm: &VirtualMachine, + ) -> PyResult<()> { + Self::dealloc_warn(zelf, source, vm); + Ok(()) + } + } + + impl FileIO { + /// Issue ResourceWarning if fd is still open and closefd is 
true. + fn dealloc_warn(zelf: &Py, source: PyObjectRef, vm: &VirtualMachine) { + if zelf.fd.load() >= 0 && zelf.closefd.load() { + let repr = source + .repr(vm) + .map(|s| s.as_str().to_owned()) + .unwrap_or_else(|_| "".to_owned()); + if let Err(e) = crate::stdlib::warnings::warn( + vm.ctx.exceptions.resource_warning, + format!("unclosed file {repr}"), + 1, + vm, + ) { + vm.run_unraisable(e, None, zelf.as_object().to_owned()); + } + } + } } impl Destructor for FileIO { fn slot_del(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<()> { - let _ = vm.call_method(zelf, "close", ()); + if let Some(fileio) = zelf.downcast_ref::() { + fileio.finalizing.store(true); + } + iobase_finalize(zelf, vm); Ok(()) } diff --git a/crates/vm/src/vm/mod.rs b/crates/vm/src/vm/mod.rs index 5ea333a5760..1abfa20054b 100644 --- a/crates/vm/src/vm/mod.rs +++ b/crates/vm/src/vm/mod.rs @@ -758,7 +758,7 @@ impl VirtualMachine { } } - /// Phase 4: Clear module dicts. + /// Phase 4: Clear module dicts in reverse import order using 2-pass algorithm. /// Without GC, only clear __main__ — other modules' __del__ handlers /// need their globals intact. CPython can clear ALL module dicts because /// _PyGC_CollectNoFail() finalizes cycle-participating objects beforehand. 
diff --git a/src/settings.rs b/src/settings.rs index 1847e22c2d4..059216e5f92 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -269,6 +269,21 @@ pub fn parse_opts() -> Result<(Settings, RunMode), lexopt::Error> { "dev" => settings.dev_mode = true, "faulthandler" => settings.faulthandler = true, "warn_default_encoding" => settings.warn_default_encoding = true, + "utf8" => { + settings.utf8_mode = match value { + None => 1, + Some("1") => 1, + Some("0") => 0, + _ => { + error!( + "Fatal Python error: config_init_utf8_mode: \ + -X utf8=n: n is missing or invalid\n\ + Python runtime state: preinitialized" + ); + std::process::exit(1); + } + }; + } "no_sig_int" => settings.install_signal_handlers = false, "no_debug_ranges" => settings.code_debug_ranges = false, "int_max_str_digits" => {