From 67ab5ee51a599fee508c883f5b7f8d0b4b1749e4 Mon Sep 17 00:00:00 2001 From: arborelia Date: Sat, 26 Oct 2024 01:39:31 -0400 Subject: [PATCH 1/6] enable more Ruff code suggestions --- ftfy/__init__.py | 2 +- ftfy/chardata.py | 6 ++++-- pyproject.toml | 9 +++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ftfy/__init__.py b/ftfy/__init__.py index cc0a120..880fb05 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -531,7 +531,7 @@ def _fix_encoding_one_step_and_explain( decoding = "utf-8-variants" decode_step = ExplanationStep("decode", decoding) - steps = [encode_step] + transcode_steps + [decode_step] + steps = [encode_step, *transcode_steps, decode_step] fixed = encoded_bytes.decode(decoding) return ExplainedText(fixed, steps) diff --git a/ftfy/chardata.py b/ftfy/chardata.py index afcc767..ea199d0 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -43,7 +43,7 @@ def _build_regexes() -> dict[str, re.Pattern[str]]: # Make a sequence of characters that bytes \x80 to \xFF decode to # in each encoding, as well as byte \x1A, which is used to represent # the replacement character � in the sloppy-* encodings. 
- byte_range = bytes(list(range(0x80, 0x100)) + [0x1A]) + byte_range = bytes([*range(0x80, 0x100), 0x1A]) charlist = byte_range.decode(encoding) # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B @@ -686,6 +686,8 @@ def _build_width_map() -> dict[int, str]: | [{utf8_first_of_4}] [{utf8_continuation}]{{3}} )+ - """.format(**UTF8_CLUES), + """.format( + **UTF8_CLUES + ), re.VERBOSE, ) diff --git a/pyproject.toml b/pyproject.toml index 9588f31..7b65007 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,8 +40,13 @@ line-length = 100 target-version = "py39" [tool.ruff.lint] -select = ["B", "F", "I", "N", "ANN", "UP"] -ignore = ["ANN101", "ANN401"] +select = ["B", "F", "I", "N", "ANN", "UP", "RUF"] +ignore = [ + "ANN101", + "ANN401", + "RUF001", # complains about Unicode characters that belong in my docstrings + "RUF002", # complains about Unicode characters that belong in my docstrings +] [tool.ruff.lint.per-file-ignores] "tests/*" = ["ANN"] From 4ecbb33eda53b67426b6f712334f23b899faf20e Mon Sep 17 00:00:00 2001 From: arborelia Date: Sat, 26 Oct 2024 01:42:59 -0400 Subject: [PATCH 2/6] mypy checks and formatting --- ftfy/__init__.py | 20 +++++--------------- ftfy/bad_codecs/sloppy.py | 2 +- ftfy/bad_codecs/utf8_variants.py | 2 +- ftfy/chardata.py | 4 +--- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/ftfy/__init__.py b/ftfy/__init__.py index 880fb05..4d48fba 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -227,9 +227,7 @@ class TextFixerConfig(NamedTuple): explain: bool = True -def _config_from_kwargs( - config: TextFixerConfig, kwargs: dict[str, Any] -) -> TextFixerConfig: +def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> TextFixerConfig: """ Handle parameters provided as keyword arguments to ftfy's top-level functions, converting them into a TextFixerConfig. 
@@ -465,9 +463,7 @@ def fix_encoding_and_explain( return ExplainedText(text, plan_so_far) -def _fix_encoding_one_step_and_explain( - text: str, config: TextFixerConfig -) -> ExplainedText: +def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> ExplainedText: """ Perform one step of fixing the encoding of text. """ @@ -513,9 +509,7 @@ def _fix_encoding_one_step_and_explain( ): replaced_bytes = fixes.restore_byte_a0(encoded_bytes) if replaced_bytes != encoded_bytes: - transcode_steps.append( - ExplanationStep("transcode", "restore_byte_a0") - ) + transcode_steps.append(ExplanationStep("transcode", "restore_byte_a0")) encoded_bytes = replaced_bytes # Replace sequences where information has been lost @@ -583,9 +577,7 @@ def _fix_encoding_one_step_and_explain( return ExplainedText(text, []) -def fix_encoding( - text: str, config: TextFixerConfig | None = None, **kwargs: Any -) -> str: +def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: """ Apply just the encoding-fixing steps of ftfy to this text. Returns the fixed text, discarding the explanation. @@ -606,9 +598,7 @@ def fix_encoding( ftfy = fix_text -def fix_text_segment( - text: str, config: TextFixerConfig | None = None, **kwargs: Any -) -> str: +def fix_text_segment(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: """ Fix text as a single segment, with a consistent sequence of steps that are applied to fix the text. Discard the explanation. 
diff --git a/ftfy/bad_codecs/sloppy.py b/ftfy/bad_codecs/sloppy.py index 656f01c..8c65e4f 100644 --- a/ftfy/bad_codecs/sloppy.py +++ b/ftfy/bad_codecs/sloppy.py @@ -143,7 +143,7 @@ class StreamReader(Codec, codecs.StreamReader): return codecs.CodecInfo( name="sloppy-" + encoding, encode=Codec().encode, - decode=Codec().decode, + decode=Codec().decode, # type: ignore[arg-type] incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, diff --git a/ftfy/bad_codecs/utf8_variants.py b/ftfy/bad_codecs/utf8_variants.py index c15a3cf..57807cc 100644 --- a/ftfy/bad_codecs/utf8_variants.py +++ b/ftfy/bad_codecs/utf8_variants.py @@ -248,7 +248,7 @@ def decode(input: bytes, errors: str = "strict") -> tuple[str, int]: CODEC_INFO = codecs.CodecInfo( name=NAME, encode=StreamWriter.encode, - decode=StreamReader.decode, + decode=StreamReader.decode, # type: ignore[arg-type] incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, diff --git a/ftfy/chardata.py b/ftfy/chardata.py index ea199d0..43d117c 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -686,8 +686,6 @@ def _build_width_map() -> dict[int, str]: | [{utf8_first_of_4}] [{utf8_continuation}]{{3}} )+ - """.format( - **UTF8_CLUES - ), + """.format(**UTF8_CLUES), re.VERBOSE, ) From c033c1ace1e92c934f98d59b6e454a663ba683ca Mon Sep 17 00:00:00 2001 From: arborelia Date: Sat, 26 Oct 2024 02:10:44 -0400 Subject: [PATCH 3/6] incorporate some ruff code suggestions --- ftfy/__init__.py | 19 ++++++---- ftfy/bad_codecs/utf8_variants.py | 60 ++++++++++++++------------------ ftfy/cli.py | 10 +++--- ftfy/formatting.py | 9 +++-- pyproject.toml | 3 +- scripts/char_data_table.py | 8 ++--- tests/test_cli.py | 7 ++-- tests/test_examples_in_json.py | 8 ++--- 8 files changed, 61 insertions(+), 63 deletions(-) diff --git a/ftfy/__init__.py b/ftfy/__init__.py index 4d48fba..fb66698 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -9,8 +9,8 
@@ import unicodedata import warnings -from collections.abc import Iterator from typing import ( + TYPE_CHECKING, Any, BinaryIO, Callable, @@ -24,6 +24,9 @@ from ftfy.badness import is_bad from ftfy.formatting import display_ljust +if TYPE_CHECKING: + from collections.abc import Iterator + __version__ = "6.3.1" @@ -241,8 +244,7 @@ def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> Text kwargs = kwargs.copy() kwargs["unescape_html"] = kwargs["fix_entities"] del kwargs["fix_entities"] - config = config._replace(**kwargs) - return config + return config._replace(**kwargs) BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode. @@ -669,12 +671,13 @@ def guess_bytes(bstring: bytes) -> tuple[str, str]: single-byte encoding. """ if isinstance(bstring, str): - raise UnicodeError( + msg = ( "This string was already decoded as Unicode. You should pass " "bytes to guess_bytes, not Unicode." ) + raise UnicodeError(msg) - if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"): + if bstring.startswith((b"\xfe\xff", b"\xff\xfe")): return bstring.decode("utf-16"), "utf-16" byteset = set(bstring) @@ -748,9 +751,11 @@ def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: if encoding in FIXERS: obj = FIXERS[encoding](obj) else: - raise ValueError(f"Unknown function to apply: {encoding}") + msg = f"Unknown function to apply: {encoding}" + raise ValueError(msg) else: - raise ValueError(f"Unknown plan step: {operation}") + msg = f"Unknown plan step: {operation}" + raise ValueError(msg) return obj diff --git a/ftfy/bad_codecs/utf8_variants.py b/ftfy/bad_codecs/utf8_variants.py index 57807cc..eaac3c1 100644 --- a/ftfy/bad_codecs/utf8_variants.py +++ b/ftfy/bad_codecs/utf8_variants.py @@ -166,17 +166,14 @@ def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> tup if len(input) > 1: # Decode the two-byte sequence 0xc0 0x80. return "\u0000", 2 - else: - if final: - # We hit the end of the stream. 
Let the superclass method - # handle it. - return sup(input, errors, True) - else: - # Wait to see another byte. - return "", 0 - else: - # Decode a possible six-byte sequence starting with 0xed. - return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final) + if final: + # We hit the end of the stream. Let the superclass method + # handle it. + return sup(input, errors, True) + # Wait to see another byte. + return "", 0 + # Decode a possible six-byte sequence starting with 0xed. + return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final) @staticmethod def _buffer_decode_surrogates( @@ -205,28 +202,25 @@ def _buffer_decode_surrogates( # handle it as normal UTF-8. It might be a Hangul character # or an error. return sup(input, errors, final) - else: - # We found a surrogate, the stream isn't over yet, and we don't - # know enough of the following bytes to decode anything, so - # consume zero bytes and wait. - return "", 0 - else: - if CESU8_RE.match(input): - # Given this is a CESU-8 sequence, do some math to pull out - # the intended 20-bit value, and consume six bytes. - codepoint = ( - ((input[1] & 0x0F) << 16) - + ((input[2] & 0x3F) << 10) - + ((input[4] & 0x0F) << 6) - + (input[5] & 0x3F) - + 0x10000 - ) - return chr(codepoint), 6 - else: - # This looked like a CESU-8 sequence, but it wasn't one. - # 0xed indicates the start of a three-byte sequence, so give - # three bytes to the superclass to decode as usual. - return sup(input[:3], errors, False) + # We found a surrogate, the stream isn't over yet, and we don't + # know enough of the following bytes to decode anything, so + # consume zero bytes and wait. + return "", 0 + if CESU8_RE.match(input): + # Given this is a CESU-8 sequence, do some math to pull out + # the intended 20-bit value, and consume six bytes. 
+ codepoint = ( + ((input[1] & 0x0F) << 16) + + ((input[2] & 0x3F) << 10) + + ((input[4] & 0x0F) << 6) + + (input[5] & 0x3F) + + 0x10000 + ) + return chr(codepoint), 6 + # This looked like a CESU-8 sequence, but it wasn't one. + # 0xed indicates the start of a three-byte sequence, so give + # three bytes to the superclass to decode as usual. + return sup(input[:3], errors, False) # The encoder is identical to UTF-8. diff --git a/ftfy/cli.py b/ftfy/cli.py index 2807a86..16f3296 100644 --- a/ftfy/cli.py +++ b/ftfy/cli.py @@ -4,6 +4,7 @@ import os import sys +from pathlib import Path from typing import Union from ftfy import TextFixerConfig, __version__, fix_file @@ -101,7 +102,7 @@ def main() -> None: # whatever encoding is necessary. file = sys.stdin.buffer else: - file = open(args.filename, "rb") + file = Path(args.filename).open("rb") if args.output == "-": outfile = sys.stdout @@ -109,17 +110,14 @@ def main() -> None: if os.path.realpath(args.output) == os.path.realpath(args.filename): sys.stderr.write(SAME_FILE_ERROR_TEXT) sys.exit(1) - outfile = open(args.output, "w", encoding="utf-8") + outfile = Path(args.output).open("w", encoding="utf-8") normalization = args.normalization if normalization.lower() == "none": normalization = None unescape_html: Union[str, bool] - if args.preserve_entities: - unescape_html = False - else: - unescape_html = "auto" + unescape_html = False if args.preserve_entities else "auto" config = TextFixerConfig(unescape_html=unescape_html, normalization=normalization) diff --git a/ftfy/formatting.py b/ftfy/formatting.py index 18df64b..4295558 100644 --- a/ftfy/formatting.py +++ b/ftfy/formatting.py @@ -99,7 +99,8 @@ def display_ljust(text: str, width: int, fillchar: str = " ") -> str: correct if you're viewing this code or documentation in a Web browser. 
""" if character_width(fillchar) != 1: - raise ValueError("The padding character must have display width 1") + msg = "The padding character must have display width 1" + raise ValueError(msg) text_width = monospaced_width(text) if text_width == -1: @@ -129,7 +130,8 @@ def display_rjust(text: str, width: int, fillchar: str = " ") -> str: ▒▒▒▒▒▒▒▒ちゃぶ台返し """ if character_width(fillchar) != 1: - raise ValueError("The padding character must have display width 1") + msg = "The padding character must have display width 1" + raise ValueError(msg) text_width = monospaced_width(text) if text_width == -1: @@ -154,7 +156,8 @@ def display_center(text: str, width: int, fillchar: str = " ") -> str: ▒▒▒▒ちゃぶ台返し▒▒▒▒ """ if character_width(fillchar) != 1: - raise ValueError("The padding character must have display width 1") + msg = "The padding character must have display width 1" + raise ValueError(msg) text_width = monospaced_width(text) if text_width == -1: diff --git a/pyproject.toml b/pyproject.toml index 7b65007..13aeee9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,12 +40,13 @@ line-length = 100 target-version = "py39" [tool.ruff.lint] -select = ["B", "F", "I", "N", "ANN", "UP", "RUF"] +select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH"] ignore = [ "ANN101", "ANN401", "RUF001", # complains about Unicode characters that belong in my docstrings "RUF002", # complains about Unicode characters that belong in my docstrings + "PIE808", # explicitly starting ranges at 0 sometimes helps with readability ] [tool.ruff.lint.per-file-ignores] diff --git a/scripts/char_data_table.py b/scripts/char_data_table.py index 78a5957..d063d1a 100644 --- a/scripts/char_data_table.py +++ b/scripts/char_data_table.py @@ -17,8 +17,7 @@ class CharData: def sort_key(self) -> tuple[int, str, int]: if self.name.startswith("LATIN "): return (0, self.name, self.codept) - else: - return (1, "", self.codept) + return (1, "", self.codept) SAFE_ENCODINGS = [ @@ 
-56,10 +55,7 @@ def show_char_table(chars: str, byte_min: int = 0, byte_max: int = 0xFF) -> None if byte_min <= byte <= byte_max: info_str = f"{encoding}:{byte:X}" encoding_info.append(info_str) - if encoding_info: - encoding_explanation = encoding_info[0] - else: - encoding_explanation = "???" + encoding_explanation = encoding_info[0] if encoding_info else "???" print(f' "\\N{{{cd.name}}}" # {encoding_explanation}') diff --git a/tests/test_cli.py b/tests/test_cli.py index 0b3d107..a862e31 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,11 +1,12 @@ import os import subprocess +from pathlib import Path import pytest # Get the filename of 'face.txt', an example of mojibake -THIS_DIR = os.path.dirname(__file__) -TEST_FILENAME = os.path.join(THIS_DIR, "face.txt") +THIS_DIR = Path(__file__).parent +TEST_FILENAME = THIS_DIR / "face.txt" CORRECT_OUTPUT = os.linesep.join(["┒(⌣˛⌣)┎", ""]) FAILED_OUTPUT = os.linesep.join( [ @@ -61,6 +62,6 @@ def test_same_file(): def test_stdin(): - with open(TEST_FILENAME, "rb") as infile: + with TEST_FILENAME.open("rb") as infile: output = get_command_output(["ftfy"], stdin=infile) assert output == CORRECT_OUTPUT diff --git a/tests/test_examples_in_json.py b/tests/test_examples_in_json.py index cf99e27..83dcb8e 100644 --- a/tests/test_examples_in_json.py +++ b/tests/test_examples_in_json.py @@ -25,15 +25,15 @@ """ import json -import os +from pathlib import Path import pytest from ftfy import apply_plan, fix_and_explain, fix_encoding_and_explain, fix_text -THIS_DIR = os.path.dirname(__file__) -TEST_FILENAME = os.path.join(THIS_DIR, "test_cases.json") -TEST_DATA = json.load(open(TEST_FILENAME, encoding="utf-8")) +THIS_DIR = Path(__file__).parent +TEST_FILENAME = THIS_DIR / "test_cases.json" +TEST_DATA = json.load(TEST_FILENAME.open(encoding="utf-8")) TESTS_THAT_PASS = [test for test in TEST_DATA if test["expect"] == "pass"] TESTS_THAT_FAIL = [test for test in TEST_DATA if test["expect"] == "fail"] From 
8a55920c5eb1ea00c80ffd8ce107015ed1c0a80e Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Wed, 30 Oct 2024 16:43:44 -0400 Subject: [PATCH 4/6] reorganize test cases --- tests/test-cases/in-the-wild.json | 451 +++++++++++ tests/test-cases/known-failures.json | 70 ++ tests/test-cases/language-names.json | 127 +++ tests/test-cases/negative.json | 216 ++++++ tests/test-cases/synthetic.json | 208 +++++ tests/test_cases.json | 1061 -------------------------- 6 files changed, 1072 insertions(+), 1061 deletions(-) create mode 100644 tests/test-cases/in-the-wild.json create mode 100644 tests/test-cases/known-failures.json create mode 100644 tests/test-cases/language-names.json create mode 100644 tests/test-cases/negative.json create mode 100644 tests/test-cases/synthetic.json delete mode 100644 tests/test_cases.json diff --git a/tests/test-cases/in-the-wild.json b/tests/test-cases/in-the-wild.json new file mode 100644 index 0000000..b40c838 --- /dev/null +++ b/tests/test-cases/in-the-wild.json @@ -0,0 +1,451 @@ +[ + { + "label": "Low-codepoint emoji", + "comment": "From the ancient era before widespread emoji support on Twitter", + "original": "He's Justinâ\u009d¤", + "fixed": "He's Justin❤", + "expect": "pass" + }, + { + "label": "UTF-8 / MacRoman mix-up about smurfs", + "original": "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.", + "fixed": "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.", + "expect": "pass" + }, + { + "label": "Checkmark that almost looks okay as mojibake", + "original": "✔ No problems", + "fixed": "✔ No problems", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 Russian mixup about futbol", + "original": "РґРѕСЂРѕРіРµ Р\u0098Р·-РїРѕРґ #футбол", + "fixed": "дороге Из-под #футбол", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in German", + "original": "\u0084Handwerk bringt dich überall hin\u0093: Von der YOU bis nach 
Monaco", + "fixed-encoding": "„Handwerk bringt dich überall hin“: Von der YOU bis nach Monaco", + "fixed": "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup of the replacement character", + "original": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", + "fixed": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", + "expect": "pass" + }, + { + "label": "CESU-8 / Windows-1252 emoji", + "original": "Hi guys í ½í¸\u008d", + "fixed": "Hi guys 😍", + "expect": "pass" + }, + { + "label": "CESU-8 / Latin-1 emoji", + "original": "hihi RT username: â\u0098ºí ½í¸\u0098", + "fixed": "hihi RT username: ☺😘", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in Turkish", + "original": "Beta Haber: Hırsızı Büyü Korkuttu", + "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in İstanbul (issue #192)", + "original": "İstanbul", + "fixed": "İstanbul", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in German (issue #188)", + "original": "RUF MICH ZURÜCK", + "fixed": "RUF MICH ZURÜCK", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", + "original": "RÄ«ga", + "fixed": "Rīga", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 mixed up twice in Russian", + "original": "приятности. РІСњВ¤", + "fixed": "приятности. 
❤", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up twice in Malay", + "original": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â\u009d.", + "fixed-encoding": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romance”.", + "fixed": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up twice in naming Iggy Pop", + "original": "Iggy Pop (né Jim Osterberg)", + "fixed": "Iggy Pop (né Jim Osterberg)", + "expect": "pass" + }, + { + "label": "Left quote is UTF-8, right quote is Latin-1, both encoded in Windows-1252", + "original": "Direzione Pd, ok â\u0080\u009csenza modifiche\u0094 all'Italicum.", + "fixed-encoding": "Direzione Pd, ok “senza modifiche” all'Italicum.", + "fixed": "Direzione Pd, ok \"senza modifiche\" all'Italicum.", + "expect": "pass" + }, + { + "label": "UTF-8 / sloppy Windows-1252 mixed up twice in a triumphant emoticon", + "original": "selamat berpuasa sob (Ã\u00a0¸‡'̀⌣'ÃŒÂ\u0081)Ã\u00a0¸‡", + "fixed": "selamat berpuasa sob (ง'̀⌣'́)ง", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up three times", + "original": "The Mona Lisa doesn’t have eyebrows.", + "fixed-encoding": "The Mona Lisa doesn’t have eyebrows.", + "fixed": "The Mona Lisa doesn't have eyebrows.", + "expect": "pass" + }, + { + "label": "UTF-8 / Codepage 437 mixup in Russian", + "original": "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡", + "fixed": "#правильноепитание", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in French", + "original": "Hôtel de Police", + "fixed": "Hôtel de Police", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1250 mixup in French", + "original": "Liège Avenue de l'HĂ´pital", + "fixed": "Liège Avenue de l'Hôpital", + "expect": "pass" + }, + { + "label": 
"UTF-8 / Windows-1252 mixup in Vietnamese", + "original": "Tại sao giá hạt sầu riêng lại lên giá?", + "fixed": "Tại sao giá hạt sầu riêng lại lên giá?", + "expect": "pass" + }, + { + "label": "Science! Mid-word Greek letter gets fixed correctly", + "original": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", + "fixed": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", + "expect": "pass" + }, + { + "label": "For goodness' sake. We can come close to fixing this, but fail in the last step", + "original": "ItÃ?¢â?¬â?¢s classic. ItÃ?¢â?¬â?¢s epic. ItÃ?¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?¢â?¬â?¢ sake!", + "fixed": "It�¢��s classic. It�¢��s epic. It�¢��s ELIZABETH BENNET for goodness�¢�� sake!", + "expect": "pass" + }, + { + "label": "lossy UTF-8 / Windows-1250 mixup in Spanish", + "original": "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂ­a", + "fixed": "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía", + "expect": "pass" + }, + { + "label": "UTF-8 / sloppy Windows-1250 mixup in English", + "original": "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", + "expect": "pass" + }, + { + "label": "The same text as above, but as a UTF-8 / ISO-8859-2 mixup", + "original": "It was namedÂ\u00a0â\u0080\u009escars´ stonesâ\u0080\u009c after the rock-climbers who got hurt while climbing on it.", + "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing 
on it.", + "expect": "pass" + }, + { + "label": "UTF-8 / ISO-8859-2 mixup in Czech", + "comment": "This says 'I've had enough of the third millennium', which is great because it involves software decisions made in the second", + "original": "MĂĄm dost tĹ\u0099etĂ\u00adho tisĂ\u00adciletĂ\u00ad", + "fixed": "Mám dost třetího tisíciletí", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", + "comment": "A difficult test case that can depend on the order that steps are applied", + "original": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "fixed-encoding": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "fixed": "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "expect": "pass" + }, + { + "label": "UTF-8 / sloppy Windows-1250 mixup in Romanian", + "original": "vedere Ă®nceĹŁoĹźatÄ\u0083", + "fixed": "vedere înceţoşată", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1250 mixup in Slovak", + "original": "NapĂ\u00adšte nám !", + "fixed": "Napíšte nám !", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in Spanish", + "original": "DOS AÑOS", + "fixed": "DOS AÑOS", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 followed by UTF-8 / Windows-1251", + "original": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", + "fixed": "a 
bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", + "expect": "pass" + }, + { + "label": "fancy Unicode crossing-out, but mojibaked", + "original": "hotel $49 $̶6̶3̶ updated 2018", + "fixed": "hotel $49 $̶6̶3̶ updated 2018", + "expect": "pass" + }, + { + "label": "A face with UTF-8 / sloppy Windows-1252 mixed up twice", + "original": "ââ€\u009d’(⌣˛⌣)ââ€\u009dŽ", + "fixed": "┒(⌣˛⌣)┎", + "expect": "pass" + }, + { + "label": "We can mostly decode the face above when we lose the character U+009D", + "original": "ââ€�’(⌣˛⌣)ââ€�Ž", + "fixed": "�(⌣˛⌣)�", + "expect": "pass" + }, + { + "label": "Lossy decoding can have plain ASCII question marks, as well", + "original": "The ICR has been upgraded to “bb+â€? from “bbâ€?", + "fixed-encoding": "The ICR has been upgraded to “bb+� from “bb�", + "fixed": "The ICR has been upgraded to \"bb+� from \"bb�", + "expect": "pass" + }, + { + "label": "CESU-8 / Latin-1 mixup over several emoji", + "comment": "You tried", + "original": "I just figured out how to tweet emojis! â\u009a½í\u00a0½í¸\u0080í\u00a0½í¸\u0081í\u00a0½í¸\u0082í\u00a0½í¸\u0086í\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008e", + "fixed": "I just figured out how to tweet emojis! 
⚽😀😁😂😆😎😎😎😎", + "expect": "pass" + }, + { + "label": "An absolutely hopeless garble", + "comment": "If we try too hard to decode this, we'll recursively apply `decode_inconsistent_utf8` until the characters turn into random Han and katakana characters.", + "original": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", + "fixed-encoding": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", + "fixed": "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â", + "expect": "pass" + }, + { + "label": "Inconsistent UTF-8 / Latin-1 mojibake", + "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085", + "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", + "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", + "expect": "pass" + }, + { + "label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set", + "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…", + "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", + "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", + "expect": "pass" + }, + { + "label": "Inconsistent mojibake in Portuguese", + "original": "Campeonatos > III Divisão - Série F > Jornadas Classificação", + "fixed": "Campeonatos > III Divisão - Série F > Jornadas Classificação", + "expect": "pass" + }, + { + "label": "Handle Afrikaans 'n character", + "original": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", + "fixed-encoding": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", + "fixed": "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.", + "expect": "pass" + }, + { + 
"label": "Handle Croatian single-codepoint digraphs", + "original": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "fixed-encoding": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "fixed": "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "expect": "pass" + }, + { + "label": "A with an acute accent, in isolation", + "original": "Nicolás", + "fixed": "Nicolás", + "expect": "pass" + }, + { + "label": "sharp S, in isolation, via MacRoman encoding", + "comment": "regression reported in issue #186", + "original": "wei√ü", + "fixed": "weiß", + "expect": "pass" + }, + { + "label": "French example containing non-breaking spaces", + "original": "ART TRIP Ã\u00a0 l'office de tourisme", + "fixed": "ART TRIP à l'office de tourisme", + "expect": "pass" + }, + { + "label": "English example in UTF-8 / Windows-1251 with a ligature", + "original": "This is signiп¬Ѓcantly lower than the respective share", + "fixed-encoding": "This is significantly lower than the respective share", + "fixed": "This is significantly lower than the respective share", + "expect": "pass" + }, + { + "label": "'à' remains its own word, even if spaces after it get coalesced into one", + "original": "à perturber la réflexion des théologiens jusqu'à nos jours", + "fixed": "à perturber la réflexion des théologiens jusqu'à nos jours", + "expect": "pass" + }, + { + "label": "Fix 'à' in inconsistent mojibake", + "original": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", + "fixed-encoding": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", + "fixed": "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation", + "expect": "pass" + }, + { + "label": "The Portuguese word 'às' does not become 'à 
s' due to the French fix", + "original": "com especial atenção à s crianças", + "fixed": "com especial atenção às crianças", + "expect": "pass" + }, + { + "label": "This is why we require a space after the 's' in 'às'", + "original": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", + "fixed": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", + "expect": "pass" + }, + { + "label": "We can fix 'à' in windows-1251 sometimes as well", + "original": "La rГ©gion de Dnepropetrovsk se trouve Г l’ouest de l’Ukraine", + "fixed-encoding": "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine", + "fixed": "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine", + "expect": "pass" + }, + { + "label": "'à quele' is the Portuguese word 'àquele', not 'à quele'", + "original": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante à quele observado nas lesões por imunocomplexo em excesso de anticorpos", + "fixed": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos", + "expect": "pass" + }, + { + "label": "A complex, lossy pile-up of mojibake in Portuguese", + "original": "â € ðŸ“� Regulamento: â € âš ï¸� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. âš ï¸� Serão válidos os comentários postados até 16h, do dia 31/03/2018. 
E o resultado será divulgado até à s 19h do mesmo dia em uma nova publicação em nosso instagram. â € Boa sorte!!! 😀ðŸ�°", + "fixed": "⠀ �\u00a0Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!!\u00a0😀�", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in Gaelic involving non-breaking spaces", + "original": "CÃ\u00a0nan nan GÃ\u00a0idheal", + "fixed": "Cànan nan Gàidheal", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 mixup in tweet spam", + "original": "Blog Traffic Tip 2 – Broadcast Email Your Blog", + "fixed": "Blog Traffic Tip 2 – Broadcast Email Your Blog", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 mixup", + "original": "S&P Confirms Ukrsotsbank’s “B-“ Rating", + "fixed-encoding": "S&P Confirms Ukrsotsbank’s “B-“ Rating", + "fixed": "S&P Confirms Ukrsotsbank's \"B-\" Rating", + "expect": "pass" + }, + { + "label": "Dutch example with ë", + "comment": "from issue reported by MicroJackson", + "original": "ongeëvenaard", + "fixed-encoding": "ongeëvenaard", + "fixed": "ongeëvenaard", + "expect": "pass" + }, + { + "label": "HTML entity on top of UTF-8 / Latin-1", + "original": "10μs", + "fixed-encoding": "10μs", + "fixed": "10μs", + "expect": "pass" + }, + { + "label": "Three layers of UTF-8 / MacRoman mixup in French", + "comment": "You're welcome", + "original": "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8", + "fixed": "Merci de télécharger le plug-in Flash Player 8", + "expect": "pass" + }, + { + "label": "UTF-8 / MacRoman mixup in French", + "original": "Merci de 
bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ķ", + "fixed": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…", + "expect": "pass" + }, + { + "label": "Italian UTF-8 / MacRoman example with ò", + "original": "Le Vigne di Zam√≤", + "fixed": "Le Vigne di Zamò", + "expect": "pass" + }, + { + "label": "Punctuation pile-up should actually be musical notes", + "original": "Engkau masih yg terindah, indah di dalam hatiku♫~", + "fixed": "Engkau masih yg terindah, indah di dalam hatiku♫~", + "expect": "pass" + }, + { + "label": "Latvian UTF-8 / Windows-1257 mojibake", + "original": "Å veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", + "fixed": "Šveices baņķieri gaida konkrētus investīciju projektus", + "expect": "pass" + }, + { + "label": "Latvian UTF-8 / MacRoman mojibake", + "original": "SaeimƒÅ ievƒìlƒìtƒÅs partijas \"Progresƒ´vie\" lƒ´dzvadƒ´tƒÅja Anto≈Üina ≈Öena≈°eva atbild uz ≈æurnƒÅlistu jautƒÅjumiem pƒìc partijas tik≈°anƒÅs ar Valsts prezidentu Rƒ´gas pilƒ´,", + "fixed": "Saeimā ievēlētās partijas \"Progresīvie\" līdzvadītāja Antoņina Ņenaševa atbild uz žurnālistu jautājumiem pēc partijas tikšanās ar Valsts prezidentu Rīgas pilī,", + "expect": "pass" + }, + { + "label": "Lithuanian UTF-8 / Windows-1257 mojibake", + "original": "Å iaip ÄÆdomu, kaip ÄÆsivaizduoji. Visų pirma tam reikia laiko.", + "fixed": "Šiaip įdomu, kaip įsivaizduoji. 
Visų pirma tam reikia laiko.", + "expect": "pass" + }, + { + "label": "Lithuanian UTF-8 / Windows-1250 mojibake", + "original": "Lietuva pagrÄŻstai gali paklausti: Ĺ˝inoma, kad ne.", + "fixed": "Lietuva pagrįstai gali paklausti: Žinoma, kad ne.", + "expect": "pass" + }, + { + "label": "Hebrew UTF-8 / Windows-1252 mojibake", + "comment": "reported by SuperIRabbit as issue #158", + "original": "בהודעה", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Wide comma in UTF-8 / Windows-1252", + "original": "Ningbo,China", + "fixed-encoding": "Ningbo,China", + "fixed": "Ningbo,China", + "expect": "pass" + } +] \ No newline at end of file diff --git a/tests/test-cases/known-failures.json b/tests/test-cases/known-failures.json new file mode 100644 index 0000000..2663d9f --- /dev/null +++ b/tests/test-cases/known-failures.json @@ -0,0 +1,70 @@ +[ + { + "label": "Misleading mix-up in Spanish", + "comment": "The original text has mojibake, but the sequence 'á \u0093' can decode as U+1813 MONGOLIAN DIGIT THREE, when the whole string should really just decode as a Latin-1/Windows-1252 mixup", + "original": "tiene demora y está \u0093próximo a resolverse\u0094", + "fixed": "tiene demora y está \"próximo a resolverse\"", + "expect": "fail" + }, + { + "label": "Two levels of inconsistent mojibake", + "comment": "The en-dash was mojibaked in UTF-8 / Windows-1252 as three characters, two of which were mojibaked again as Windows-1252 / Latin-1, and the third of which was mojibaked as UTF-8 / Latin-1. 
Unfortunately, if we fix this, we leave ourselves room to greedily 'decode' random Han characters in complex Latin-alphabet mojibake", + "original": "Arsenal v Wolfsburg: pre-season friendly â\u0080â\u0080\u009c live!", + "fixed": "Arsenal v Wolfsburg: pre-season friendly – live!", + "expect": "fail" + }, + { + "label": "A-with-grave in Vietnamese", + "comment": "Currently adds extra spaces that shouldn't be there", + "original": "Xem clip hĂ i, phim hĂ i má»›i hay nhất", + "fixed": "Xem clip hài, phim hài mới hay nhất", + "expect": "fail" + }, + { + "label": "Latin-1 / MacRoman mixup in Spanish", + "comment": "Requires something like encoding detection", + "original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.", + "fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.", + "expect": "fail" + }, + { + "label": "subtle UTF-8 / codepage 437 mixup in Spanish", + "original": "┬┐que diferencia hay?", + "fixed": "¿que diferencia hay?", + "expect": "fail" + }, + { + "label": "Latin-1 / MacRoman mixup in Spanish, 2 characters", + "comment": "Requires something like encoding detection", + "original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente", + "fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente", + "expect": "fail" + }, + { + "label": "An example with 'à' in windows-1251 where we need our heuristic to be bolder", + "original": "faites attention Г bien vous renseigner avant sur le mГ©dicament", + "fixed": "faites attention à bien vous renseigner avant sur le médicament", + "expect": "fail" + }, + { + "label": "Italian UTF-8 / MacRoman mojibake that looks like math", + "comment": "False negative: 'pi√π' is a bit too reasonable to fix", + "original": "Sarai ricontattato dal nostro Esperto al pi√π presto.", + "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", + "expect": "fail" + }, + { + "label": "Synthetic: Incomplete UTF-8 / Windows-1252 mixup in 
Arabic", + "comment": "I find text like this in OSCAR a fair amount, but couldn't isolate a good example that tested digits. The intended text means 'more than 100 countries'.", + "original": "أكثر من Ù Ù Ù¡ بلد", + "fixed": "أكثر من ٠٠١ بلد", + "expect": "fail" + }, + { + "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", + "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. Searching for similar real text yields a lot of examples that actually come out fine.", + "original": "MISUTÂ\u00a0AJIKKO", + "fixed": "MISUTÂ\u00a0AJIKKO", + "expect": "fail" + } +] \ No newline at end of file diff --git a/tests/test-cases/language-names.json b/tests/test-cases/language-names.json new file mode 100644 index 0000000..cdb8241 --- /dev/null +++ b/tests/test-cases/language-names.json @@ -0,0 +1,127 @@ +[ + { + "label": "Messy language names: Czech", + "comment": "This and several following examples came from the same language selector", + "original": "ÄŒeÅ¡tina", + "fixed": "Čeština", + "expect": "pass" + }, + { + "label": "Messy language names: Gaelic", + "comment": "note that if U+A0 is replaced by a space, it comes out slightly incorrectly as 'Gà idhlig'", + "original": "GÃ\u00a0idhlig", + "fixed": "Gàidhlig", + "expect": "pass" + }, + { + "label": "Messy language names: Lithuanian", + "original": "Lietuvių", + "fixed": "Lietuvių", + "expect": "pass" + }, + { + "label": "Messy language names: Slovak", + "original": "SlovenÄ�ina", + "fixed": "Sloven�ina", + "expect": "pass" + }, + { + "label": "Messy language names: Vietnamese", + "original": "Tiếng Việt", + "fixed": "Tiếng Việt", + "expect": "pass" + }, + { + "label": "Messy language names: Greek", + "original": "Ελληνικά", + "fixed": "Ελληνικά", + "expect": "pass" + }, + { + "label": "Messy language names: Bulgarian", + "original": "българÑ�ки език", + "fixed": "българ�ки език", + "expect": "pass" + }, + { + 
"label": "Messy language names: Russian", + "original": "РуÑ�Ñ�кий", + "fixed": "Ру��кий", + "expect": "pass" + }, + { + "label": "Messy language names: Serbian [Cyrillic]", + "original": "CрпÑ�ки [ћирилицом]", + "fixed": "Cрп�ки [ћирилицом]", + "expect": "pass" + }, + { + "label": "Messy language names: Hebrew", + "original": "עברית", + "fixed": "עברית", + "expect": "pass" + }, + { + "label": "Messy language names: Russian", + "original": "РуÑ�Ñ�кий", + "fixed": "Ру��кий", + "expect": "pass" + }, + { + "label": "Messy language names: Hindi", + "comment": "My terminal has difficulty rendering the mostly-fixed text", + "original": "हिनà¥�दी", + "fixed": "\u0939\u093f\u0928\ufffd\u0926\u0940", + "expect": "pass" + }, + { + "label": "Messy language names: Tamil", + "comment": "My terminal has difficulty rendering the mostly-fixed text", + "original": "தமிழà¯�", + "fixed": "\u0ba4\u0bae\u0bbf\u0bb4\ufffd", + "expect": "pass" + }, + { + "label": "Messy language names: Thai", + "original": "ภาษาไทย", + "fixed": "ภาษาไทย", + "expect": "pass" + }, + { + "label": "Messy language names: Simplified Chinese", + "original": "简体ä¸\u00adæ–‡", + "fixed": "简体中文", + "expect": "pass" + }, + { + "label": "Messy language names: Traditional Chinese", + "original": "æ\u00ad£é«”ä¸\u00adæ–‡", + "fixed": "正體中文", + "expect": "pass" + }, + { + "label": "Messy language names: Japanese", + "original": "日本語", + "fixed": "日本語", + "expect": "pass" + }, + { + "label": "Messy language names: Korean", + "original": "한êµ\u00adì–´", + "fixed": "한국어", + "expect": "pass" + }, + { + "label": "Messy language name in cp437: Czech", + "comment": "A synthetic example, I suppose, but goes with the other language name tests", + "original": "─îe┼ítina", + "fixed": "Čeština", + "expect": "pass" + }, + { + "label": "Messy language name in cp437: Vietnamese", + "original": "Tiß║┐ng Viß╗çt", + "fixed": "Tiếng Việt", + "expect": "pass" + } +] \ No newline at end of file diff --git a/tests/test-cases/negative.json 
b/tests/test-cases/negative.json new file mode 100644 index 0000000..dc1e36b --- /dev/null +++ b/tests/test-cases/negative.json @@ -0,0 +1,216 @@ +[ + { + "label": "Negative: Using diaereses as quotation marks in Greek", + "comment": "Examples in this file might be detected as mojibake-like, but should not be changed", + "original": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", + "fixed": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", + "expect": "pass" + }, + { + "label": "Negative: Don't fix a multiplication symbol in quotes", + "original": "higher values (“+” and “×” curves) in the superficial region", + "fixed-encoding": "higher values (“+” and “×” curves) in the superficial region", + "fixed": "higher values (\"+\" and \"×\" curves) in the superficial region", + "expect": "pass" + }, + { + "label": "Sort of negative: this inconsistent mojibake could be Latin-1 or MacRoman, and it was meant to be Latin-1, but it's safest to not decode it as either", + "comment": "issue #202", + "original": "Bremer/Mccoy – DrÃ¥ber", + "fixed": "Bremer/Mccoy – DrÃ¥ber", + "expect": "pass" + }, + { + "label": "Negative: 'è' preceded by a non-breaking space is not a small capital Y", + "original": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", + "fixed": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", + "expect": "pass" + }, + { + "label": "Negative: multiplication sign and ellipsis", + "comment": "Should not turn into a dot below", + "original": "4288×…", + "fixed": "4288×…", + "expect": "pass" + }, + { + "label": "Negative: accents are sometimes used as quotes", + "comment": "Under a previous heuristic, this tested the CESU-8 decoder, which would try to decode it and fail when it hit the end of the string", + "original": "``toda produzida pronta pra assa aí´´", + "fixed": "``toda produzida pronta pra assa aí´´", + "expect": "pass" + }, + { + "label": "Negative: 'Õ' followed by an ellipsis", + 
"comment": "Should not turn into the Armenian letter Յ", + "original": "HUHLL Õ…", + "fixed": "HUHLL Õ…", + "expect": "pass" + }, + { + "label": "Negative: 'Ê' followed by an ellipsis", + "comment": "Should not turn into a squat reversed esh", + "original": "RETWEET SE VOCÊ…", + "fixed": "RETWEET SE VOCÊ…", + "expect": "pass" + }, + { + "label": "Negative: 'É' followed by an ellipsis", + "comment": "Should not turn into 'MARQUɅ'", + "original": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", + "fixed": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", + "expect": "pass" + }, + { + "label": "Negative: 'Ó' followed by an ellipsis", + "comment": "Should not turn into 'SӅ'", + "original": "TEM QUE SEGUIR, SDV SÓ…", + "fixed": "TEM QUE SEGUIR, SDV SÓ…", + "expect": "pass" + }, + { + "label": "Negative: 'É' followed by a curly apostrophe", + "comment": "Should not turn into 'ZZAJɒs'", + "original": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", + "fixed-encoding": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", + "fixed": "Join ZZAJÉ's Official Fan List and receive news, events, and more!", + "expect": "pass" + }, + { + "label": "Negative: 'é' preceded by curly apostrophe", + "comment": "Should not turn into 'LՎpisode'", + "original": "L’épisode 8 est trop fou ouahh", + "fixed-encoding": "L’épisode 8 est trop fou ouahh", + "fixed": "L'épisode 8 est trop fou ouahh", + "expect": "pass" + }, + { + "label": "Negative: three raised eyebrows or something?", + "comment": "Should not turn into private use character U+F659", + "original": "Ôôô VIDA MINHA", + "fixed": "Ôôô VIDA MINHA", + "expect": "pass" + }, + { + "label": "Negative: copyright sign preceded by non-breaking space", + "comment": "Should not turn into 'ʩ'", + "original": "[x]\u00a0©", + "fixed": "[x]\u00a0©", + "expect": "pass" + }, + { + "label": "Negative: en dash and infinity sign", + "comment": "Should not turn into '2012Ѱ'", + "original": "2012—∞", + "fixed": "2012—∞", + 
"expect": "pass" + }, + { + "label": "Negative: This Е is a Ukrainian letter, but nothing else is wrong", + "original": "SENSЕ - Oleg Tsedryk", + "fixed": "SENSЕ - Oleg Tsedryk", + "expect": "pass" + }, + { + "label": "Negative: angry face", + "comment": "The face should not turn into '`«'", + "original": "OK??:( `¬´ ):", + "fixed": "OK??:( `¬´ ):", + "expect": "pass" + }, + { + "label": "Negative, synthetic: face with glasses and a raised eyebrow", + "original": "( o¬ô )", + "fixed": "( o¬ô )", + "expect": "pass" + }, + { + "label": "Negative: triangle and degree sign", + "comment": "I'm not really sure what it *is* supposed to be, but it's not 'ơ'", + "original": "∆°", + "fixed": "∆°", + "expect": "pass" + }, + { + "label": "Negative: Portuguese with inverted question mark", + "comment": "Former false positive - it should not turn into 'QUEM ɿ'", + "original": "ESSE CARA AI QUEM É¿", + "fixed": "ESSE CARA AI QUEM É¿", + "expect": "pass" + }, + { + "label": "Negative: Portuguese with acute accents as quotation marks", + "comment": "Former false positive - the end should not turn into a superscript H", + "original": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", + "fixed": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", + "expect": "pass" + }, + { + "label": "Negative: Finnish Ä followed by a non-breaking space", + "comment": "Former false positive - should not become a G with a dot", + "original": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", + "fixed": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", + "expect": "pass" + }, + { + "label": "Negative: multiplying by currency", + "comment": "Former false positive - should not become the Hebrew letter 'final pe'", + "original": "Offering 5×£35 pin ups", + "fixed": "Offering 5×£35 pin ups", + "expect": "pass" + }, + { + "label": "Negative: registered chocolate brand name", + "comment": "Former false positive - should not become the IPA letter 'lezh'", + "original": "NESTLÉ® requiere contratar 
personal para diferentes areas a nivel nacional e internacional", + "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", + "expect": "pass" + }, + { + "label": "Negative: it looks like Windows-1257 mojibake but someone writes their name this way", + "comment": "Should not become a cedilla", + "original": "Connect with Āø on Facebook", + "fixed": "Connect with Āø on Facebook", + "expect": "pass" + }, + { + "label": "Mostly negative: we only need to fix C1 control characters", + "comment": "We should not decode 'é\u0085 ' as '酠'", + "original": "C'est vrai que nous n'en avons pas encore beaucoup parlé\u0085 Tu sais, ça fait de nombreuses années", + "fixed": "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années", + "expect": "pass" + }, + { + "label": "Negative: We don't fix à in all contexts", + "original": "C O N C L U S à O", + "fixed": "C O N C L U S à O", + "expect": "pass" + }, + { + "label": "Negative: Two concatenated strings", + "comment": "Should not turn into 'fratarak᧠141'", + "original": "Oborzos, per. Vahbarz, frataraká§ 141", + "fixed": "Oborzos, per. Vahbarz, frataraká§ 141", + "expect": "pass" + }, + { + "label": "Negative: Indonesian leetspeak", + "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", + "fixed": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", + "expect": "pass" + }, + { + "label": "Negative: math in Unicode", + "comment": "This isn't mojibake, it's an actual equation", + "original": "(-1/2)! = √π", + "fixed": "(-1/2)! 
= √π", + "expect": "pass" + }, + { + "label": "Negative: Leet line-art", + "comment": "The heuristic before v6 loved to 'fix' this and decode it as 'ôaſaſaſaſa'", + "original": "├┤a┼┐a┼┐a┼┐a┼┐a", + "fixed": "├┤a┼┐a┼┐a┼┐a┼┐a", + "expect": "pass" + } +] \ No newline at end of file diff --git a/tests/test-cases/synthetic.json b/tests/test-cases/synthetic.json new file mode 100644 index 0000000..a939311 --- /dev/null +++ b/tests/test-cases/synthetic.json @@ -0,0 +1,208 @@ +[ + { + "label": "Synthetic: we can recognize à in some cases when it's the only mojibake", + "comment": "Examples in this file were made up to test something, instead of found in the wild", + "original": "voilà le travail", + "fixed": "voilà le travail", + "expect": "pass" + }, + { + "label": "Synthetic: we can recognize à at the end of a word when it absorbs a following space", + "original": "voilà le travail", + "fixed": "voilà le travail", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", + "original": "בהודעה", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / MacRoman mojibake", + "original": "◊ë◊î◊ï◊ì◊¢◊î", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / Latin-1 mojibake", + "comment": "This example uses low-numbered codepoints to spell 'ABBA' in Hebrew, so that it falls into the range where Latin-1 is different from Windows-1252. 
As a bonus, this example looks right even if your RTL text rendering isn't working.", + "original": "×\u0090×\u0091×\u0091×\u0090", + "fixed": "אבבא", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / Windows-1252 mojibake", + "original": "رسالة", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / Windows-1250 mojibake", + "original": "رسالة", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / MacRoman mojibake", + "original": "ÿ±ÿ≥ÿߟÑÿ©", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Brontë's name does not end with a Korean syllable", + "comment": "The original example of why ftfy needs heuristics", + "original": "I'm not such a fan of Charlotte Brontë…”", + "fixed-encoding": "I'm not such a fan of Charlotte Brontë…”", + "fixed": "I'm not such a fan of Charlotte Brontë…\"", + "expect": "pass" + }, + { + "label": "Synthetic, negative: hypothetical Swedish product name", + "comment": "This used to be a constructed example of a false positive, until you added another symbol", + "original": "AHÅ™, the new sofa from IKEA", + "fixed": "AHÅ™, the new sofa from IKEA", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Ukrainian capital letters", + "comment": "We need to fix Windows-1251 conservatively, or else this decodes as '²ʲ'", + "original": "ВІКІ is Ukrainian for WIKI", + "fixed": "ВІКІ is Ukrainian for WIKI", + "expect": "pass" + }, + { + "label": "Synthetic, negative: don't leak our internal use of byte 0x1A", + "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", + "original": "These control characters \u001a are apparently intentional \u0081", + "fixed-encoding": "These control characters \u001a are apparently intentional \u0081", + "fixed": "These control characters are apparently intentional \u0081", + "expect": "pass" + }, + { + "label": "Synthetic, 
negative: U+1A on its own", + "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", + "original": "Here's a control character: \u001a", + "fixed-encoding": "Here's a control character: \u001a", + "fixed": "Here's a control character: ", + "expect": "pass" + }, + { + "label": "Synthetic, negative: A-with-circle as an Angstrom sign", + "comment": "Should not turn into '10 ŗ'", + "original": "a radius of 10 Å—", + "fixed": "a radius of 10 Å—", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Spanish with exclamation points on the wrong sides", + "original": "!YO SÉ¡", + "fixed": "!YO SÉ¡", + "expect": "pass" + }, + { + "label": "Synthetic: fix text with backslashes in it", + "comment": "Tests for a regression on a long-ago bug", + "original": "<40\\% vs \u00e2\u0089\u00a540\\%", + "fixed": "<40\\% vs ≥40\\%", + "expect": "pass" + }, + { + "label": "Synthetic: curly quotes with mismatched encoding glitches in Latin-1", + "original": "\u00e2\u0080\u009cmismatched quotes\u0085\u0094", + "fixed-encoding": "“mismatched quotes…”", + "fixed": "\"mismatched quotes…\"", + "expect": "pass" + }, + { + "label": "Synthetic: curly quotes with mismatched encoding glitches in Windows-1252", + "original": "“mismatched quotes…”", + "fixed-encoding": "“mismatched quotes…”", + "fixed": "\"mismatched quotes…\"", + "expect": "pass" + }, + { + "label": "Synthetic: lossy decoding in sloppy-windows-1252", + "original": "“lossy decodingâ€�", + "fixed-encoding": "“lossy decoding�", + "fixed": "\"lossy decoding�", + "expect": "pass" + }, + { + "label": "Synthetic: French word for August in windows-1252", + "original": "août", + "fixed-encoding": "août", + "fixed": "août", + "expect": "pass" + }, + { + "label": "Synthetic: French word for hotel in all-caps windows-1252", + "original": "HÔTEL", + "fixed-encoding": "HÔTEL", + "fixed": "HÔTEL", + "expect": "pass" + }, + { + "label": "Synthetic: Scottish 
Gaelic word for 'subject' in all-caps windows-1252", + "original": "CÙIS", + "fixed-encoding": "CÙIS", + "fixed": "CÙIS", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Romanian word before a non-breaking space", + "comment": "The word literally means 'not even once', which might be a good recommendation about fixing Romanian mojibake", + "original": "NICIODATĂ\u00a0", + "fixed": "NICIODATĂ\u00a0", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Be careful around curly apostrophes", + "comment": "It shouldn't end up saying 'a lot of Òs'", + "original": "There are a lot of Ã’s in mojibake text", + "fixed-encoding": "There are a lot of Ã’s in mojibake text", + "fixed": "There are a lot of Ã's in mojibake text", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Romanian word before a trademark sign", + "comment": "We would change 'DATÙ' to 'DATÙ' if it passed the badness heuristic", + "original": "NICIODATĂ™", + "fixed": "NICIODATĂ™", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Lithuanian word before a trademark sign", + "comment": "Similar to the above example. Shouldn't turn into U+0619 ARABIC SMALL DAMMA", + "original": "TRANSFORMATORIŲ™", + "fixed": "TRANSFORMATORIŲ™", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Norwegian capitalized nonsense", + "comment": "We're shouting that the island of Håøya is gullible. It should not turn into 'HŨYA ER BLŨYD'.", + "original": "HÅØYA ER BLÅØYD", + "fixed": "HÅØYA ER BLÅØYD", + "expect": "pass" + }, + { + "label": "Synthetic, negative: raised eyebrow kaomoji", + "original": "Ō¬o", + "fixed": "Ō¬o", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Camel-cased Serbian that looks like a UTF-8 / Windows-1251 mixup", + "comment": "I made this text up, but it seems like it means 'HelloDevil'. 
Could be a username or something.", + "original": "ПоздравЂаво", + "fixed": "ПоздравЂаво", + "expect": "pass" + }, + { + "label": "Synthetic: mojibake with trademark sign at the end of a word", + "comment": "I recall the correct version of this text from a sign in the movie Amélie. Now we can help her twin Amélie, who makes mojibaked signs.", + "original": "OÙ ET QUAND?", + "fixed": "OÙ ET QUAND?", + "expect": "pass" + } +] \ No newline at end of file diff --git a/tests/test_cases.json b/tests/test_cases.json deleted file mode 100644 index 005dab2..0000000 --- a/tests/test_cases.json +++ /dev/null @@ -1,1061 +0,0 @@ -[ - { - "label": "Messy language names: Czech", - "comment": "This and several following examples came from the same language selector", - "original": "ÄŒeÅ¡tina", - "fixed": "Čeština", - "expect": "pass" - }, - { - "label": "Messy language names: Gaelic", - "comment": "note that if U+A0 is replaced by a space, it comes out slightly incorrectly as 'Gà idhlig'", - "original": "GÃ\u00a0idhlig", - "fixed": "Gàidhlig", - "expect": "pass" - }, - { - "label": "Messy language names: Lithuanian", - "original": "Lietuvių", - "fixed": "Lietuvių", - "expect": "pass" - }, - { - "label": "Messy language names: Slovak", - "original": "SlovenÄ�ina", - "fixed": "Sloven�ina", - "expect": "pass" - }, - { - "label": "Messy language names: Vietnamese", - "original": "Tiếng Việt", - "fixed": "Tiếng Việt", - "expect": "pass" - }, - { - "label": "Messy language names: Greek", - "original": "Ελληνικά", - "fixed": "Ελληνικά", - "expect": "pass" - }, - { - "label": "Messy language names: Bulgarian", - "original": "българÑ�ки език", - "fixed": "българ�ки език", - "expect": "pass" - }, - { - "label": "Messy language names: Russian", - "original": "РуÑ�Ñ�кий", - "fixed": "Ру��кий", - "expect": "pass" - }, - { - "label": "Messy language names: Serbian [Cyrillic]", - "original": "CрпÑ�ки [ћирилицом]", - "fixed": "Cрп�ки [ћирилицом]", - "expect": "pass" - }, - { - "label": "Messy 
language names: Hebrew", - "original": "עברית", - "fixed": "עברית", - "expect": "pass" - }, - { - "label": "Messy language names: Russian", - "original": "РуÑ�Ñ�кий", - "fixed": "Ру��кий", - "expect": "pass" - }, - { - "label": "Messy language names: Hindi", - "comment": "My terminal has difficulty rendering the mostly-fixed text", - "original": "हिनà¥�दी", - "fixed": "\u0939\u093f\u0928\ufffd\u0926\u0940", - "expect": "pass" - }, - { - "label": "Messy language names: Tamil", - "comment": "My terminal has difficulty rendering the mostly-fixed text", - "original": "தமிழà¯�", - "fixed": "\u0ba4\u0bae\u0bbf\u0bb4\ufffd", - "expect": "pass" - }, - { - "label": "Messy language names: Thai", - "original": "ภาษาไทย", - "fixed": "ภาษาไทย", - "expect": "pass" - }, - { - "label": "Messy language names: Simplified Chinese", - "original": "简体ä¸\u00adæ–‡", - "fixed": "简体中文", - "expect": "pass" - }, - { - "label": "Messy language names: Traditional Chinese", - "original": "æ\u00ad£é«”ä¸\u00adæ–‡", - "fixed": "正體中文", - "expect": "pass" - }, - { - "label": "Messy language names: Japanese", - "original": "日本語", - "fixed": "日本語", - "expect": "pass" - }, - { - "label": "Messy language names: Korean", - "original": "한êµ\u00adì–´", - "fixed": "한국어", - "expect": "pass" - }, - { - "label": "Synthetic: Messy language name in cp437: Czech", - "original": "─îe┼ítina", - "fixed": "Čeština", - "expect": "pass" - }, - { - "label": "Synthetic: Messy language name in cp437: Vietnamese", - "original": "Tiß║┐ng Viß╗çt", - "fixed": "Tiếng Việt", - "expect": "pass" - }, - { - "label": "Low-codepoint emoji", - "comment": "From the ancient era before widespread emoji support on Twitter", - "original": "He's Justinâ\u009d¤", - "fixed": "He's Justin❤", - "expect": "pass" - }, - { - "label": "UTF-8 / MacRoman mix-up about smurfs", - "original": "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.", - "fixed": "Le Schtroumpf Docteur conseille gâteaux et baies 
schtroumpfantes pour un régime équilibré.", - "expect": "pass" - }, - { - "label": "Checkmark that almost looks okay as mojibake", - "original": "✔ No problems", - "fixed": "✔ No problems", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1251 Russian mixup about futbol", - "original": "РґРѕСЂРѕРіРµ Р\u0098Р·-РїРѕРґ #футбол", - "fixed": "дороге Из-под #футбол", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in German", - "original": "\u0084Handwerk bringt dich überall hin\u0093: Von der YOU bis nach Monaco", - "fixed-encoding": "„Handwerk bringt dich überall hin“: Von der YOU bis nach Monaco", - "fixed": "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup of the replacement character", - "original": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", - "fixed": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", - "expect": "pass" - }, - { - "label": "CESU-8 / Windows-1252 emoji", - "original": "Hi guys í ½í¸\u008d", - "fixed": "Hi guys 😍", - "expect": "pass" - }, - { - "label": "CESU-8 / Latin-1 emoji", - "original": "hihi RT username: â\u0098ºí ½í¸\u0098", - "fixed": "hihi RT username: ☺😘", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in Turkish", - "original": "Beta Haber: Hırsızı Büyü Korkuttu", - "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in İstanbul (issue #192)", - "original": "İstanbul", - "fixed": "İstanbul", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in German (issue #188)", - "original": "RUF MICH ZURÜCK", - "fixed": "RUF MICH ZURÜCK", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", - "original": "RÄ«ga", - "fixed": "Rīga", - "expect": "pass" - }, - { - 
"label": "UTF-8 / Windows-1251 mixed up twice in Russian", - "original": "приятности. РІСњВ¤", - "fixed": "приятности. ❤", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up twice in Malay", - "original": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â\u009d.", - "fixed-encoding": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romance”.", - "fixed": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up twice in naming Iggy Pop", - "original": "Iggy Pop (né Jim Osterberg)", - "fixed": "Iggy Pop (né Jim Osterberg)", - "expect": "pass" - }, - { - "label": "Left quote is UTF-8, right quote is Latin-1, both encoded in Windows-1252", - "original": "Direzione Pd, ok â\u0080\u009csenza modifiche\u0094 all'Italicum.", - "fixed-encoding": "Direzione Pd, ok “senza modifiche” all'Italicum.", - "fixed": "Direzione Pd, ok \"senza modifiche\" all'Italicum.", - "expect": "pass" - }, - { - "label": "UTF-8 / sloppy Windows-1252 mixed up twice in a triumphant emoticon", - "original": "selamat berpuasa sob (Ã\u00a0¸‡'̀⌣'ÃŒÂ\u0081)Ã\u00a0¸‡", - "fixed": "selamat berpuasa sob (ง'̀⌣'́)ง", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up three times", - "original": "The Mona Lisa doesn’t have eyebrows.", - "fixed-encoding": "The Mona Lisa doesn’t have eyebrows.", - "fixed": "The Mona Lisa doesn't have eyebrows.", - "expect": "pass" - }, - { - "label": "UTF-8 / Codepage 437 mixup in Russian", - "original": "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡", - "fixed": "#правильноепитание", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in French", - "original": "Hôtel de Police", - "fixed": "Hôtel de Police", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1250 mixup in French", - 
"original": "Liège Avenue de l'HĂ´pital", - "fixed": "Liège Avenue de l'Hôpital", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Vietnamese", - "original": "Tại sao giá hạt sầu riêng lại lên giá?", - "fixed": "Tại sao giá hạt sầu riêng lại lên giá?", - "expect": "pass" - }, - { - "label": "Negative: using diaereses as quotation marks in Greek", - "original": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", - "fixed": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", - "expect": "pass" - }, - { - "label": "Science! Mid-word Greek letter gets fixed correctly", - "original": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", - "fixed": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", - "expect": "pass" - }, - { - "label": "Negative: More science! Don't fix a multiplication symbol in quotes", - "original": "higher values (“+” and “×” curves) in the superficial region", - "fixed-encoding": "higher values (“+” and “×” curves) in the superficial region", - "fixed": "higher values (\"+\" and \"×\" curves) in the superficial region", - "expect": "pass" - }, - { - "label": "For goodness' sake. We can come close to fixing this, but fail in the last step", - "original": "ItÃ?¢â?¬â?¢s classic. ItÃ?¢â?¬â?¢s epic. ItÃ?¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?¢â?¬â?¢ sake!", - "fixed": "It�¢��s classic. It�¢��s epic. 
It�¢��s ELIZABETH BENNET for goodness�¢�� sake!", - "expect": "pass" - }, - { - "label": "lossy UTF-8 / Windows-1250 mixup in Spanish", - "original": "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂ­a", - "fixed": "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía", - "expect": "pass" - }, - { - "label": "UTF-8 / sloppy Windows-1250 mixup in English", - "original": "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", - "expect": "pass" - }, - { - "label": "The same text as above, but as a UTF-8 / ISO-8859-2 mixup", - "original": "It was namedÂ\u00a0â\u0080\u009escars´ stonesâ\u0080\u009c after the rock-climbers who got hurt while climbing on it.", - "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", - "expect": "pass" - }, - { - "label": "UTF-8 / ISO-8859-2 mixup in Czech", - "comment": "This says 'I've had enough of the third millennium', which is great because it involves software decisions made in the second", - "original": "MĂĄm dost tĹ\u0099etĂ\u00adho tisĂ\u00adciletĂ\u00ad", - "fixed": "Mám dost třetího tisíciletí", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", - "comment": "A difficult test case that can depend on the order that steps are applied", - "original": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - 
"fixed-encoding": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - "fixed": "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - "expect": "pass" - }, - { - "label": "Synthetic: Incomplete UTF-8 / Windows-1252 mixup in Arabic", - "comment": "I find text like this in OSCAR a fair amount, but couldn't isolate a good example that tested digits. The intended text means 'more than 100 countries'.", - "original": "أكثر من Ù Ù Ù¡ بلد", - "fixed": "أكثر من ٠٠١ بلد", - "expect": "fail" - }, - { - "label": "UTF-8 / sloppy Windows-1250 mixup in Romanian", - "original": "vedere Ă®nceĹŁoĹźatÄ\u0083", - "fixed": "vedere înceţoşată", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1250 mixup in Slovak", - "original": "NapĂ\u00adšte nám !", - "fixed": "Napíšte nám !", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Spanish", - "original": "DOS AÑOS", - "fixed": "DOS AÑOS", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 followed by UTF-8 / Windows-1251", - "original": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", - "fixed": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", - "expect": "pass" - }, - { - "label": "fancy Unicode crossing-out, but mojibaked", - "original": "hotel $49 $̶6̶3̶ updated 2018", - "fixed": "hotel $49 $̶6̶3̶ updated 2018", - "expect": "pass" - }, - { - "label": "A face with UTF-8 / sloppy Windows-1252 mixed up twice", - "original": "ââ€\u009d’(⌣˛⌣)ââ€\u009dŽ", - "fixed": "┒(⌣˛⌣)┎", - "expect": "pass" - }, - { - "label": "We can mostly 
decode the face above when we lose the character U+009D", - "original": "ââ€�’(⌣˛⌣)ââ€�Ž", - "fixed": "�(⌣˛⌣)�", - "expect": "pass" - }, - { - "label": "Lossy decoding can have plain ASCII question marks, as well", - "original": "The ICR has been upgraded to “bb+â€? from “bbâ€?", - "fixed-encoding": "The ICR has been upgraded to “bb+� from “bb�", - "fixed": "The ICR has been upgraded to \"bb+� from \"bb�", - "expect": "pass" - }, - { - "label": "CESU-8 / Latin-1 mixup over several emoji", - "comment": "You tried", - "original": "I just figured out how to tweet emojis! â\u009a½í\u00a0½í¸\u0080í\u00a0½í¸\u0081í\u00a0½í¸\u0082í\u00a0½í¸\u0086í\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008e", - "fixed": "I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎", - "expect": "pass" - }, - { - "label": "Two levels of inconsistent mojibake", - "comment": "The en-dash was mojibaked in UTF-8 / Windows-1252 as three characters, two of which were mojibaked again as Windows-1252 / Latin-1, and the third of which was mojibaked as UTF-8 / Latin-1. 
Unfortunately, if we fix this, we leave ourselves room to greedily 'decode' random Han characters in complex Latin-alphabet mojibake", - "original": "Arsenal v Wolfsburg: pre-season friendly â\u0080â\u0080\u009c live!", - "fixed": "Arsenal v Wolfsburg: pre-season friendly – live!", - "expect": "fail" - }, - { - "label": "An absolutely hopeless garble", - "comment": "If we try too hard to decode this, we'll recursively apply `decode_inconsistent_utf8` until the characters turn into random Han and katakana characters.", - "original": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", - "fixed-encoding": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", - "fixed": "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â", - "expect": "pass" - }, - { - "label": "Inconsistent UTF-8 / Latin-1 mojibake", - "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085", - "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", - "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", - "expect": "pass" - }, - { - "label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set", - "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…", - "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", - "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", - "expect": "pass" - }, - { - "label": "Inconsistent mojibake in Portuguese", - "original": "Campeonatos > III Divisão - Série F > Jornadas Classificação", - "fixed": "Campeonatos > III Divisão - Série F > Jornadas Classificação", - "expect": "pass" - }, - { - "label": "Handle Afrikaans 'n character", - "original": "ʼn 
Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", - "fixed-encoding": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", - "fixed": "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.", - "expect": "pass" - }, - { - "label": "Handle Croatian single-codepoint digraphs", - "original": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "fixed-encoding": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "fixed": "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "expect": "pass" - }, - { - "label": "A with an acute accent, in isolation", - "original": "Nicolás", - "fixed": "Nicolás", - "expect": "pass" - }, - { - "label": "sharp S, in isolation, via MacRoman encoding", - "comment": "regression reported in issue #186", - "original": "wei√ü", - "fixed": "weiß", - "expect": "pass" - }, - { - "label": "Sort of negative: this inconsistent mojibake could be Latin-1 or MacRoman, and it was meant to be Latin-1, but it's safest to not decode it as either", - "comment": "issue #202", - "original": "Bremer/Mccoy – DrÃ¥ber", - "fixed": "Bremer/Mccoy – DrÃ¥ber", - "expect": "pass" - }, - { - "label": "Negative: 'è' preceded by a non-breaking space is not a small capital Y", - "original": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", - "fixed": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", - "expect": "pass" - }, - { - "label": "Negative: multiplication sign and ellipsis", - "comment": "Should not turn into a dot below", - "original": "4288×…", - "fixed": "4288×…", - "expect": "pass" - }, - { - "label": "Negative: accents are sometimes used as quotes", - "comment": "Under a previous heuristic, this tested the CESU-8 decoder, which would try to decode it 
and fail when it hit the end of the string", - "original": "``toda produzida pronta pra assa aí´´", - "fixed": "``toda produzida pronta pra assa aí´´", - "expect": "pass" - }, - { - "label": "Negative: 'Õ' followed by an ellipsis", - "comment": "Should not turn into the Armenian letter Յ", - "original": "HUHLL Õ…", - "fixed": "HUHLL Õ…", - "expect": "pass" - }, - { - "label": "Negative: 'Ê' followed by an ellipsis", - "comment": "Should not turn into a squat reversed esh", - "original": "RETWEET SE VOCÊ…", - "fixed": "RETWEET SE VOCÊ…", - "expect": "pass" - }, - { - "label": "Negative: 'É' followed by an ellipsis", - "comment": "Should not turn into 'MARQUɅ'", - "original": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", - "fixed": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", - "expect": "pass" - }, - { - "label": "Negative: 'Ó' followed by an ellipsis", - "comment": "Should not turn into 'SӅ'", - "original": "TEM QUE SEGUIR, SDV SÓ…", - "fixed": "TEM QUE SEGUIR, SDV SÓ…", - "expect": "pass" - }, - { - "label": "Negative: 'É' followed by a curly apostrophe", - "comment": "Should not turn into 'ZZAJɒs'", - "original": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", - "fixed-encoding": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", - "fixed": "Join ZZAJÉ's Official Fan List and receive news, events, and more!", - "expect": "pass" - }, - { - "label": "Negative: 'é' preceded by curly apostrophe", - "comment": "Should not turn into 'LՎpisode'", - "original": "L’épisode 8 est trop fou ouahh", - "fixed-encoding": "L’épisode 8 est trop fou ouahh", - "fixed": "L'épisode 8 est trop fou ouahh", - "expect": "pass" - }, - { - "label": "Negative: three raised eyebrows or something?", - "comment": "Should not turn into private use character U+F659", - "original": "Ôôô VIDA MINHA", - "fixed": "Ôôô VIDA MINHA", - "expect": "pass" - }, - { - "label": "Negative: copyright sign preceded by non-breaking space", - "comment": "Should not turn into 
'ʩ'", - "original": "[x]\u00a0©", - "fixed": "[x]\u00a0©", - "expect": "pass" - }, - { - "label": "Negative: en dash and infinity sign", - "comment": "Should not turn into '2012Ѱ'", - "original": "2012—∞", - "fixed": "2012—∞", - "expect": "pass" - }, - { - "label": "Negative: This Е is a Ukrainian letter, but nothing else is wrong", - "original": "SENSЕ - Oleg Tsedryk", - "fixed": "SENSЕ - Oleg Tsedryk", - "expect": "pass" - }, - { - "label": "Negative: angry face", - "comment": "The face should not turn into '`«'", - "original": "OK??:( `¬´ ):", - "fixed": "OK??:( `¬´ ):", - "expect": "pass" - }, - { - "label": "Negative, synthetic: face with glasses and a raised eyebrow", - "original": "( o¬ô )", - "fixed": "( o¬ô )", - "expect": "pass" - }, - { - "label": "Negative: triangle and degree sign", - "comment": "I'm not really sure what it *is* supposed to be, but it's not 'ơ'", - "original": "∆°", - "fixed": "∆°", - "expect": "pass" - }, - { - "label": "Negative: Portuguese with inverted question mark", - "comment": "Former false positive - it should not turn into 'QUEM ɿ'", - "original": "ESSE CARA AI QUEM É¿", - "fixed": "ESSE CARA AI QUEM É¿", - "expect": "pass" - }, - { - "label": "Negative: Portuguese with acute accents as quotation marks", - "comment": "Former false positive - the end should not turn into a superscript H", - "original": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", - "fixed": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", - "expect": "pass" - }, - { - "label": "Negative: Finnish Ä followed by a non-breaking space", - "comment": "Former false positive - should not become a G with a dot", - "original": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", - "fixed": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", - "expect": "pass" - }, - { - "label": "Negative: multiplying by currency", - "comment": "Former false positive - should not become the Hebrew letter 'final pe'", - "original": "Offering 5×£35 pin ups", - "fixed": 
"Offering 5×£35 pin ups", - "expect": "pass" - }, - { - "label": "Negative: registered chocolate brand name", - "comment": "Former false positive - should not become the IPA letter 'lezh'", - "original": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", - "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", - "expect": "pass" - }, - { - "label": "Negative: it looks like Windows-1257 mojibake but someone writes their name this way", - "comment": "Should not become a cedilla", - "original": "Connect with Āø on Facebook", - "fixed": "Connect with Āø on Facebook", - "expect": "pass" - }, - { - "label": "Mostly negative: we only need to fix C1 control characters", - "comment": "We should not decode 'é\u0085 ' as '酠'", - "original": "C'est vrai que nous n'en avons pas encore beaucoup parlé\u0085 Tu sais, ça fait de nombreuses années", - "fixed": "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années", - "expect": "pass" - }, - { - "label": "French example containing non-breaking spaces", - "original": "ART TRIP Ã\u00a0 l'office de tourisme", - "fixed": "ART TRIP à l'office de tourisme", - "expect": "pass" - }, - { - "label": "English example in UTF-8 / Windows-1251 with a ligature", - "original": "This is signiп¬Ѓcantly lower than the respective share", - "fixed-encoding": "This is significantly lower than the respective share", - "fixed": "This is significantly lower than the respective share", - "expect": "pass" - }, - { - "label": "Synthetic: we can recognize à in some cases when it's the only mojibake", - "original": "voilà le travail", - "fixed": "voilà le travail", - "expect": "pass" - }, - { - "label": "Synthetic: we can recognize à at the end of a word when it absorbs a following space", - "original": "voilà le travail", - "fixed": "voilà le travail", - "expect": "pass" - }, - { - "label": "Negative: We don't fix à in all 
contexts", - "original": "C O N C L U S à O", - "fixed": "C O N C L U S à O", - "expect": "pass" - }, - { - "label": "'à' remains its own word, even if spaces after it get coalesced into one", - "original": "à perturber la réflexion des théologiens jusqu'à nos jours", - "fixed": "à perturber la réflexion des théologiens jusqu'à nos jours", - "expect": "pass" - }, - { - "label": "Fix 'à' in inconsistent mojibake", - "original": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", - "fixed-encoding": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", - "fixed": "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation", - "expect": "pass" - }, - { - "label": "The Portuguese word 'às' does not become 'à s' due to the French fix", - "original": "com especial atenção à s crianças", - "fixed": "com especial atenção às crianças", - "expect": "pass" - }, - { - "label": "This is why we require a space after the 's' in 'às'", - "original": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", - "fixed": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", - "expect": "pass" - }, - { - "label": "We can fix 'à' in windows-1251 sometimes as well", - "original": "La rГ©gion de Dnepropetrovsk se trouve Г l’ouest de l’Ukraine", - "fixed-encoding": "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine", - "fixed": "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine", - "expect": "pass" - }, - { - "label": "'à quele' is the Portuguese word 'àquele', not 'à quele'", - "original": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante à quele 
observado nas lesões por imunocomplexo em excesso de anticorpos", - "fixed": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos", - "expect": "pass" - }, - { - "label": "A complex, lossy pile-up of mojibake in Portuguese", - "original": "â € ðŸ“� Regulamento: â € âš ï¸� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. âš ï¸� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até à s 19h do mesmo dia em uma nova publicação em nosso instagram. â € Boa sorte!!! 😀ðŸ�°", - "fixed": "⠀ �\u00a0Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. 
⠀ Boa sorte!!!\u00a0😀�", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Gaelic involving non-breaking spaces", - "original": "CÃ\u00a0nan nan GÃ\u00a0idheal", - "fixed": "Cànan nan Gàidheal", - "expect": "pass" - }, - { - "label": "Misleading mix-up in Spanish", - "comment": "The original text has mojibake, but the sequence 'á \u0093' can decode as U+1813 MONGOLIAN DIGIT THREE, when the whole string should really just decode as a Latin-1/Windows-1252 mixup", - "original": "tiene demora y está \u0093próximo a resolverse\u0094", - "fixed": "tiene demora y está \"próximo a resolverse\"", - "expect": "fail" - }, - { - "label": "A-with-grave in Vietnamese", - "comment": "Currently adds extra spaces that shouldn't be there", - "original": "Xem clip hĂ i, phim hĂ i má»›i hay nhất", - "fixed": "Xem clip hài, phim hài mới hay nhất", - "expect": "fail" - }, - { - "label": "Punctuation pile-up should actually be musical notes", - "original": "Engkau masih yg terindah, indah di dalam hatiku♫~", - "fixed": "Engkau masih yg terindah, indah di dalam hatiku♫~", - "expect": "pass" - }, - { - "label": "Latin-1 / MacRoman mixup in Spanish", - "comment": "Requires something like encoding detection", - "original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.", - "fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.", - "expect": "fail" - }, - { - "label": "subtle UTF-8 / codepage 437 mixup in Spanish", - "original": "┬┐que diferencia hay?", - "fixed": "¿que diferencia hay?", - "expect": "fail" - }, - { - "label": "Latin-1 / MacRoman mixup in Spanish, 2 characters", - "comment": "Requires something like encoding detection", - "original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente", - "fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente", - "expect": "fail" - }, - { - "label": "An example with 'à' in windows-1251 where we need our heuristic to be bolder", - 
"original": "faites attention Г bien vous renseigner avant sur le mГ©dicament", - "fixed": "faites attention à bien vous renseigner avant sur le médicament", - "expect": "fail" - }, - { - "label": "UTF-8 / Windows-1251 mixup in tweet spam", - "original": "Blog Traffic Tip 2 – Broadcast Email Your Blog", - "fixed": "Blog Traffic Tip 2 – Broadcast Email Your Blog", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1251 mixup", - "original": "S&P Confirms Ukrsotsbank’s “B-“ Rating", - "fixed-encoding": "S&P Confirms Ukrsotsbank’s “B-“ Rating", - "fixed": "S&P Confirms Ukrsotsbank's \"B-\" Rating", - "expect": "pass" - }, - { - "label": "Dutch example with ë", - "comment": "from issue reported by MicroJackson", - "original": "ongeëvenaard", - "fixed-encoding": "ongeëvenaard", - "fixed": "ongeëvenaard", - "expect": "pass" - }, - { - "label": "HTML entity on top of UTF-8 / Latin-1", - "original": "10μs", - "fixed-encoding": "10μs", - "fixed": "10μs", - "expect": "pass" - }, - { - "label": "Negative: Two concatenated strings", - "comment": "Should not turn into 'fratarak᧠141'", - "original": "Oborzos, per. Vahbarz, frataraká§ 141", - "fixed": "Oborzos, per. 
Vahbarz, frataraká§ 141", - "expect": "pass" - }, - { - "label": "Negative: Indonesian leetspeak", - "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", - "fixed": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", - "expect": "pass" - }, - { - "label": "Three layers of UTF-8 / MacRoman mixup in French", - "comment": "You're welcome", - "original": "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8", - "fixed": "Merci de télécharger le plug-in Flash Player 8", - "expect": "pass" - }, - { - "label": "UTF-8 / MacRoman mixup in French", - "original": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ķ", - "fixed": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…", - "expect": "pass" - }, - { - "label": "Italian UTF-8 / MacRoman example with ò", - "original": "Le Vigne di Zam√≤", - "fixed": "Le Vigne di Zamò", - "expect": "pass" - }, - { - "label": "Italian UTF-8 / MacRoman mojibake that looks like math", - "comment": "False negative: 'pi√π' is a bit too reasonable to fix", - "original": "Sarai ricontattato dal nostro Esperto al pi√π presto.", - "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", - "expect": "fail" - }, - { - "label": "Latvian UTF-8 / Windows-1257 mojibake", - "original": "Å veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", - "fixed": "Šveices baņķieri gaida konkrētus investīciju projektus", - "expect": "pass" - }, - { - "label": "Latvian UTF-8 / MacRoman mojibake", - "original": "SaeimƒÅ ievƒìlƒìtƒÅs partijas \"Progresƒ´vie\" lƒ´dzvadƒ´tƒÅja Anto≈Üina ≈Öena≈°eva atbild uz ≈æurnƒÅlistu jautƒÅjumiem pƒìc partijas tik≈°anƒÅs ar Valsts prezidentu Rƒ´gas pilƒ´,", - "fixed": "Saeimā ievēlētās partijas \"Progresīvie\" līdzvadītāja Antoņina Ņenaševa atbild uz žurnālistu jautājumiem pēc partijas tikšanās ar Valsts prezidentu Rīgas pilī,", 
- "expect": "pass" - }, - { - "label": "Lithuanian UTF-8 / Windows-1257 mojibake", - "original": "Å iaip ÄÆdomu, kaip ÄÆsivaizduoji. Visų pirma tam reikia laiko.", - "fixed": "Šiaip įdomu, kaip įsivaizduoji. Visų pirma tam reikia laiko.", - "expect": "pass" - }, - { - "label": "Lithuanian UTF-8 / Windows-1250 mojibake", - "original": "Lietuva pagrÄŻstai gali paklausti: Ĺ˝inoma, kad ne.", - "fixed": "Lietuva pagrįstai gali paklausti: Žinoma, kad ne.", - "expect": "pass" - }, - { - "label": "Hebrew UTF-8 / Windows-1252 mojibake", - "comment": "reported by SuperIRabbit as issue #158", - "original": "בהודעה", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Wide comma in UTF-8 / Windows-1252", - "original": "Ningbo,China", - "fixed-encoding": "Ningbo,China", - "fixed": "Ningbo,China", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", - "original": "בהודעה", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / MacRoman mojibake", - "original": "◊ë◊î◊ï◊ì◊¢◊î", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / Latin-1 mojibake", - "comment": "This example uses low-numbered codepoints to spell 'ABBA' in Hebrew, so that it falls into the range where Latin-1 is different from Windows-1252. 
As a bonus, this example looks right even if your RTL text rendering isn't working.", - "original": "×\u0090×\u0091×\u0091×\u0090", - "fixed": "אבבא", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / Windows-1252 mojibake", - "original": "رسالة", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / Windows-1250 mojibake", - "original": "رسالة", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / MacRoman mojibake", - "original": "ÿ±ÿ≥ÿߟÑÿ©", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Negative: math in Unicode", - "comment": "This isn't mojibake, it's an actual equation", - "original": "(-1/2)! = √π", - "fixed": "(-1/2)! = √π", - "expect": "pass" - }, - { - "label": "Negative: Leet line-art", - "comment": "The heuristic before v6 loved to 'fix' this and decode it as 'ôaſaſaſaſa'", - "original": "├┤a┼┐a┼┐a┼┐a┼┐a", - "fixed": "├┤a┼┐a┼┐a┼┐a┼┐a", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Brontë's name does not end with a Korean syllable", - "comment": "The original example of why ftfy needs heuristics", - "original": "I'm not such a fan of Charlotte Brontë…”", - "fixed-encoding": "I'm not such a fan of Charlotte Brontë…”", - "fixed": "I'm not such a fan of Charlotte Brontë…\"", - "expect": "pass" - }, - { - "label": "Synthetic, negative: hypothetical Swedish product name", - "comment": "This used to be a constructed example of a false positive, until you added another symbol", - "original": "AHÅ™, the new sofa from IKEA", - "fixed": "AHÅ™, the new sofa from IKEA", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Ukrainian capital letters", - "comment": "We need to fix Windows-1251 conservatively, or else this decodes as '²ʲ'", - "original": "ВІКІ is Ukrainian for WIKI", - "fixed": "ВІКІ is Ukrainian for WIKI", - "expect": "pass" - }, - { - "label": "Synthetic, negative: don't leak our internal use of byte 0x1A", - "comment": "We use byte 
0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", - "original": "These control characters \u001a are apparently intentional \u0081", - "fixed-encoding": "These control characters \u001a are apparently intentional \u0081", - "fixed": "These control characters are apparently intentional \u0081", - "expect": "pass" - }, - { - "label": "Synthetic, negative: U+1A on its own", - "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", - "original": "Here's a control character: \u001a", - "fixed-encoding": "Here's a control character: \u001a", - "fixed": "Here's a control character: ", - "expect": "pass" - }, - { - "label": "Synthetic, negative: A-with-circle as an Angstrom sign", - "comment": "Should not turn into '10 ŗ'", - "original": "a radius of 10 Å—", - "fixed": "a radius of 10 Å—", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Spanish with exclamation points on the wrong sides", - "original": "!YO SÉ¡", - "fixed": "!YO SÉ¡", - "expect": "pass" - }, - { - "label": "Synthetic: fix text with backslashes in it", - "comment": "Tests for a regression on a long-ago bug", - "original": "<40\\% vs \u00e2\u0089\u00a540\\%", - "fixed": "<40\\% vs ≥40\\%", - "expect": "pass" - }, - { - "label": "Synthetic: curly quotes with mismatched encoding glitches in Latin-1", - "original": "\u00e2\u0080\u009cmismatched quotes\u0085\u0094", - "fixed-encoding": "“mismatched quotes…”", - "fixed": "\"mismatched quotes…\"", - "expect": "pass" - }, - { - "label": "Synthetic: curly quotes with mismatched encoding glitches in Windows-1252", - "original": "“mismatched quotes…”", - "fixed-encoding": "“mismatched quotes…”", - "fixed": "\"mismatched quotes…\"", - "expect": "pass" - }, - { - "label": "Synthetic: lossy decoding in sloppy-windows-1252", - "original": "“lossy decodingâ€�", - "fixed-encoding": "“lossy decoding�", - "fixed": "\"lossy 
decoding�", - "expect": "pass" - }, - { - "label": "Synthetic: French word for August in windows-1252", - "original": "août", - "fixed-encoding": "août", - "fixed": "août", - "expect": "pass" - }, - { - "label": "Synthetic: French word for hotel in all-caps windows-1252", - "original": "HÔTEL", - "fixed-encoding": "HÔTEL", - "fixed": "HÔTEL", - "expect": "pass" - }, - { - "label": "Synthetic: Scottish Gaelic word for 'subject' in all-caps windows-1252", - "original": "CÙIS", - "fixed-encoding": "CÙIS", - "fixed": "CÙIS", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Romanian word before a non-breaking space", - "comment": "The word literally means 'not even once', which might be a good recommendation about fixing Romanian mojibake", - "original": "NICIODATĂ\u00a0", - "fixed": "NICIODATĂ\u00a0", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Be careful around curly apostrophes", - "comment": "It shouldn't end up saying 'a lot of Òs'", - "original": "There are a lot of Ã’s in mojibake text", - "fixed-encoding": "There are a lot of Ã’s in mojibake text", - "fixed": "There are a lot of Ã's in mojibake text", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Romanian word before a trademark sign", - "comment": "We would change 'DATÙ' to 'DATÙ' if it passed the badness heuristic", - "original": "NICIODATĂ™", - "fixed": "NICIODATĂ™", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Lithuanian word before a trademark sign", - "comment": "Similar to the above example. Shouldn't turn into U+0619 ARABIC SMALL DAMMA", - "original": "TRANSFORMATORIŲ™", - "fixed": "TRANSFORMATORIŲ™", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Norwegian capitalized sentence", - "comment": "We're shouting that the island of Håøya is gullible. 
It should not turn into 'HŨYA ER BLŨYD'.", - "original": "HÅØYA ER BLÅØYD", - "fixed": "HÅØYA ER BLÅØYD", - "expect": "pass" - }, - { - "label": "Synthetic, negative: raised eyebrow kaomoji", - "original": "Ō¬o", - "fixed": "Ō¬o", - "expect": "pass" - }, - { - "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", - "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. Searching for similar real text yields a lot of examples that actually come out fine.", - "original": "MISUTÂ\u00a0AJIKKO", - "fixed": "MISUTÂ\u00a0AJIKKO", - "expect": "fail" - }, - { - "label": "Synthetic, negative: Camel-cased Serbian that looks like a UTF-8 / Windows-1251 mixup", - "comment": "I made this text up, but it seems like it means 'HelloDevil'. Could be a username or something.", - "original": "ПоздравЂаво", - "fixed": "ПоздравЂаво", - "expect": "pass" - }, - { - "label": "Synthetic: mojibake with trademark sign at the end of a word", - "comment": "I recall the correct version of this text from a sign in the movie Amélie. 
Now we can help her twin Amélie, who makes mojibaked signs.", - "original": "OÙ ET QUAND?", - "fixed": "OÙ ET QUAND?", - "expect": "pass" - } -] \ No newline at end of file From 8f8ee2c41b69157ac1dc1e46b9135da4eb3a03e9 Mon Sep 17 00:00:00 2001 From: arborelia Date: Wed, 30 Oct 2024 16:45:36 -0400 Subject: [PATCH 5/6] more ruff checks --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 13aeee9..130dec2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ line-length = 100 target-version = "py39" [tool.ruff.lint] -select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH"] +select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH", "FURB"] ignore = [ "ANN101", "ANN401", From 74dd0452b48286a3770013b3a02755313bd5575e Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Wed, 30 Oct 2024 17:00:25 -0400 Subject: [PATCH 6/6] load test data from a directory --- tests/test-cases/README.md | 20 ++++++++++++++++++++ tests/test_examples_in_json.py | 13 +++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 tests/test-cases/README.md diff --git a/tests/test-cases/README.md b/tests/test-cases/README.md new file mode 100644 index 0000000..673bd5f --- /dev/null +++ b/tests/test-cases/README.md @@ -0,0 +1,20 @@ +# ftfy test cases + +This directory contains JSON files with test cases for ftfy. Many of them are real mojibake found in the wild, such as by listening to the Twitter firehose (when that existed), searching through the OSCAR web crawl, or in issue reports from users. + +Cases labeled "synthetic" were not found in the wild, but were instead constructed to test a particular edge case. + +Cases labeled "negative" are not mojibake but look like they could be. We're testing that ftfy does not alter the text (except for its usual processing such as un-curling quotes). 
+ +`known-failures.json` contains cases that we would do better at with an improved heuristic. Most of these are false negatives, where ftfy does not figure out how to fix the text. ftfy aims to have no false positives, but there is one synthetic false positive in `known-failures.json`. + +## Structure of a test case + +A test case contains the following fields: + +- `label`: A description of the test case, shown when pytest runs in verbose mode. +- `comment`: Further details on the test case because JSON doesn't have comments. +- `original`: The text to run through ftfy. +- `fixed-encoding` (optional): the expected result of `ftfy.fix_encoding(original)`. If unspecified, uses the value from `fixed`. +- `fixed`: the expected result of `ftfy.fix_text(original)`. +- `expect`: "pass" for test cases that should pass, or "fail" for known failures. \ No newline at end of file diff --git a/tests/test_examples_in_json.py b/tests/test_examples_in_json.py index 83dcb8e..2be9eb4 100644 --- a/tests/test_examples_in_json.py +++ b/tests/test_examples_in_json.py @@ -32,8 +32,17 @@ from ftfy import apply_plan, fix_and_explain, fix_encoding_and_explain, fix_text THIS_DIR = Path(__file__).parent -TEST_FILENAME = THIS_DIR / "test_cases.json" -TEST_DATA = json.load(TEST_FILENAME.open(encoding="utf-8")) +TEST_CASE_DIR = THIS_DIR / "test-cases" + + +def load_test_data() -> list[dict]: + test_data = [] + for filepath in sorted(TEST_CASE_DIR.glob("*.json")): + test_data.extend(json.loads(filepath.read_text(encoding="utf-8"))) + return test_data + + +TEST_DATA = load_test_data() TESTS_THAT_PASS = [test for test in TEST_DATA if test["expect"] == "pass"] TESTS_THAT_FAIL = [test for test in TEST_DATA if test["expect"] == "fail"]