From 4411b2952d5488ecdcc718b728c291ab19fb2f92 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Sep 2024 22:54:33 +0000 Subject: [PATCH 01/34] Bump actions/download-artifact from 3 to 4.1.7 in /.github/workflows Bumps [actions/download-artifact](https://github.com/actions/download-artifact) from 3 to 4.1.7. - [Release notes](https://github.com/actions/download-artifact/releases) - [Commits](https://github.com/actions/download-artifact/compare/v3...v4.1.7) --- updated-dependencies: - dependency-name: actions/download-artifact dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 70e56e0..fd5baec 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -31,7 +31,7 @@ jobs: # IMPORTANT: this permission is mandatory for trusted publishing id-token: write steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4.1.7 - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 From 913c11f7b7061225ba90eb19c865782529327d67 Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 02:00:51 -0400 Subject: [PATCH 02/34] switch from poetry to uv, drop py3.8 --- CHANGELOG.md | 5 + poetry.lock | 763 ---------------------------------------------- poetry.toml | 3 - pyproject.toml | 36 ++- setup.cfg | 2 - setup.py | 56 ---- tests/__init__.py | 0 tox.ini | 2 +- uv.lock | 101 ++++++ 9 files changed, 124 insertions(+), 844 deletions(-) delete mode 100644 poetry.lock delete mode 100644 poetry.toml delete mode 100644 setup.cfg delete mode 100644 setup.py create mode 100644 tests/__init__.py create mode 100644 uv.lock diff --git a/CHANGELOG.md b/CHANGELOG.md index 96d4ea8..ab85e10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## Version 6.3.0 
(October 8, 2024) + +- Switched packaging from poetry to uv. +- Uses modern Python packaging exclusively (no setup.py). + ## Version 6.2.3 (August 5, 2024) - Updated PyPI metadata. diff --git a/poetry.lock b/poetry.lock deleted file mode 100644 index 800fc21..0000000 --- a/poetry.lock +++ /dev/null @@ -1,763 +0,0 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. - -[[package]] -name = "alabaster" -version = "0.7.13" -description = "A configurable sidebar-enabled Sphinx theme" -optional = false -python-versions = ">=3.6" -files = [ - {file = "alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3"}, - {file = "alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2"}, -] - -[[package]] -name = "babel" -version = "2.15.0" -description = "Internationalization utilities" -optional = false -python-versions = ">=3.8" -files = [ - {file = "Babel-2.15.0-py3-none-any.whl", hash = "sha256:08706bdad8d0a3413266ab61bd6c34d0c28d6e1e7badf40a2cebe67644e2e1fb"}, - {file = "babel-2.15.0.tar.gz", hash = "sha256:8daf0e265d05768bc6c7a314cf1321e9a123afc328cc635c18622a2f30a04413"}, -] - -[package.dependencies] -pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} - -[package.extras] -dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] - -[[package]] -name = "beautifulsoup4" -version = "4.12.3" -description = "Screen-scraping library" -optional = false -python-versions = ">=3.6.0" -files = [ - {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, - {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, -] - -[package.dependencies] -soupsieve = ">1.2" - -[package.extras] -cchardet = ["cchardet"] -chardet = ["chardet"] -charset-normalizer = ["charset-normalizer"] 
-html5lib = ["html5lib"] -lxml = ["lxml"] - -[[package]] -name = "certifi" -version = "2024.7.4" -description = "Python package for providing Mozilla's CA Bundle." -optional = false -python-versions = ">=3.6" -files = [ - {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, - {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, -] - -[[package]] -name = "charset-normalizer" -version = "3.3.2" -description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = 
"sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, - {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, - {file 
= "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, - {file = 
"charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, - {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, -] - -[[package]] -name = "colorama" -version = "0.4.6" -description = "Cross-platform colored terminal text." 
-optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -files = [ - {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, - {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, -] - -[[package]] -name = "docutils" -version = "0.20.1" -description = "Docutils -- Python Documentation Utilities" -optional = false -python-versions = ">=3.7" -files = [ - {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"}, - {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, -] - -[[package]] -name = "exceptiongroup" -version = "1.2.2" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, - {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, -] - -[package.extras] -test = ["pytest (>=6)"] - -[[package]] -name = "furo" -version = "2024.7.18" -description = "A clean customisable Sphinx documentation theme." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "furo-2024.7.18-py3-none-any.whl", hash = "sha256:b192c7c1f59805494c8ed606d9375fdac6e6ba8178e747e72bc116745fb7e13f"}, - {file = "furo-2024.7.18.tar.gz", hash = "sha256:37b08c5fccc95d46d8712c8be97acd46043963895edde05b0f4f135d58325c83"}, -] - -[package.dependencies] -beautifulsoup4 = "*" -pygments = ">=2.7" -sphinx = ">=6.0,<8.0" -sphinx-basic-ng = ">=1.0.0.beta2" - -[[package]] -name = "idna" -version = "3.7" -description = "Internationalized Domain Names in Applications (IDNA)" -optional = false -python-versions = ">=3.5" -files = [ - {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, - {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, -] - -[[package]] -name = "imagesize" -version = "1.4.1" -description = "Getting image size from png/jpeg/jpeg2000/gif file" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, - {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, -] - -[[package]] -name = "importlib-metadata" -version = "8.2.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "importlib_metadata-8.2.0-py3-none-any.whl", hash = "sha256:11901fa0c2f97919b288679932bb64febaeacf289d18ac84dd68cb2e74213369"}, - {file = "importlib_metadata-8.2.0.tar.gz", hash = "sha256:72e8d4399996132204f9a16dcc751af254a48f8d1b20b9ff0f98d4a8f901e73d"}, -] - -[package.dependencies] -zipp = ">=0.5" - -[package.extras] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -perf = ["ipython"] -test = ["flufl.flake8", 
"importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] - -[[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -optional = false -python-versions = ">=3.7" -files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, -] - -[[package]] -name = "jinja2" -version = "3.1.4" -description = "A very fast and expressive template engine." -optional = false -python-versions = ">=3.7" -files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, -] - -[package.dependencies] -MarkupSafe = ">=2.0" - -[package.extras] -i18n = ["Babel (>=2.7)"] - -[[package]] -name = "markupsafe" -version = "2.1.5" -description = "Safely add untrusted strings to HTML/XML markup." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, - {file = 
"MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, - {file = 
"MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = 
"sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, - {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, -] - -[[package]] -name = "mypy" -version = "1.11.1" -description = "Optional static typing for Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "mypy-1.11.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a32fc80b63de4b5b3e65f4be82b4cfa362a46702672aa6a0f443b4689af7008c"}, - {file = "mypy-1.11.1-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:c1952f5ea8a5a959b05ed5f16452fddadbaae48b5d39235ab4c3fc444d5fd411"}, - {file = "mypy-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1e30dc3bfa4e157e53c1d17a0dad20f89dc433393e7702b813c10e200843b03"}, - {file = "mypy-1.11.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2c63350af88f43a66d3dfeeeb8d77af34a4f07d760b9eb3a8697f0386c7590b4"}, - {file = "mypy-1.11.1-cp310-cp310-win_amd64.whl", hash = "sha256:a831671bad47186603872a3abc19634f3011d7f83b083762c942442d51c58d58"}, - {file = "mypy-1.11.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7b6343d338390bb946d449677726edf60102a1c96079b4f002dedff375953fc5"}, - {file = "mypy-1.11.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4fe9f4e5e521b458d8feb52547f4bade7ef8c93238dfb5bbc790d9ff2d770ca"}, - {file = "mypy-1.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:886c9dbecc87b9516eff294541bf7f3655722bf22bb898ee06985cd7269898de"}, - {file = "mypy-1.11.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fca4a60e1dd9fd0193ae0067eaeeb962f2d79e0d9f0f66223a0682f26ffcc809"}, - {file = "mypy-1.11.1-cp311-cp311-win_amd64.whl", hash = "sha256:0bd53faf56de9643336aeea1c925012837432b5faf1701ccca7fde70166ccf72"}, - {file = "mypy-1.11.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f39918a50f74dc5969807dcfaecafa804fa7f90c9d60506835036cc1bc891dc8"}, - {file = "mypy-1.11.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0bc71d1fb27a428139dd78621953effe0d208aed9857cb08d002280b0422003a"}, - {file = "mypy-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b868d3bcff720dd7217c383474008ddabaf048fad8d78ed948bb4b624870a417"}, - {file = "mypy-1.11.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a707ec1527ffcdd1c784d0924bf5cb15cd7f22683b919668a04d2b9c34549d2e"}, - {file = "mypy-1.11.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:64f4a90e3ea07f590c5bcf9029035cf0efeae5ba8be511a8caada1a4893f5525"}, - {file = "mypy-1.11.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:749fd3213916f1751fff995fccf20c6195cae941dc968f3aaadf9bb4e430e5a2"}, - {file = "mypy-1.11.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b639dce63a0b19085213ec5fdd8cffd1d81988f47a2dec7100e93564f3e8fb3b"}, - {file = "mypy-1.11.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c956b49c5d865394d62941b109728c5c596a415e9c5b2be663dd26a1ff07bc0"}, - {file = "mypy-1.11.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45df906e8b6804ef4b666af29a87ad9f5921aad091c79cc38e12198e220beabd"}, - {file = "mypy-1.11.1-cp38-cp38-win_amd64.whl", hash = "sha256:d44be7551689d9d47b7abc27c71257adfdb53f03880841a5db15ddb22dc63edb"}, - {file = "mypy-1.11.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2684d3f693073ab89d76da8e3921883019ea8a3ec20fa5d8ecca6a2db4c54bbe"}, - {file = "mypy-1.11.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:79c07eb282cb457473add5052b63925e5cc97dfab9812ee65a7c7ab5e3cb551c"}, - {file = "mypy-1.11.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11965c2f571ded6239977b14deebd3f4c3abd9a92398712d6da3a772974fad69"}, - {file = "mypy-1.11.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a2b43895a0f8154df6519706d9bca8280cda52d3d9d1514b2d9c3e26792a0b74"}, - {file = "mypy-1.11.1-cp39-cp39-win_amd64.whl", hash = "sha256:1a81cf05975fd61aec5ae16501a091cfb9f605dc3e3c878c0da32f250b74760b"}, - {file = "mypy-1.11.1-py3-none-any.whl", hash = "sha256:0624bdb940255d2dd24e829d99a13cfeb72e4e9031f9492148f410ed30bcab54"}, - {file = "mypy-1.11.1.tar.gz", hash = "sha256:f404a0b069709f18bbdb702eb3dcfe51910602995de00bd39cea3050b5772d08"}, -] - -[package.dependencies] -mypy-extensions = ">=1.0.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=4.6.0" - -[package.extras] -dmypy = ["psutil 
(>=4.0)"] -install-types = ["pip"] -mypyc = ["setuptools (>=50)"] -reports = ["lxml"] - -[[package]] -name = "mypy-extensions" -version = "1.0.0" -description = "Type system extensions for programs checked with the mypy type checker." -optional = false -python-versions = ">=3.5" -files = [ - {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, - {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, -] - -[[package]] -name = "packaging" -version = "24.1" -description = "Core utilities for Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, -] - -[[package]] -name = "pluggy" -version = "1.5.0" -description = "plugin and hook calling mechanisms for python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, - {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, -] - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - -[[package]] -name = "pygments" -version = "2.18.0" -description = "Pygments is a syntax highlighting package written in Python." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, - {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, -] - -[package.extras] -windows-terminal = ["colorama (>=0.4.6)"] - -[[package]] -name = "pytest" -version = "8.3.2" -description = "pytest: simple powerful testing with Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, - {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=1.5,<2" -tomli = {version = ">=1", markers = "python_version < \"3.11\""} - -[package.extras] -dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] - -[[package]] -name = "pytz" -version = "2024.1" -description = "World timezone definitions, modern and historical" -optional = false -python-versions = "*" -files = [ - {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, - {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, -] - -[[package]] -name = "requests" -version = "2.32.3" -description = "Python HTTP for Humans." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, -] - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" -idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] - -[[package]] -name = "ruff" -version = "0.5.6" -description = "An extremely fast Python linter and code formatter, written in Rust." -optional = false -python-versions = ">=3.7" -files = [ - {file = "ruff-0.5.6-py3-none-linux_armv6l.whl", hash = "sha256:a0ef5930799a05522985b9cec8290b185952f3fcd86c1772c3bdbd732667fdcd"}, - {file = "ruff-0.5.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b652dc14f6ef5d1552821e006f747802cc32d98d5509349e168f6bf0ee9f8f42"}, - {file = "ruff-0.5.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:80521b88d26a45e871f31e4b88938fd87db7011bb961d8afd2664982dfc3641a"}, - {file = "ruff-0.5.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9bc8f328a9f1309ae80e4d392836e7dbc77303b38ed4a7112699e63d3b066ab"}, - {file = "ruff-0.5.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d394940f61f7720ad371ddedf14722ee1d6250fd8d020f5ea5a86e7be217daf"}, - {file = "ruff-0.5.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:111a99cdb02f69ddb2571e2756e017a1496c2c3a2aeefe7b988ddab38b416d36"}, - {file = "ruff-0.5.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e395daba77a79f6dc0d07311f94cc0560375ca20c06f354c7c99af3bf4560c5d"}, - {file = "ruff-0.5.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c476acb43c3c51e3c614a2e878ee1589655fa02dab19fe2db0423a06d6a5b1b6"}, - {file = 
"ruff-0.5.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2ff8003f5252fd68425fd53d27c1f08b201d7ed714bb31a55c9ac1d4c13e2eb"}, - {file = "ruff-0.5.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c94e084ba3eaa80c2172918c2ca2eb2230c3f15925f4ed8b6297260c6ef179ad"}, - {file = "ruff-0.5.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1f77c1c3aa0669fb230b06fb24ffa3e879391a3ba3f15e3d633a752da5a3e670"}, - {file = "ruff-0.5.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f908148c93c02873210a52cad75a6eda856b2cbb72250370ce3afef6fb99b1ed"}, - {file = "ruff-0.5.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:563a7ae61ad284187d3071d9041c08019975693ff655438d8d4be26e492760bd"}, - {file = "ruff-0.5.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:94fe60869bfbf0521e04fd62b74cbca21cbc5beb67cbb75ab33fe8c174f54414"}, - {file = "ruff-0.5.6-py3-none-win32.whl", hash = "sha256:e6a584c1de6f8591c2570e171cc7ce482bb983d49c70ddf014393cd39e9dfaed"}, - {file = "ruff-0.5.6-py3-none-win_amd64.whl", hash = "sha256:d7fe7dccb1a89dc66785d7aa0ac283b2269712d8ed19c63af908fdccca5ccc1a"}, - {file = "ruff-0.5.6-py3-none-win_arm64.whl", hash = "sha256:57c6c0dd997b31b536bff49b9eee5ed3194d60605a4427f735eeb1f9c1b8d264"}, - {file = "ruff-0.5.6.tar.gz", hash = "sha256:07c9e3c2a8e1fe377dd460371c3462671a728c981c3205a5217291422209f642"}, -] - -[[package]] -name = "snowballstemmer" -version = "2.2.0" -description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." -optional = false -python-versions = "*" -files = [ - {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, - {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, -] - -[[package]] -name = "soupsieve" -version = "2.5" -description = "A modern CSS selector implementation for Beautiful Soup." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, - {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, -] - -[[package]] -name = "sphinx" -version = "7.1.2" -description = "Python documentation generator" -optional = false -python-versions = ">=3.8" -files = [ - {file = "sphinx-7.1.2-py3-none-any.whl", hash = "sha256:d170a81825b2fcacb6dfd5a0d7f578a053e45d3f2b153fecc948c37344eb4cbe"}, - {file = "sphinx-7.1.2.tar.gz", hash = "sha256:780f4d32f1d7d1126576e0e5ecc19dc32ab76cd24e950228dcf7b1f6d3d9e22f"}, -] - -[package.dependencies] -alabaster = ">=0.7,<0.8" -babel = ">=2.9" -colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} -docutils = ">=0.18.1,<0.21" -imagesize = ">=1.3" -importlib-metadata = {version = ">=4.8", markers = "python_version < \"3.10\""} -Jinja2 = ">=3.0" -packaging = ">=21.0" -Pygments = ">=2.13" -requests = ">=2.25.0" -snowballstemmer = ">=2.0" -sphinxcontrib-applehelp = "*" -sphinxcontrib-devhelp = "*" -sphinxcontrib-htmlhelp = ">=2.0.0" -sphinxcontrib-jsmath = "*" -sphinxcontrib-qthelp = "*" -sphinxcontrib-serializinghtml = ">=1.1.5" - -[package.extras] -docs = ["sphinxcontrib-websupport"] -lint = ["docutils-stubs", "flake8 (>=3.5.0)", "flake8-simplify", "isort", "mypy (>=0.990)", "ruff", "sphinx-lint", "types-requests"] -test = ["cython", "filelock", "html5lib", "pytest (>=4.6)"] - -[[package]] -name = "sphinx-basic-ng" -version = "1.0.0b2" -description = "A modern skeleton for Sphinx themes." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "sphinx_basic_ng-1.0.0b2-py3-none-any.whl", hash = "sha256:eb09aedbabfb650607e9b4b68c9d240b90b1e1be221d6ad71d61c52e29f7932b"}, - {file = "sphinx_basic_ng-1.0.0b2.tar.gz", hash = "sha256:9ec55a47c90c8c002b5960c57492ec3021f5193cb26cebc2dc4ea226848651c9"}, -] - -[package.dependencies] -sphinx = ">=4.0" - -[package.extras] -docs = ["furo", "ipython", "myst-parser", "sphinx-copybutton", "sphinx-inline-tabs"] - -[[package]] -name = "sphinxcontrib-applehelp" -version = "1.0.4" -description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" -optional = false -python-versions = ">=3.8" -files = [ - {file = "sphinxcontrib-applehelp-1.0.4.tar.gz", hash = "sha256:828f867945bbe39817c210a1abfd1bc4895c8b73fcaade56d45357a348a07d7e"}, - {file = "sphinxcontrib_applehelp-1.0.4-py3-none-any.whl", hash = "sha256:29d341f67fb0f6f586b23ad80e072c8e6ad0b48417db2bde114a4c9746feb228"}, -] - -[package.extras] -lint = ["docutils-stubs", "flake8", "mypy"] -test = ["pytest"] - -[[package]] -name = "sphinxcontrib-devhelp" -version = "1.0.2" -description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp document." 
-optional = false -python-versions = ">=3.5" -files = [ - {file = "sphinxcontrib-devhelp-1.0.2.tar.gz", hash = "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"}, - {file = "sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e"}, -] - -[package.extras] -lint = ["docutils-stubs", "flake8", "mypy"] -test = ["pytest"] - -[[package]] -name = "sphinxcontrib-htmlhelp" -version = "2.0.1" -description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" -optional = false -python-versions = ">=3.8" -files = [ - {file = "sphinxcontrib-htmlhelp-2.0.1.tar.gz", hash = "sha256:0cbdd302815330058422b98a113195c9249825d681e18f11e8b1f78a2f11efff"}, - {file = "sphinxcontrib_htmlhelp-2.0.1-py3-none-any.whl", hash = "sha256:c38cb46dccf316c79de6e5515e1770414b797162b23cd3d06e67020e1d2a6903"}, -] - -[package.extras] -lint = ["docutils-stubs", "flake8", "mypy"] -test = ["html5lib", "pytest"] - -[[package]] -name = "sphinxcontrib-jsmath" -version = "1.0.1" -description = "A sphinx extension which renders display math in HTML via JavaScript" -optional = false -python-versions = ">=3.5" -files = [ - {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, - {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, -] - -[package.extras] -test = ["flake8", "mypy", "pytest"] - -[[package]] -name = "sphinxcontrib-qthelp" -version = "1.0.3" -description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp document." 
-optional = false -python-versions = ">=3.5" -files = [ - {file = "sphinxcontrib-qthelp-1.0.3.tar.gz", hash = "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72"}, - {file = "sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl", hash = "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"}, -] - -[package.extras] -lint = ["docutils-stubs", "flake8", "mypy"] -test = ["pytest"] - -[[package]] -name = "sphinxcontrib-serializinghtml" -version = "1.1.5" -description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)." -optional = false -python-versions = ">=3.5" -files = [ - {file = "sphinxcontrib-serializinghtml-1.1.5.tar.gz", hash = "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"}, - {file = "sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl", hash = "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd"}, -] - -[package.extras] -lint = ["docutils-stubs", "flake8", "mypy"] -test = ["pytest"] - -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - -[[package]] -name = "typing-extensions" -version = "4.12.2" -description = "Backported and Experimental Type Hints for Python 3.8+" -optional = false -python-versions = ">=3.8" -files = [ - {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, - {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, -] - -[[package]] -name = "urllib3" -version = "2.2.2" -description = "HTTP library with thread-safe 
connection pooling, file post, and more." -optional = false -python-versions = ">=3.8" -files = [ - {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, - {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -h2 = ["h2 (>=4,<5)"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - -[[package]] -name = "wcwidth" -version = "0.2.13" -description = "Measures the displayed width of unicode strings in a terminal" -optional = false -python-versions = "*" -files = [ - {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, - {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, -] - -[[package]] -name = "zipp" -version = "3.19.2" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.8" -files = [ - {file = "zipp-3.19.2-py3-none-any.whl", hash = "sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c"}, - {file = "zipp-3.19.2.tar.gz", hash = "sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19"}, -] - -[package.extras] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] - -[metadata] -lock-version = "2.0" -python-versions = ">=3.8.1,<4" -content-hash = "d3d8b680e9511dc48a9ce073ee651541f50b4904b4b1a410266242a83059b98f" diff --git a/poetry.toml b/poetry.toml deleted file 
mode 100644 index 53b35d3..0000000 --- a/poetry.toml +++ /dev/null @@ -1,3 +0,0 @@ -[virtualenvs] -create = true -in-project = true diff --git a/pyproject.toml b/pyproject.toml index 1149606..e809d5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ -[tool.poetry] +[project] name = "ftfy" -version = "6.2.3" +version = "6.3.0" description = "Fixes mojibake and other problems with Unicode, after the fact" homepage = "https://ftfy.readthedocs.io/en/latest/" documentation = "https://ftfy.readthedocs.io/en/latest/" repository = "https://github.com/rspeer/python-ftfy" -authors = ["Robyn Speer "] +authors = [{ name = "Robyn Speer", email = "rspeer@arborelia.net"}] license = "Apache-2.0" include = [ { path = "README.md", format = "sdist" }, @@ -13,34 +13,32 @@ include = [ { path = "tests", format = "sdist" }, ] readme = "README.md" +dependencies = [ + "wcwidth" +] +requires-python = ">=3.9" -[tool.poetry.dependencies] -python = ">=3.8.1,<4" -wcwidth = "^0.2.12" - -[tool.poetry.group.dev.dependencies] -mypy = "^1.7.0" -Sphinx = ">=7, <8" -furo = ">=2024.7.18" -pytest = "^8.3.2" -ruff = "^0.5.6" - -[tool.poetry.scripts] +[project.scripts] ftfy = "ftfy.cli:main" -[tool.poetry.urls] +[project.urls] Issues = "https://github.com/rspeer/python-ftfy/issues/" Changelog = "https://github.com/rspeer/python-ftfy/blob/main/CHANGELOG.md" Cohost = "https://cohost.org/arborelia" [build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.uv] +dev-dependencies = [ + "pytest" +] [tool.ruff] exclude = ["badness.py"] line-length = 100 -target-version = "py38" +target-version = "py39" [tool.ruff.lint] select = ["B", "F", "I", "N", "ANN"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index b7e4789..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[aliases] -test=pytest diff --git a/setup.py b/setup.py deleted file mode 100644 index 10789bf..0000000 --- 
a/setup.py +++ /dev/null @@ -1,56 +0,0 @@ -import sys - -from setuptools import setup - -# Before we get to the rest of setup, with dependencies on setuptools and the -# Python 3 standard library, let's make sure we're not on Python 2 and provide -# a helpful message if we are. - -PY2_MESSAGE = "Python 2 is no longer supported. Please upgrade." - - -if sys.version_info[0] < 3: - print(PY2_MESSAGE) - readable_version = sys.version.split(" ")[0] - print("The version of Python you're running is: %s" % readable_version) - print("Python is running from: %r" % sys.executable) - sys.exit(1) - - -DESCRIPTION = open("README.md", encoding="utf-8").read() - -setup( - name="ftfy", - version="6.2.3", - maintainer="Robyn Speer", - maintainer_email="rspeer@arborelia.net", - license="Apache 2.0", - url="http://github.com/rspeer/python-ftfy", - platforms=["any"], - description="Fixes some problems with Unicode text after the fact", - long_description=DESCRIPTION, - long_description_content_type="text/markdown", - packages=["ftfy", "ftfy.bad_codecs"], - install_requires=["wcwidth"], - tests_require=["pytest"], - python_requires=">=3.8", - classifiers=[ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Text Processing :: Filters", - "Development Status :: 5 - Production/Stable", - ], - entry_points={"console_scripts": ["ftfy = ftfy.cli:main"]}, - extras_require={"docs": ["furo", "sphinx"]}, - project_urls={ - "Documentation": "http://ftfy.readthedocs.io", - }, -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff 
--git a/tox.ini b/tox.ini index 1f6e83c..ec356b7 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py38, py39, py310, py311, py312, py313 +envlist = py39, py310, py311, py312, py313 [testenv] deps = diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..373bd6a --- /dev/null +++ b/uv.lock @@ -0,0 +1,101 @@ +version = 1 +requires-python = ">=3.9" + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, +] + +[[package]] +name = "ftfy" +version = "6.3.0" +source = { editable = "." 
} +dependencies = [ + { name = "wcwidth" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [{ name = "wcwidth" }] + +[package.metadata.requires-dev] +dev = [{ name = "pytest" }] + +[[package]] +name = "iniconfig" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, +] + +[[package]] +name = "packaging" +version = "24.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/65/50db4dda066951078f0a96cf12f4b9ada6e4b811516bf0262c0f4f7064d4/packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002", size = 148788 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124", size = 53985 }, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = 
"sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, +] + +[[package]] +name = "pytest" +version = "8.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8b/6c/62bbd536103af674e227c41a8f3dcd022d591f6eed5facb5a0f31ee33bbc/pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181", size = 1442487 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/77/7440a06a8ead44c7757a64362dd22df5760f9b12dc5f11b6188cd2fc27a0/pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2", size = 342341 }, +] + +[[package]] +name = "tomli" +version = "2.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/b9/de2a5c0144d7d75a57ff355c0c24054f965b2dc3036456ae03a51ea6264b/tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed", size = 16096 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cf/db/ce8eda256fa131af12e0a76d481711abe4681b6923c27efb9a255c9e4594/tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38", size = 13237 }, +] + +[[package]] +name = "wcwidth" +version = "0.2.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, +] From 21a31d8218c09f6d5d655b6bf96f5318111842bd Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 02:05:11 -0400 Subject: [PATCH 03/34] add tox dev-dependency --- pyproject.toml | 3 +- uv.lock | 109 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e809d5b..fab6d2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,8 @@ build-backend = "hatchling.build" [tool.uv] dev-dependencies = [ - "pytest" + "pytest", + "tox" ] [tool.ruff] diff --git a/uv.lock b/uv.lock index 373bd6a..60a177b 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,24 @@ version = 1 requires-python = ">=3.9" +[[package]] +name = "cachetools" +version = "5.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/38/a0f315319737ecf45b4319a8cd1f3a908e29d9277b46942263292115eee7/cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a", size = 27661 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/07/14f8ad37f2d12a5ce41206c21820d8cb6561b728e51fad4530dff0552a67/cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292", size = 9524 }, +] + +[[package]] +name = "chardet" +version = "5.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -10,6 +28,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "distlib" +version = "0.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c4/91/e2df406fb4efacdf46871c25cde65d3c6ee5e173b7e5a4547a47bae91920/distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64", size = 609931 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784", size = 468850 }, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -19,6 +46,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, ] +[[package]] +name = "filelock" +version = "3.16.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9d/db/3ef5bb276dae18d6ec2124224403d1d67bccdbefc17af4cc8f553e341ab1/filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435", size = 18037 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0", size = 16163 }, +] + [[package]] name = "ftfy" version = "6.3.0" @@ -30,13 +66,17 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = "pytest" }, + { name = "tox" }, ] [package.metadata] requires-dist = [{ name = "wcwidth" }] [package.metadata.requires-dev] -dev = [{ name = "pytest" }] +dev = [ + { name = "pytest" }, + { name = "tox" }, +] [[package]] name = "iniconfig" @@ -56,6 +96,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124", size = 53985 }, ] +[[package]] +name = "platformdirs" +version = "4.3.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/13/fc/128cc9cb8f03208bdbf93d3aa862e16d376844a14f9a0ce5cf4507372de4/platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907", size = 21302 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb", size = 18439 }, +] + [[package]] name = "pluggy" version = "1.5.0" @@ -65,6 +114,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] +[[package]] +name = "pyproject-api" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { 
name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bb/19/441e0624a8afedd15bbcce96df1b80479dd0ff0d965f5ce8fde4f2f6ffad/pyproject_api-1.8.0.tar.gz", hash = "sha256:77b8049f2feb5d33eefcc21b57f1e279636277a8ac8ad6b5871037b243778496", size = 22340 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ba/f4/3c4ddfcc0c19c217c6de513842d286de8021af2f2ab79bbb86c00342d778/pyproject_api-1.8.0-py3-none-any.whl", hash = "sha256:3d7d347a047afe796fd5d1885b1e391ba29be7169bd2f102fcd378f04273d228", size = 13100 }, +] + [[package]] name = "pytest" version = "8.3.3" @@ -91,6 +153,51 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/db/ce8eda256fa131af12e0a76d481711abe4681b6923c27efb9a255c9e4594/tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38", size = 13237 }, ] +[[package]] +name = "tox" +version = "4.21.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cachetools" }, + { name = "chardet" }, + { name = "colorama" }, + { name = "filelock" }, + { name = "packaging" }, + { name = "platformdirs" }, + { name = "pluggy" }, + { name = "pyproject-api" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, + { name = "virtualenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/db/ba5b1a4cf664f221a33c3cbb1adf40ccccbbd13f5eec6d9d7291c7a39e44/tox-4.21.2.tar.gz", hash = "sha256:49381ff102296753e378fa5ff30e42a35e695f149b4dbf8a2c49d15fdb5797b2", size = 188539 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/20/168300f3f334e255b618322dce14b86a5c423aab05f28be14d1a2d6af14a/tox-4.21.2-py3-none-any.whl", hash = "sha256:13d996adcd792e7c82994b0e116d85efd84f0c6d185254d83d156f73f86b2038", size = 165698 }, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, +] + +[[package]] +name = "virtualenv" +version = "20.26.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "distlib" }, + { name = "filelock" }, + { name = "platformdirs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3f/40/abc5a766da6b0b2457f819feab8e9203cbeae29327bd241359f866a3da9d/virtualenv-20.26.6.tar.gz", hash = "sha256:280aede09a2a5c317e409a00102e7077c6432c5a38f0ef938e643805a7ad2c48", size = 9372482 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/59/90/57b8ac0c8a231545adc7698c64c5a36fa7cd8e376c691b9bde877269f2eb/virtualenv-20.26.6-py3-none-any.whl", hash = "sha256:7345cc5b25405607a624d8418154577459c3e0277f5466dd79c49d5e492995f2", size = 5999862 }, +] + [[package]] name = "wcwidth" version = "0.2.13" From 628cbb789914d1c83a10f208e53175a3521cc500 Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 02:05:31 -0400 Subject: [PATCH 04/34] ruff format --- ftfy/__init__.py | 20 +++++--------------- ftfy/bad_codecs/sloppy.py | 8 ++------ ftfy/bad_codecs/utf8_variants.py | 8 ++------ ftfy/chardata.py | 4 +--- ftfy/cli.py | 9 +++------ tests/test_encodings.py | 4 +--- 6 files changed, 14 insertions(+), 39 deletions(-) diff --git a/ftfy/__init__.py b/ftfy/__init__.py index 6b97b58..4d5c674 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -232,9 +232,7 @@ class TextFixerConfig(NamedTuple): explain: bool = True -def _config_from_kwargs( - 
config: TextFixerConfig, kwargs: Dict[str, Any] -) -> TextFixerConfig: +def _config_from_kwargs(config: TextFixerConfig, kwargs: Dict[str, Any]) -> TextFixerConfig: """ Handle parameters provided as keyword arguments to ftfy's top-level functions, converting them into a TextFixerConfig. @@ -470,9 +468,7 @@ def fix_encoding_and_explain( return ExplainedText(text, plan_so_far) -def _fix_encoding_one_step_and_explain( - text: str, config: TextFixerConfig -) -> ExplainedText: +def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> ExplainedText: """ Perform one step of fixing the encoding of text. """ @@ -518,9 +514,7 @@ def _fix_encoding_one_step_and_explain( ): replaced_bytes = fixes.restore_byte_a0(encoded_bytes) if replaced_bytes != encoded_bytes: - transcode_steps.append( - ExplanationStep("transcode", "restore_byte_a0") - ) + transcode_steps.append(ExplanationStep("transcode", "restore_byte_a0")) encoded_bytes = replaced_bytes # Replace sequences where information has been lost @@ -588,9 +582,7 @@ def _fix_encoding_one_step_and_explain( return ExplainedText(text, []) -def fix_encoding( - text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any -) -> str: +def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any) -> str: """ Apply just the encoding-fixing steps of ftfy to this text. Returns the fixed text, discarding the explanation. @@ -611,9 +603,7 @@ def fix_encoding( ftfy = fix_text -def fix_text_segment( - text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any -) -> str: +def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any) -> str: """ Fix text as a single segment, with a consistent sequence of steps that are applied to fix the text. Discard the explanation. 
diff --git a/ftfy/bad_codecs/sloppy.py b/ftfy/bad_codecs/sloppy.py index 6f63b1e..a64f072 100644 --- a/ftfy/bad_codecs/sloppy.py +++ b/ftfy/bad_codecs/sloppy.py @@ -121,14 +121,10 @@ def make_sloppy_codec(encoding: str) -> codecs.CodecInfo: # `encodings.cp1252` for comparison; this is almost exactly the same, # except I made it follow pep8. class Codec(codecs.Codec): - def encode( - self, input: str, errors: Optional[str] = "strict" - ) -> Tuple[bytes, int]: + def encode(self, input: str, errors: Optional[str] = "strict") -> Tuple[bytes, int]: return codecs.charmap_encode(input, errors, encoding_table) - def decode( - self, input: bytes, errors: Optional[str] = "strict" - ) -> Tuple[str, int]: + def decode(self, input: bytes, errors: Optional[str] = "strict") -> Tuple[str, int]: return codecs.charmap_decode(input, errors, decoding_table) # type: ignore[arg-type] class IncrementalEncoder(codecs.IncrementalEncoder): diff --git a/ftfy/bad_codecs/utf8_variants.py b/ftfy/bad_codecs/utf8_variants.py index 28366a5..5bb6bdd 100644 --- a/ftfy/bad_codecs/utf8_variants.py +++ b/ftfy/bad_codecs/utf8_variants.py @@ -137,9 +137,7 @@ def _buffer_decode( # type: ignore[override] return "".join(decoded_segments), position @staticmethod - def _buffer_decode_step( - input: bytes, errors: Optional[str], final: bool - ) -> Tuple[str, int]: + def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> Tuple[str, int]: """ There are three possibilities for each decoding step: @@ -178,9 +176,7 @@ def _buffer_decode_step( return "", 0 else: # Decode a possible six-byte sequence starting with 0xed. 
- return IncrementalDecoder._buffer_decode_surrogates( - sup, input, errors, final - ) + return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final) @staticmethod def _buffer_decode_surrogates( diff --git a/ftfy/chardata.py b/ftfy/chardata.py index 2e0e82a..89d9b2c 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -310,8 +310,6 @@ def _build_width_map() -> Dict[int, str]: | [{utf8_first_of_4}] [{utf8_continuation}]{{3}} )+ -""".format( - **UTF8_CLUES - ), +""".format(**UTF8_CLUES), re.VERBOSE, ) diff --git a/ftfy/cli.py b/ftfy/cli.py index dfb2e93..f233ad8 100644 --- a/ftfy/cli.py +++ b/ftfy/cli.py @@ -54,8 +54,7 @@ def main() -> None: "filename", default="-", nargs="?", - help="The file whose Unicode is to be fixed. Defaults " - "to -, meaning standard input.", + help="The file whose Unicode is to be fixed. Defaults " "to -, meaning standard input.", ) parser.add_argument( "-o", @@ -68,8 +67,7 @@ def main() -> None: "-g", "--guess", action="store_true", - help="Ask ftfy to guess the encoding of your input. " - "This is risky. Overrides -e.", + help="Ask ftfy to guess the encoding of your input. " "This is risky. Overrides -e.", ) parser.add_argument( "-e", @@ -83,8 +81,7 @@ def main() -> None: "--normalization", type=str, default="NFC", - help="The normalization of Unicode to apply. " - 'Defaults to NFC. Can be "none".', + help="The normalization of Unicode to apply. " 'Defaults to NFC. 
Can be "none".', ) parser.add_argument( "--preserve-entities", diff --git a/tests/test_encodings.py b/tests/test_encodings.py index 037404c..c3c9c2e 100644 --- a/tests/test_encodings.py +++ b/tests/test_encodings.py @@ -6,9 +6,7 @@ def test_cesu8(): cls2 = bad_codecs.search_function("cesu-8").__class__ assert cls1 == cls2 - test_bytes = ( - b"\xed\xa6\x9d\xed\xbd\xb7 is an unassigned character, and \xc0\x80 is null" - ) + test_bytes = b"\xed\xa6\x9d\xed\xbd\xb7 is an unassigned character, and \xc0\x80 is null" test_text = "\U00077777 is an unassigned character, and \x00 is null" assert test_bytes.decode("cesu8") == test_text From 850899910c25cecd31bd08d8549cb521d0a3add0 Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 02:06:38 -0400 Subject: [PATCH 05/34] exclude notebook from ruff formatting --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fab6d2b..619db65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dev-dependencies = [ ] [tool.ruff] -exclude = ["badness.py"] +exclude = ["badness.py", "notebook"] line-length = 100 target-version = "py39" From 3bfaf50ab875bd3b0d0cdeb421f09be7b1f4a477 Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 02:08:15 -0400 Subject: [PATCH 06/34] format fixes on multiline strings --- ftfy/cli.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ftfy/cli.py b/ftfy/cli.py index f233ad8..fa4e2a2 100644 --- a/ftfy/cli.py +++ b/ftfy/cli.py @@ -54,20 +54,20 @@ def main() -> None: "filename", default="-", nargs="?", - help="The file whose Unicode is to be fixed. Defaults " "to -, meaning standard input.", + help="The file whose Unicode is to be fixed. Defaults to -, meaning standard input.", ) parser.add_argument( "-o", "--output", type=str, default="-", - help="The file to output to. Defaults to -, meaning " "standard output.", + help="The file to output to. 
Defaults to -, meaning standard output.", ) parser.add_argument( "-g", "--guess", action="store_true", - help="Ask ftfy to guess the encoding of your input. " "This is risky. Overrides -e.", + help="Ask ftfy to guess the encoding of your input. This is risky. Overrides -e.", ) parser.add_argument( "-e", @@ -81,14 +81,13 @@ def main() -> None: "--normalization", type=str, default="NFC", - help="The normalization of Unicode to apply. " 'Defaults to NFC. Can be "none".', + help='The normalization of Unicode to apply. Defaults to NFC. Can be "none".', ) parser.add_argument( "--preserve-entities", action="store_true", help="Leave HTML entities as they are. The default " - "is to decode them, as long as no HTML tags " - "have appeared in the file.", + "is to decode them, as long as no HTML tags have appeared in the file.", ) args = parser.parse_args() From e3ecfce96491133a77a0c8a4a187d75d7bcc6549 Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 02:19:45 -0400 Subject: [PATCH 07/34] install tox as a global tool, not a dev-dep --- pyproject.toml | 3 +- uv.lock | 109 +------------------------------------------------ 2 files changed, 2 insertions(+), 110 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 619db65..c5061c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ build-backend = "hatchling.build" [tool.uv] dev-dependencies = [ "pytest", - "tox" ] [tool.ruff] @@ -42,7 +41,7 @@ line-length = 100 target-version = "py39" [tool.ruff.lint] -select = ["B", "F", "I", "N", "ANN"] +select = ["B", "F", "I", "N", "ANN", "UP"] ignore = ["ANN101", "ANN401"] [tool.ruff.lint.per-file-ignores] diff --git a/uv.lock b/uv.lock index 60a177b..373bd6a 100644 --- a/uv.lock +++ b/uv.lock @@ -1,24 +1,6 @@ version = 1 requires-python = ">=3.9" -[[package]] -name = "cachetools" -version = "5.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/c3/38/a0f315319737ecf45b4319a8cd1f3a908e29d9277b46942263292115eee7/cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a", size = 27661 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/07/14f8ad37f2d12a5ce41206c21820d8cb6561b728e51fad4530dff0552a67/cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292", size = 9524 }, -] - -[[package]] -name = "chardet" -version = "5.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 }, -] - [[package]] name = "colorama" version = "0.4.6" @@ -28,15 +10,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] -[[package]] -name = "distlib" -version = "0.3.8" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c4/91/e2df406fb4efacdf46871c25cde65d3c6ee5e173b7e5a4547a47bae91920/distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64", size = 609931 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/41/9307e4f5f9976bc8b7fea0b66367734e8faf3ec84bc0d412d8cfabbb66cd/distlib-0.3.8-py2.py3-none-any.whl", hash = 
"sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784", size = 468850 }, -] - [[package]] name = "exceptiongroup" version = "1.2.2" @@ -46,15 +19,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, ] -[[package]] -name = "filelock" -version = "3.16.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9d/db/3ef5bb276dae18d6ec2124224403d1d67bccdbefc17af4cc8f553e341ab1/filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435", size = 18037 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0", size = 16163 }, -] - [[package]] name = "ftfy" version = "6.3.0" @@ -66,17 +30,13 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = "pytest" }, - { name = "tox" }, ] [package.metadata] requires-dist = [{ name = "wcwidth" }] [package.metadata.requires-dev] -dev = [ - { name = "pytest" }, - { name = "tox" }, -] +dev = [{ name = "pytest" }] [[package]] name = "iniconfig" @@ -96,15 +56,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124", size = 53985 }, ] -[[package]] -name = "platformdirs" -version = "4.3.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/13/fc/128cc9cb8f03208bdbf93d3aa862e16d376844a14f9a0ce5cf4507372de4/platformdirs-4.3.6.tar.gz", hash = 
"sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907", size = 21302 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb", size = 18439 }, -] - [[package]] name = "pluggy" version = "1.5.0" @@ -114,19 +65,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] -[[package]] -name = "pyproject-api" -version = "1.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "packaging" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bb/19/441e0624a8afedd15bbcce96df1b80479dd0ff0d965f5ce8fde4f2f6ffad/pyproject_api-1.8.0.tar.gz", hash = "sha256:77b8049f2feb5d33eefcc21b57f1e279636277a8ac8ad6b5871037b243778496", size = 22340 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ba/f4/3c4ddfcc0c19c217c6de513842d286de8021af2f2ab79bbb86c00342d778/pyproject_api-1.8.0-py3-none-any.whl", hash = "sha256:3d7d347a047afe796fd5d1885b1e391ba29be7169bd2f102fcd378f04273d228", size = 13100 }, -] - [[package]] name = "pytest" version = "8.3.3" @@ -153,51 +91,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/db/ce8eda256fa131af12e0a76d481711abe4681b6923c27efb9a255c9e4594/tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38", size = 13237 }, ] -[[package]] -name = "tox" -version = "4.21.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cachetools" }, - { name = "chardet" }, - { name = "colorama" }, - { name = "filelock" }, - { name = "packaging" }, - { 
name = "platformdirs" }, - { name = "pluggy" }, - { name = "pyproject-api" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, - { name = "virtualenv" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2a/db/ba5b1a4cf664f221a33c3cbb1adf40ccccbbd13f5eec6d9d7291c7a39e44/tox-4.21.2.tar.gz", hash = "sha256:49381ff102296753e378fa5ff30e42a35e695f149b4dbf8a2c49d15fdb5797b2", size = 188539 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f8/20/168300f3f334e255b618322dce14b86a5c423aab05f28be14d1a2d6af14a/tox-4.21.2-py3-none-any.whl", hash = "sha256:13d996adcd792e7c82994b0e116d85efd84f0c6d185254d83d156f73f86b2038", size = 165698 }, -] - -[[package]] -name = "typing-extensions" -version = "4.12.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, -] - -[[package]] -name = "virtualenv" -version = "20.26.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "distlib" }, - { name = "filelock" }, - { name = "platformdirs" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3f/40/abc5a766da6b0b2457f819feab8e9203cbeae29327bd241359f866a3da9d/virtualenv-20.26.6.tar.gz", hash = "sha256:280aede09a2a5c317e409a00102e7077c6432c5a38f0ef938e643805a7ad2c48", size = 9372482 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/59/90/57b8ac0c8a231545adc7698c64c5a36fa7cd8e376c691b9bde877269f2eb/virtualenv-20.26.6-py3-none-any.whl", hash = "sha256:7345cc5b25405607a624d8418154577459c3e0277f5466dd79c49d5e492995f2", size = 5999862 }, -] - [[package]] name = "wcwidth" version = "0.2.13" From 2be9647268193668021198e992fe23dd92f146ff Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 02:20:02 -0400 Subject: [PATCH 08/34] run pyupgrade --- docs/conf.py | 1 - ftfy/__init__.py | 45 ++++++++++++++------------------ ftfy/bad_codecs/__init__.py | 4 +-- ftfy/bad_codecs/sloppy.py | 11 ++++---- ftfy/bad_codecs/utf8_variants.py | 14 +++++----- ftfy/chardata.py | 13 +++++---- ftfy/cli.py | 2 +- ftfy/fixes.py | 7 ++--- 8 files changed, 45 insertions(+), 52 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index bca9489..3dcc751 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # ftfy documentation build configuration file, created by # sphinx-quickstart on Wed Aug 28 03:18:27 2013. diff --git a/ftfy/__init__.py b/ftfy/__init__.py index 4d5c674..40044cf 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -9,19 +9,14 @@ import unicodedata import warnings +from collections.abc import Iterator from typing import ( Any, BinaryIO, Callable, - Dict, - Iterator, - List, Literal, NamedTuple, - Optional, TextIO, - Tuple, - Union, cast, ) @@ -73,11 +68,11 @@ class ExplainedText(NamedTuple): """ text: str - explanation: Optional[List[ExplanationStep]] + explanation: list[ExplanationStep] | None # Functions that can be applied using `apply_plan`. -FIXERS: Dict[str, Callable] = { # type: ignore[type-arg] +FIXERS: dict[str, Callable] = { # type: ignore[type-arg] "unescape_html": fixes.unescape_html, "remove_terminal_escapes": fixes.remove_terminal_escapes, "restore_byte_a0": fixes.restore_byte_a0, @@ -214,7 +209,7 @@ class TextFixerConfig(NamedTuple): will automatically set `explain` to False. 
""" - unescape_html: Union[str, bool] = "auto" + unescape_html: str | bool = "auto" remove_terminal_escapes: bool = True fix_encoding: bool = True restore_byte_a0: bool = True @@ -227,12 +222,12 @@ class TextFixerConfig(NamedTuple): fix_line_breaks: bool = True fix_surrogates: bool = True remove_control_chars: bool = True - normalization: Optional[Literal["NFC", "NFD", "NFKC", "NFKD"]] = "NFC" + normalization: Literal["NFC", "NFD", "NFKC", "NFKD"] | None = "NFC" max_decode_length: int = 1000000 explain: bool = True -def _config_from_kwargs(config: TextFixerConfig, kwargs: Dict[str, Any]) -> TextFixerConfig: +def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> TextFixerConfig: """ Handle parameters provided as keyword arguments to ftfy's top-level functions, converting them into a TextFixerConfig. @@ -274,7 +269,7 @@ def _try_fix( fixer_name: str, text: str, config: TextFixerConfig, - steps: Optional[List[ExplanationStep]], + steps: list[ExplanationStep] | None, ) -> str: """ A helper function used across several 'fixer' steps, deciding whether to @@ -290,7 +285,7 @@ def _try_fix( return text -def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any) -> str: +def fix_text(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: r""" Given Unicode text as input, fix inconsistencies and glitches in it, such as mojibake (text that was decoded in the wrong encoding). 
@@ -365,7 +360,7 @@ def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any) def fix_and_explain( - text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any + text: str, config: TextFixerConfig | None = None, **kwargs: Any ) -> ExplainedText: """ Fix text as a single segment, returning the fixed text and an explanation @@ -384,7 +379,7 @@ def fix_and_explain( config = config._replace(unescape_html=False) if config.explain: - steps: Optional[List[ExplanationStep]] = [] + steps: list[ExplanationStep] | None = [] else: # If explanations aren't desired, `steps` will be None steps = None @@ -425,7 +420,7 @@ def fix_and_explain( def fix_encoding_and_explain( - text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any + text: str, config: TextFixerConfig | None = None, **kwargs: Any ) -> ExplainedText: """ Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed @@ -458,7 +453,7 @@ def fix_encoding_and_explain( # fixing the encoding return ExplainedText(text, []) - plan_so_far: List[ExplanationStep] = [] + plan_so_far: list[ExplanationStep] = [] while True: prevtext = text text, plan = _fix_encoding_one_step_and_explain(text, config) @@ -582,7 +577,7 @@ def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> Ex return ExplainedText(text, []) -def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any) -> str: +def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: """ Apply just the encoding-fixing steps of ftfy to this text. Returns the fixed text, discarding the explanation. 
@@ -603,7 +598,7 @@ def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs: ftfy = fix_text -def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwargs: Any) -> str: +def fix_text_segment(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: """ Fix text as a single segment, with a consistent sequence of steps that are applied to fix the text. Discard the explanation. @@ -617,8 +612,8 @@ def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwar def fix_file( input_file: TextIO | BinaryIO, - encoding: Optional[str] = None, - config: Optional[TextFixerConfig] = None, + encoding: str | None = None, + config: TextFixerConfig | None = None, **kwargs: Any, ) -> Iterator[str]: """ @@ -648,7 +643,7 @@ def fix_file( yield fixed_line -def guess_bytes(bstring: bytes) -> Tuple[str, str]: +def guess_bytes(bstring: bytes) -> tuple[str, str]: """ NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy is not designed to be an encoding detector. @@ -719,7 +714,7 @@ def guess_bytes(bstring: bytes) -> Tuple[str, str]: return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252" -def apply_plan(text: str, plan: List[Tuple[str, str]]) -> str: +def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: """ Apply a plan for fixing the encoding of text. 
@@ -753,9 +748,9 @@ def apply_plan(text: str, plan: List[Tuple[str, str]]) -> str: if encoding in FIXERS: obj = FIXERS[encoding](obj) else: - raise ValueError("Unknown function to apply: %s" % encoding) + raise ValueError(f"Unknown function to apply: {encoding}") else: - raise ValueError("Unknown plan step: %s" % operation) + raise ValueError(f"Unknown plan step: {operation}") return obj diff --git a/ftfy/bad_codecs/__init__.py b/ftfy/bad_codecs/__init__.py index bb71136..a449a38 100644 --- a/ftfy/bad_codecs/__init__.py +++ b/ftfy/bad_codecs/__init__.py @@ -32,9 +32,9 @@ import codecs from encodings import normalize_encoding -from typing import Dict, Optional +from typing import Optional -_CACHE: Dict[str, codecs.CodecInfo] = {} +_CACHE: dict[str, codecs.CodecInfo] = {} # Define some aliases for 'utf-8-variants'. All hyphens get turned into # underscores, because of `normalize_encoding`. diff --git a/ftfy/bad_codecs/sloppy.py b/ftfy/bad_codecs/sloppy.py index a64f072..656f01c 100644 --- a/ftfy/bad_codecs/sloppy.py +++ b/ftfy/bad_codecs/sloppy.py @@ -76,7 +76,6 @@ import codecs from encodings import normalize_encoding -from typing import Optional, Tuple REPLACEMENT_CHAR = "\ufffd" @@ -121,10 +120,10 @@ def make_sloppy_codec(encoding: str) -> codecs.CodecInfo: # `encodings.cp1252` for comparison; this is almost exactly the same, # except I made it follow pep8. 
class Codec(codecs.Codec): - def encode(self, input: str, errors: Optional[str] = "strict") -> Tuple[bytes, int]: + def encode(self, input: str, errors: str | None = "strict") -> tuple[bytes, int]: return codecs.charmap_encode(input, errors, encoding_table) - def decode(self, input: bytes, errors: Optional[str] = "strict") -> Tuple[str, int]: + def decode(self, input: bytes, errors: str | None = "strict") -> tuple[str, int]: return codecs.charmap_decode(input, errors, decoding_table) # type: ignore[arg-type] class IncrementalEncoder(codecs.IncrementalEncoder): @@ -156,9 +155,9 @@ class StreamReader(Codec, codecs.StreamReader): # can be used by the main module of ftfy.bad_codecs. CODECS = {} INCOMPLETE_ENCODINGS = ( - ["windows-%s" % num for num in range(1250, 1259)] - + ["iso-8859-%s" % num for num in (3, 6, 7, 8, 11)] - + ["cp%s" % num for num in range(1250, 1259)] + [f"windows-{num}" for num in range(1250, 1259)] + + [f"iso-8859-{num}" for num in (3, 6, 7, 8, 11)] + + [f"cp{num}" for num in range(1250, 1259)] + ["cp874"] ) diff --git a/ftfy/bad_codecs/utf8_variants.py b/ftfy/bad_codecs/utf8_variants.py index 5bb6bdd..c15a3cf 100644 --- a/ftfy/bad_codecs/utf8_variants.py +++ b/ftfy/bad_codecs/utf8_variants.py @@ -47,7 +47,7 @@ from encodings.utf_8 import ( IncrementalEncoder as UTF8IncrementalEncoder, ) -from typing import Callable, Optional, Tuple +from typing import Callable, Optional NAME = "utf-8-variants" @@ -95,7 +95,7 @@ class IncrementalDecoder(UTF8IncrementalDecoder): @staticmethod def _buffer_decode( # type: ignore[override] input: bytes, errors: Optional[str], final: bool - ) -> Tuple[str, int]: + ) -> tuple[str, int]: """ Decode bytes that may be arriving in a stream, following the Codecs API. 
@@ -137,7 +137,7 @@ def _buffer_decode( # type: ignore[override] return "".join(decoded_segments), position @staticmethod - def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> Tuple[str, int]: + def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> tuple[str, int]: """ There are three possibilities for each decoding step: @@ -180,11 +180,11 @@ def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> Tup @staticmethod def _buffer_decode_surrogates( - sup: Callable[[bytes, Optional[str], bool], Tuple[str, int]], + sup: Callable[[bytes, Optional[str], bool], tuple[str, int]], input: bytes, errors: Optional[str], final: bool, - ) -> Tuple[str, int]: + ) -> tuple[str, int]: """ When we have improperly encoded surrogates, we can still see the bits that they were meant to represent. @@ -235,13 +235,13 @@ def _buffer_decode_surrogates( class StreamWriter(codecs.StreamWriter): @staticmethod - def encode(input: str, errors: str = "strict") -> Tuple[bytes, int]: + def encode(input: str, errors: str = "strict") -> tuple[bytes, int]: return IncrementalEncoder(errors).encode(input, final=True), len(input) class StreamReader(codecs.StreamReader): @staticmethod - def decode(input: bytes, errors: str = "strict") -> Tuple[str, int]: + def decode(input: bytes, errors: str = "strict") -> tuple[str, int]: return IncrementalDecoder(errors).decode(input, final=True), len(input) diff --git a/ftfy/chardata.py b/ftfy/chardata.py index 89d9b2c..198cbb8 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -9,7 +9,6 @@ import itertools import re import unicodedata -from typing import Dict # These are the encodings we will try to fix in ftfy, in the # order that they should be tried. 
@@ -29,7 +28,7 @@ DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]") -def _build_regexes() -> Dict[str, re.Pattern[str]]: +def _build_regexes() -> dict[str, re.Pattern[str]]: """ ENCODING_REGEXES contain reasonably fast ways to detect if we could represent a given string in a given encoding. The simplest one is @@ -51,7 +50,7 @@ def _build_regexes() -> Dict[str, re.Pattern[str]]: # support, so we can just include them as ranges. This also lets us # not worry about escaping regex special characters, because all of # them are in the \x1B to \x7F range. - regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist) + regex = f"^[\x00-\x19\x1b-\x7f{charlist}]*$" encoding_regexes[encoding] = re.compile(regex) return encoding_regexes @@ -59,7 +58,7 @@ def _build_regexes() -> Dict[str, re.Pattern[str]]: ENCODING_REGEXES = _build_regexes() -def _build_html_entities() -> Dict[str, str]: +def _build_html_entities() -> dict[str, str]: entities = {} # Create a dictionary based on the built-in HTML5 entity dictionary. # Add a limited set of HTML entities that we'll also decode if they've @@ -94,13 +93,13 @@ def possible_encoding(text: str, encoding: str) -> bool: return bool(ENCODING_REGEXES[encoding].match(text)) -def _build_control_char_mapping() -> Dict[int, None]: +def _build_control_char_mapping() -> dict[int, None]: """ Build a translate mapping that strips likely-unintended control characters. See :func:`ftfy.fixes.remove_control_chars` for a description of these codepoint ranges and why they should be removed. """ - control_chars: Dict[int, None] = {} + control_chars: dict[int, None] = {} for i in itertools.chain( range(0x00, 0x09), @@ -230,7 +229,7 @@ def _build_control_char_mapping() -> Dict[int, None]: } -def _build_width_map() -> Dict[int, str]: +def _build_width_map() -> dict[int, str]: """ Build a translate mapping that replaces halfwidth and fullwidth forms with their standard-width forms. 
diff --git a/ftfy/cli.py b/ftfy/cli.py index fa4e2a2..2807a86 100644 --- a/ftfy/cli.py +++ b/ftfy/cli.py @@ -48,7 +48,7 @@ def main() -> None: import argparse parser = argparse.ArgumentParser( - description="ftfy (fixes text for you), version %s" % __version__ + description=f"ftfy (fixes text for you), version {__version__}" ) parser.add_argument( "filename", diff --git a/ftfy/fixes.py b/ftfy/fixes.py index a248bc8..41d3c2f 100644 --- a/ftfy/fixes.py +++ b/ftfy/fixes.py @@ -14,7 +14,8 @@ import html import re import warnings -from typing import Any, List, Match, Tuple +from re import Match +from typing import Any import ftfy from ftfy.badness import is_bad @@ -57,7 +58,7 @@ def fix_encoding(text: str) -> str: return ftfy.fix_encoding(text) -def apply_plan(text: str, plan: List[Tuple[str, str]]) -> str: +def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: """ Deprecated copy of `ftfy.apply_plan()`. """ @@ -474,7 +475,7 @@ def replace_lossy_sequences(byts: bytes) -> bytes: This is used as a transcoder within `fix_encoding`. 
""" - return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts) + return LOSSY_UTF8_RE.sub("\ufffd".encode(), byts) def decode_inconsistent_utf8(text: str) -> str: From 27ab581203d51796fad73205ae90ce95bcfc0c91 Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 11:43:13 -0400 Subject: [PATCH 09/34] update build/dev instructions --- .github/workflows/publish.yml | 4 ++-- README.md | 16 +++++----------- docs/index.rst | 7 ++++++- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index fd5baec..5488842 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -14,8 +14,8 @@ jobs: python-version: 3.11 - run: | - pip install poetry - poetry build + pip install hatchling + hatchling build - uses: actions/upload-artifact@v3 with: diff --git a/README.md b/README.md index 00df49c..7b5de26 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,7 @@ ``` -The full documentation of ftfy is available at [ftfy.readthedocs.org](https://ftfy.readthedocs.org). The documentation covers a lot more than this README, so here are -some links into it: +The full documentation of ftfy is available at [ftfy.readthedocs.org](https://ftfy.readthedocs.org). The documentation covers a lot more than this README, so here are some links into it: - [Fixing problems and getting explanations](https://ftfy.readthedocs.io/en/latest/explain.html) - [Configuring ftfy](https://ftfy.readthedocs.io/en/latest/config.html) @@ -36,8 +35,6 @@ some links into it: — Brennan Young - “I have no idea when I’m gonna need this, but I’m definitely bookmarking it.” — [/u/ocrow](https://reddit.com/u/ocrow) -- “9.2/10” - — [pylint](https://bitbucket.org/logilab/pylint/) ## What it does @@ -93,14 +90,12 @@ If you use `poetry`, you can use ftfy as a dependency in the usual way (such as ### Local development -ftfy is developed using `poetry`. Its `setup.py` is vestigial and is not the recommended way to install it. 
- -[Install Poetry](https://python-poetry.org/docs/master/#installing-with-the-official-installer), check out this repository, and run `poetry install` to install ftfy for local development, such as experimenting with the heuristic or running tests. +ftfy is developed using [uv](https://github.com/astral-sh/uv). You can build a virtual environment with its local dependencies by running `uv venv`, and test it with `uv run pytest`. ## Who maintains ftfy? -I'm Robyn Speer, also known as Elia Robyn Lake. You can find me -[on GitHub](https://github.com/rspeer) or [Cohost](https://cohost.org/arborelia). +I'm Robyn Speer, also known as Elia Robyn Lake. You can find my projects +[on GitHub](https://github.com/rspeer) and my posts on [my own blog](https://posts.arborelia.net). ## Citing ftfy @@ -108,8 +103,7 @@ ftfy has been used as a crucial data processing step in major NLP research. It's important to give credit appropriately to everyone whose work you build on in research. This includes software, not just high-status contributions such as mathematical models. All I ask when you use ftfy for research is that you cite it. -ftfy has a citable record [on Zenodo](https://zenodo.org/record/2591652). -A citation of ftfy may look like this: +ftfy has a citable record [on Zenodo](https://zenodo.org/record/2591652). A citation of ftfy may look like this: Robyn Speer. (2019). ftfy (Version 5.5). Zenodo. http://doi.org/10.5281/zenodo.2591652 diff --git a/docs/index.rst b/docs/index.rst index 7f339d2..a62feed 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,7 +1,12 @@ ftfy: fixes text for you ======================== -*Version 6.2* +*Version 6.3* + +“Assume all external input is the result of (a series of) bugs.” +— `RFC 9225`_: Software Defects Considered Harmful + +.. _`RFC 9225`: https://www.rfc-editor.org/rfc/rfc9225.html#confirmed **ftfy** fixes Unicode that's broken in various ways. 
From 57561f96367c22550560d161fd07a31345dc7659 Mon Sep 17 00:00:00 2001 From: arborelia Date: Tue, 8 Oct 2024 11:47:57 -0400 Subject: [PATCH 10/34] update mypy config --- mypy.ini | 2 +- pyproject.toml | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mypy.ini b/mypy.ini index 43839a2..278ee78 100644 --- a/mypy.ini +++ b/mypy.ini @@ -14,7 +14,7 @@ warn_redundant_casts = True warn_return_any = True warn_unused_configs = True warn_unused_ignores = True -python_version = 3.8 +python_version = 3.9 [mypy-wcwidth] ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index c5061c4..f736e0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,10 @@ build-backend = "hatchling.build" [tool.uv] dev-dependencies = [ - "pytest", + "Sphinx >=7, <8", + "furo >= 2024.7.18", + "pytest >= 8.3.2, < 9", + "ruff" ] [tool.ruff] From 0a60ddc2907d237769e2e77ee396c11d0109d796 Mon Sep 17 00:00:00 2001 From: arborelia Date: Wed, 9 Oct 2024 18:57:47 -0400 Subject: [PATCH 11/34] work on windows-1257 support --- docs/conf.py | 4 ++-- ftfy/__init__.py | 2 +- ftfy/badness.py | 29 +++++++++++++++++++++++++++-- ftfy/chardata.py | 8 ++++++-- pyproject.toml | 2 +- tests/test_cases.json | 27 +++++++++++++++++++++++++++ 6 files changed, 64 insertions(+), 8 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 3dcc751..1bff5e9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -46,9 +46,9 @@ # built documents. # # The short X.Y version. -version = "6.2" +version = "6.3" # The full version, including alpha/beta/rc tags. -release = "6.2.3" +release = "6.3.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/ftfy/__init__.py b/ftfy/__init__.py index 40044cf..15d280e 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -24,7 +24,7 @@ from ftfy.badness import is_bad from ftfy.formatting import display_ljust -__version__ = "6.2.3" +__version__ = "6.3.0" # Though this function does nothing, it lets linters know that we're using diff --git a/ftfy/badness.py b/ftfy/badness.py index 81c5274..52ed7ef 100644 --- a/ftfy/badness.py +++ b/ftfy/badness.py @@ -16,7 +16,7 @@ import re -# There are only 403 characters that occur in known UTF-8 mojibake, and we can +# There are only a few hundred characters that occur in known UTF-8 mojibake, and we can # characterize them: MOJIBAKE_CATEGORIES = { @@ -132,6 +132,9 @@ "ò-ö" "ø-ü" "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" + "\N{LATIN CAPITAL LETTER O WITH MACRON}" + "\N{LATIN CAPITAL LETTER U WITH MACRON}" + "\N{LATIN CAPITAL LETTER U WITH OGONEK}" "\N{DEGREE SIGN}" ), "upper_accented": ( @@ -143,6 +146,7 @@ "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" "\N{LATIN CAPITAL LETTER A WITH BREVE}" + "\N{LATIN CAPITAL LETTER A WITH MACRON}" "\N{LATIN CAPITAL LETTER A WITH OGONEK}" "\N{LATIN CAPITAL LETTER C WITH ACUTE}" "\N{LATIN CAPITAL LETTER C WITH CARON}" @@ -150,13 +154,20 @@ "\N{LATIN CAPITAL LETTER D WITH STROKE}" "\N{LATIN CAPITAL LETTER E WITH OGONEK}" "\N{LATIN CAPITAL LETTER E WITH CARON}" + "\N{LATIN CAPITAL LETTER E WITH MACRON}" + "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}" "\N{LATIN CAPITAL LETTER G WITH BREVE}" + "\N{LATIN CAPITAL LETTER G WITH CEDILLA}" "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" + "\N{LATIN CAPITAL LETTER I WITH MACRON}" + "\N{LATIN CAPITAL LETTER K WITH CEDILLA}" "\N{LATIN CAPITAL LETTER L WITH ACUTE}" "\N{LATIN CAPITAL LETTER L WITH CARON}" "\N{LATIN CAPITAL LETTER L WITH STROKE}" + "\N{LATIN CAPITAL LETTER L WITH CEDILLA}" "\N{LATIN CAPITAL LETTER N WITH ACUTE}" "\N{LATIN CAPITAL LETTER N WITH CARON}" + "\N{LATIN CAPITAL LETTER N WITH CEDILLA}" 
"\N{LATIN CAPITAL LIGATURE OE}" "\N{LATIN CAPITAL LETTER R WITH CARON}" "\N{LATIN CAPITAL LETTER S WITH ACUTE}" @@ -179,16 +190,24 @@ # skip o's and u's that could be used in kaomoji "\N{LATIN SMALL LETTER A WITH BREVE}" "\N{LATIN SMALL LETTER A WITH OGONEK}" + "\N{LATIN SMALL LETTER A WITH MACRON}" "\N{LATIN SMALL LETTER C WITH ACUTE}" "\N{LATIN SMALL LETTER C WITH CARON}" "\N{LATIN SMALL LETTER D WITH CARON}" "\N{LATIN SMALL LETTER D WITH STROKE}" "\N{LATIN SMALL LETTER E WITH OGONEK}" "\N{LATIN SMALL LETTER E WITH CARON}" + "\N{LATIN SMALL LETTER E WITH MACRON}" + "\N{LATIN SMALL LETTER E WITH DOT ABOVE}" "\N{LATIN SMALL LETTER G WITH BREVE}" + "\N{LATIN SMALL LETTER G WITH CEDILLA}" + "\N{LATIN SMALL LETTER I WITH OGONEK}" + "\N{LATIN SMALL LETTER I WITH MACRON}" + "\N{LATIN SMALL LETTER K WITH CEDILLA}" "\N{LATIN SMALL LETTER L WITH ACUTE}" "\N{LATIN SMALL LETTER L WITH CARON}" "\N{LATIN SMALL LETTER L WITH STROKE}" + "\N{LATIN SMALL LETTER L WITH CEDILLA}" "\N{LATIN SMALL LIGATURE OE}" "\N{LATIN SMALL LETTER R WITH ACUTE}" "\N{LATIN SMALL LETTER S WITH ACUTE}" @@ -350,7 +369,13 @@ # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ] -""".format(**MOJIBAKE_CATEGORIES), + | + + # Windows-1257 mojibake of characters in the U+2000 range + †+ """.format( + **MOJIBAKE_CATEGORIES + ), re.VERBOSE, ) diff --git a/ftfy/chardata.py b/ftfy/chardata.py index 198cbb8..60d1c2b 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -19,6 +19,7 @@ "sloppy-windows-1250", "sloppy-windows-1253", "sloppy-windows-1254", + "sloppy-windows-1257", "iso-8859-2", "macroman", "cp437", @@ -255,18 +256,20 @@ def _build_width_map() -> dict[int, str]: # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding "utf8_first_of_2": ( "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ" + "ĀĒŹĖĢĶĪĻŠŅŌŲŁŚŪŻŽ" "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" ), # Letters that decode to 0xE0 - 
0xEF in a Latin-1-like encoding - "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"), + "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕąįāęēźėģķīļΰαβγδεζηθικλμνξοабвгдежзийклмноп"), # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding. # (Other leading bytes correspond only to unassigned codepoints) - "utf8_first_of_4": ("ðóđğπσру"), + "utf8_first_of_4": ("ðóđğπσруš"), # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, # including a space standing in for 0xA0 "utf8_continuation": ( "\x80-\xbf" "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" + "ØŖÆøŗæ" "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" "–—―‘’‚“”„†‡•…‰‹›€№™" " " @@ -277,6 +280,7 @@ def _build_width_map() -> dict[int, str]: "utf8_continuation_strict": ( "\x80-\xbf" "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" + "ØŖÆøŗæ" "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" "†‡•‰‹›€№™" ), diff --git a/pyproject.toml b/pyproject.toml index f736e0c..b325bad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ ftfy = "ftfy.cli:main" [project.urls] Issues = "https://github.com/rspeer/python-ftfy/issues/" Changelog = "https://github.com/rspeer/python-ftfy/blob/main/CHANGELOG.md" -Cohost = "https://cohost.org/arborelia" +Blog = "https://posts.arborelia.net" [build-system] requires = ["hatchling"] diff --git a/tests/test_cases.json b/tests/test_cases.json index 342b1d4..29942d2 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -560,6 +560,13 @@ "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", "expect": "pass" }, + { + "label": "Negative: it looks like Windows-1257 mojibake but someone writes their name this way", + "comment": "Should not become a cedilla", + "original": "Connect with Āø on Facebook", + "fixed": "Connect with Āø on Facebook", + "expect": "pass" + }, { "label": "Mostly negative: we only need to fix C1 control characters", "comment": "We should not decode 'é\u0085 ' as '酠'", @@ -927,6 +934,26 @@ "fixed": 
"NICIODATĂ™", "expect": "pass" }, + { + "label": "Synthetic, negative: Lithuanian word before a trademark sign", + "comment": "Similar to the above example. Shouldn't turn into U+0619 ARABIC SMALL DAMMA", + "original": "TRANSFORMATORIŲ™", + "fixed": "TRANSFORMATORIŲ™", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Norwegian capitalized sentence", + "comment": "We're shouting that the island of Håøya is gullible. It should not turn into 'HŨYA ER BLŨYD'.", + "original": "HÅØYA ER BLÅØYD", + "fixed": "HÅØYA ER BLÅØYD", + "expect": "pass" + }, + { + "label": "Synthetic, negative: raised eyebrow kaomoji", + "original": "Ō¬o", + "fixed": "Ō¬o", + "expect": "pass" + }, { "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. Searching for similar real text yields a lot of examples that actually come out fine.", From ed162d13a6366cfb0d2c3b58820c85d6074261a4 Mon Sep 17 00:00:00 2001 From: arborelia Date: Wed, 9 Oct 2024 19:19:58 -0400 Subject: [PATCH 12/34] add a positive Latvian example --- tests/test_cases.json | 6 + uv.lock | 447 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 452 insertions(+), 1 deletion(-) diff --git a/tests/test_cases.json b/tests/test_cases.json index 29942d2..ddbc8d3 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -754,6 +754,12 @@ "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", "expect": "fail" }, + { + "label": "Latvian UTF-8 / Windows-1257 mojibake", + "original": "Å-veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", + "fixed": "Šveices baņķieri gaida konkrētus investīciju projektus", + "expect": "pass" + }, { "label": "Hebrew UTF-8 / Windows-1252 mojibake", "comment": "reported by SuperIRabbit as issue #158", diff --git a/uv.lock b/uv.lock index 373bd6a..6991a31 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,129 @@ 
version = 1 requires-python = ">=3.9" +[[package]] +name = "alabaster" +version = "0.7.16" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c9/3e/13dd8e5ed9094e734ac430b5d0eb4f2bb001708a8b7856cbf8e084e001ba/alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", size = 23776 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/34/d4e1c02d3bee589efb5dfa17f88ea08bdb3e3eac12bc475462aec52ed223/alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92", size = 13511 }, +] + +[[package]] +name = "babel" +version = "2.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/74/f1bc80f23eeba13393b7222b11d95ca3af2c1e28edca18af487137eefed9/babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316", size = 9348104 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/20/bc79bc575ba2e2a7f70e8a1155618bb1301eaa5132a8271373a6903f73f8/babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b", size = 9587599 }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/ca/824b1195773ce6166d388573fc106ce56d4a805bd7427b624e063596ec58/beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051", size = 581181 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/fe/e8c672695b37eecc5cbf43e1d0638d88d66ba3a44c4d321c796f4e59167f/beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed", size = 147925 }, +] + +[[package]] +name = "certifi" +version = 
"2024.8.30" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/ee/9b19140fe824b367c04c5e1b369942dd754c4c5462d5674002f75c4dedc1/certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9", size = 168507 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/90/3c9ff0512038035f59d279fddeb79f5f1eccd8859f06d6163c58798b9487/certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8", size = 167321 }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f2/4f/e1808dc01273379acc506d18f1504eb2d299bd4131743b9fc54d7be4df1e/charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e", size = 106620 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/8b/825cc84cf13a28bfbcba7c416ec22bf85a9584971be15b21dd8300c65b7f/charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6", size = 196363 }, + { url = "https://files.pythonhosted.org/packages/23/81/d7eef6a99e42c77f444fdd7bc894b0ceca6c3a95c51239e74a722039521c/charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b", size = 125639 }, + { url = "https://files.pythonhosted.org/packages/21/67/b4564d81f48042f520c948abac7079356e94b30cb8ffb22e747532cf469d/charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99", size = 120451 }, + { url = 
"https://files.pythonhosted.org/packages/c2/72/12a7f0943dd71fb5b4e7b55c41327ac0a1663046a868ee4d0d8e9c369b85/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca", size = 140041 }, + { url = "https://files.pythonhosted.org/packages/67/56/fa28c2c3e31217c4c52158537a2cf5d98a6c1e89d31faf476c89391cd16b/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d", size = 150333 }, + { url = "https://files.pythonhosted.org/packages/f9/d2/466a9be1f32d89eb1554cf84073a5ed9262047acee1ab39cbaefc19635d2/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7", size = 142921 }, + { url = "https://files.pythonhosted.org/packages/f8/01/344ec40cf5d85c1da3c1f57566c59e0c9b56bcc5566c08804a95a6cc8257/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3", size = 144785 }, + { url = "https://files.pythonhosted.org/packages/73/8b/2102692cb6d7e9f03b9a33a710e0164cadfce312872e3efc7cfe22ed26b4/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907", size = 146631 }, + { url = "https://files.pythonhosted.org/packages/d8/96/cc2c1b5d994119ce9f088a9a0c3ebd489d360a2eb058e2c8049f27092847/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b", size = 140867 }, + { url = "https://files.pythonhosted.org/packages/c9/27/cde291783715b8ec30a61c810d0120411844bc4c23b50189b81188b273db/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", 
hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912", size = 149273 }, + { url = "https://files.pythonhosted.org/packages/3a/a4/8633b0fc1a2d1834d5393dafecce4a1cc56727bfd82b4dc18fc92f0d3cc3/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95", size = 152437 }, + { url = "https://files.pythonhosted.org/packages/64/ea/69af161062166b5975ccbb0961fd2384853190c70786f288684490913bf5/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e", size = 150087 }, + { url = "https://files.pythonhosted.org/packages/3b/fd/e60a9d9fd967f4ad5a92810138192f825d77b4fa2a557990fd575a47695b/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe", size = 145142 }, + { url = "https://files.pythonhosted.org/packages/6d/02/8cb0988a1e49ac9ce2eed1e07b77ff118f2923e9ebd0ede41ba85f2dcb04/charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc", size = 94701 }, + { url = "https://files.pythonhosted.org/packages/d6/20/f1d4670a8a723c46be695dff449d86d6092916f9e99c53051954ee33a1bc/charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749", size = 102191 }, + { url = "https://files.pythonhosted.org/packages/9c/61/73589dcc7a719582bf56aae309b6103d2762b526bffe189d635a7fcfd998/charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c", size = 193339 }, + { url = "https://files.pythonhosted.org/packages/77/d5/8c982d58144de49f59571f940e329ad6e8615e1e82ef84584c5eeb5e1d72/charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944", size = 124366 }, + { url = "https://files.pythonhosted.org/packages/bf/19/411a64f01ee971bed3231111b69eb56f9331a769072de479eae7de52296d/charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee", size = 118874 }, + { url = "https://files.pythonhosted.org/packages/4c/92/97509850f0d00e9f14a46bc751daabd0ad7765cff29cdfb66c68b6dad57f/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c", size = 138243 }, + { url = "https://files.pythonhosted.org/packages/e2/29/d227805bff72ed6d6cb1ce08eec707f7cfbd9868044893617eb331f16295/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6", size = 148676 }, + { url = "https://files.pythonhosted.org/packages/13/bc/87c2c9f2c144bedfa62f894c3007cd4530ba4b5351acb10dc786428a50f0/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea", size = 141289 }, + { url = "https://files.pythonhosted.org/packages/eb/5b/6f10bad0f6461fa272bfbbdf5d0023b5fb9bc6217c92bf068fa5a99820f5/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc", size = 142585 }, + { url = "https://files.pythonhosted.org/packages/3b/a0/a68980ab8a1f45a36d9745d35049c1af57d27255eff8c907e3add84cf68f/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5", size = 144408 }, + { url = 
"https://files.pythonhosted.org/packages/d7/a1/493919799446464ed0299c8eef3c3fad0daf1c3cd48bff9263c731b0d9e2/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594", size = 139076 }, + { url = "https://files.pythonhosted.org/packages/fb/9d/9c13753a5a6e0db4a0a6edb1cef7aee39859177b64e1a1e748a6e3ba62c2/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c", size = 146874 }, + { url = "https://files.pythonhosted.org/packages/75/d2/0ab54463d3410709c09266dfb416d032a08f97fd7d60e94b8c6ef54ae14b/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365", size = 150871 }, + { url = "https://files.pythonhosted.org/packages/8d/c9/27e41d481557be53d51e60750b85aa40eaf52b841946b3cdeff363105737/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129", size = 148546 }, + { url = "https://files.pythonhosted.org/packages/ee/44/4f62042ca8cdc0cabf87c0fc00ae27cd8b53ab68be3605ba6d071f742ad3/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236", size = 143048 }, + { url = "https://files.pythonhosted.org/packages/01/f8/38842422988b795220eb8038745d27a675ce066e2ada79516c118f291f07/charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99", size = 94389 }, + { url = "https://files.pythonhosted.org/packages/0b/6e/b13bd47fa9023b3699e94abf565b5a2f0b0be6e9ddac9812182596ee62e4/charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27", size = 101752 }, + { url = 
"https://files.pythonhosted.org/packages/d3/0b/4b7a70987abf9b8196845806198975b6aab4ce016632f817ad758a5aa056/charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6", size = 194445 }, + { url = "https://files.pythonhosted.org/packages/50/89/354cc56cf4dd2449715bc9a0f54f3aef3dc700d2d62d1fa5bbea53b13426/charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf", size = 125275 }, + { url = "https://files.pythonhosted.org/packages/fa/44/b730e2a2580110ced837ac083d8ad222343c96bb6b66e9e4e706e4d0b6df/charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db", size = 119020 }, + { url = "https://files.pythonhosted.org/packages/9d/e4/9263b8240ed9472a2ae7ddc3e516e71ef46617fe40eaa51221ccd4ad9a27/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1", size = 139128 }, + { url = "https://files.pythonhosted.org/packages/6b/e3/9f73e779315a54334240353eaea75854a9a690f3f580e4bd85d977cb2204/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03", size = 149277 }, + { url = "https://files.pythonhosted.org/packages/1a/cf/f1f50c2f295312edb8a548d3fa56a5c923b146cd3f24114d5adb7e7be558/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284", size = 142174 }, + { url = "https://files.pythonhosted.org/packages/16/92/92a76dc2ff3a12e69ba94e7e05168d37d0345fa08c87e1fe24d0c2a42223/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15", size = 143838 }, + { url = "https://files.pythonhosted.org/packages/a4/01/2117ff2b1dfc61695daf2babe4a874bca328489afa85952440b59819e9d7/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8", size = 146149 }, + { url = "https://files.pythonhosted.org/packages/f6/9b/93a332b8d25b347f6839ca0a61b7f0287b0930216994e8bf67a75d050255/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2", size = 140043 }, + { url = "https://files.pythonhosted.org/packages/ab/f6/7ac4a01adcdecbc7a7587767c776d53d369b8b971382b91211489535acf0/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719", size = 148229 }, + { url = "https://files.pythonhosted.org/packages/9d/be/5708ad18161dee7dc6a0f7e6cf3a88ea6279c3e8484844c0590e50e803ef/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631", size = 151556 }, + { url = "https://files.pythonhosted.org/packages/5a/bb/3d8bc22bacb9eb89785e83e6723f9888265f3a0de3b9ce724d66bd49884e/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b", size = 149772 }, + { url = "https://files.pythonhosted.org/packages/f7/fa/d3fc622de05a86f30beea5fc4e9ac46aead4731e73fd9055496732bcc0a4/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565", size = 144800 }, + { url = "https://files.pythonhosted.org/packages/9a/65/bdb9bc496d7d190d725e96816e20e2ae3a6fa42a5cac99c3c3d6ff884118/charset_normalizer-3.4.0-cp312-cp312-win32.whl", 
hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7", size = 94836 }, + { url = "https://files.pythonhosted.org/packages/3e/67/7b72b69d25b89c0b3cea583ee372c43aa24df15f0e0f8d3982c57804984b/charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9", size = 102187 }, + { url = "https://files.pythonhosted.org/packages/f3/89/68a4c86f1a0002810a27f12e9a7b22feb198c59b2f05231349fbce5c06f4/charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114", size = 194617 }, + { url = "https://files.pythonhosted.org/packages/4f/cd/8947fe425e2ab0aa57aceb7807af13a0e4162cd21eee42ef5b053447edf5/charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed", size = 125310 }, + { url = "https://files.pythonhosted.org/packages/5b/f0/b5263e8668a4ee9becc2b451ed909e9c27058337fda5b8c49588183c267a/charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250", size = 119126 }, + { url = "https://files.pythonhosted.org/packages/ff/6e/e445afe4f7fda27a533f3234b627b3e515a1b9429bc981c9a5e2aa5d97b6/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920", size = 139342 }, + { url = "https://files.pythonhosted.org/packages/a1/b2/4af9993b532d93270538ad4926c8e37dc29f2111c36f9c629840c57cd9b3/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64", size = 149383 }, + { url = 
"https://files.pythonhosted.org/packages/fb/6f/4e78c3b97686b871db9be6f31d64e9264e889f8c9d7ab33c771f847f79b7/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23", size = 142214 }, + { url = "https://files.pythonhosted.org/packages/2b/c9/1c8fe3ce05d30c87eff498592c89015b19fade13df42850aafae09e94f35/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc", size = 144104 }, + { url = "https://files.pythonhosted.org/packages/ee/68/efad5dcb306bf37db7db338338e7bb8ebd8cf38ee5bbd5ceaaaa46f257e6/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d", size = 146255 }, + { url = "https://files.pythonhosted.org/packages/0c/75/1ed813c3ffd200b1f3e71121c95da3f79e6d2a96120163443b3ad1057505/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88", size = 140251 }, + { url = "https://files.pythonhosted.org/packages/7d/0d/6f32255c1979653b448d3c709583557a4d24ff97ac4f3a5be156b2e6a210/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90", size = 148474 }, + { url = "https://files.pythonhosted.org/packages/ac/a0/c1b5298de4670d997101fef95b97ac440e8c8d8b4efa5a4d1ef44af82f0d/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b", size = 151849 }, + { url = "https://files.pythonhosted.org/packages/04/4f/b3961ba0c664989ba63e30595a3ed0875d6790ff26671e2aae2fdc28a399/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d", size = 149781 }, + { url = "https://files.pythonhosted.org/packages/d8/90/6af4cd042066a4adad58ae25648a12c09c879efa4849c705719ba1b23d8c/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482", size = 144970 }, + { url = "https://files.pythonhosted.org/packages/cc/67/e5e7e0cbfefc4ca79025238b43cdf8a2037854195b37d6417f3d0895c4c2/charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67", size = 94973 }, + { url = "https://files.pythonhosted.org/packages/65/97/fc9bbc54ee13d33dc54a7fcf17b26368b18505500fc01e228c27b5222d80/charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b", size = 102308 }, + { url = "https://files.pythonhosted.org/packages/54/2f/28659eee7f5d003e0f5a3b572765bf76d6e0fe6601ab1f1b1dd4cba7e4f1/charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa", size = 196326 }, + { url = "https://files.pythonhosted.org/packages/d1/18/92869d5c0057baa973a3ee2af71573be7b084b3c3d428fe6463ce71167f8/charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a", size = 125614 }, + { url = "https://files.pythonhosted.org/packages/d6/27/327904c5a54a7796bb9f36810ec4173d2df5d88b401d2b95ef53111d214e/charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0", size = 120450 }, + { url = "https://files.pythonhosted.org/packages/a4/23/65af317914a0308495133b2d654cf67b11bbd6ca16637c4e8a38f80a5a69/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a", size = 140135 }, + { url = "https://files.pythonhosted.org/packages/f2/41/6190102ad521a8aa888519bb014a74251ac4586cde9b38e790901684f9ab/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242", size = 150413 }, + { url = "https://files.pythonhosted.org/packages/7b/ab/f47b0159a69eab9bd915591106859f49670c75f9a19082505ff16f50efc0/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b", size = 142992 }, + { url = "https://files.pythonhosted.org/packages/28/89/60f51ad71f63aaaa7e51a2a2ad37919985a341a1d267070f212cdf6c2d22/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62", size = 144871 }, + { url = "https://files.pythonhosted.org/packages/0c/48/0050550275fea585a6e24460b42465020b53375017d8596c96be57bfabca/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0", size = 146756 }, + { url = "https://files.pythonhosted.org/packages/dc/b5/47f8ee91455946f745e6c9ddbb0f8f50314d2416dd922b213e7d5551ad09/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd", size = 141034 }, + { url = "https://files.pythonhosted.org/packages/84/79/5c731059ebab43e80bf61fa51666b9b18167974b82004f18c76378ed31a3/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be", size = 149434 }, + { url = 
"https://files.pythonhosted.org/packages/ca/f3/0719cd09fc4dc42066f239cb3c48ced17fc3316afca3e2a30a4756fe49ab/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d", size = 152443 }, + { url = "https://files.pythonhosted.org/packages/f7/0e/c6357297f1157c8e8227ff337e93fd0a90e498e3d6ab96b2782204ecae48/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3", size = 150294 }, + { url = "https://files.pythonhosted.org/packages/54/9a/acfa96dc4ea8c928040b15822b59d0863d6e1757fba8bd7de3dc4f761c13/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742", size = 145314 }, + { url = "https://files.pythonhosted.org/packages/73/1c/b10a63032eaebb8d7bcb8544f12f063f41f5f463778ac61da15d9985e8b6/charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2", size = 94724 }, + { url = "https://files.pythonhosted.org/packages/c5/77/3a78bf28bfaa0863f9cfef278dbeadf55efe064eafff8c7c424ae3c4c1bf/charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca", size = 102159 }, + { url = "https://files.pythonhosted.org/packages/bf/9b/08c0432272d77b04803958a4598a51e2a4b51c06640af8b8f0f908c18bf2/charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079", size = 49446 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -10,6 +133,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "docutils" +version = 
"0.21.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -29,14 +161,67 @@ dependencies = [ [package.dev-dependencies] dev = [ + { name = "furo" }, { name = "pytest" }, + { name = "ruff" }, + { name = "sphinx" }, ] [package.metadata] requires-dist = [{ name = "wcwidth" }] [package.metadata.requires-dev] -dev = [{ name = "pytest" }] +dev = [ + { name = "furo", specifier = ">=2024.7.18" }, + { name = "pytest", specifier = ">=8.3.2,<9" }, + { name = "ruff" }, + { name = "sphinx", specifier = ">=7,<8" }, +] + +[[package]] +name = "furo" +version = "2024.8.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "pygments" }, + { name = "sphinx" }, + { name = "sphinx-basic-ng" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a0/e2/d351d69a9a9e4badb4a5be062c2d0e87bd9e6c23b5e57337fef14bef34c8/furo-2024.8.6.tar.gz", hash = "sha256:b63e4cee8abfc3136d3bc03a3d45a76a850bada4d6374d24c1716b0e01394a01", size = 1661506 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/48/e791a7ed487dbb9729ef32bb5d1af16693d8925f4366befef54119b2e576/furo-2024.8.6-py3-none-any.whl", hash = "sha256:6cd97c58b47813d3619e63e9081169880fbe331f0ca883c871ff1f3f11814f5c", size = 341333 }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, +] + +[[package]] +name = "imagesize" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769 }, +] + +[[package]] +name = "importlib-metadata" +version = "8.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 }, +] [[package]] name = "iniconfig" @@ -47,6 +232,86 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] +[[package]] +name = "jinja2" +version = "3.1.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/55/39036716d19cab0747a5020fc7e907f362fbf48c984b14e62127f7e68e5d/jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369", size = 240245 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/80/3a54838c3fb461f6fec263ebf3a3a41771bd05190238de3486aae8540c36/jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d", size = 133271 }, +] + +[[package]] +name = "markupsafe" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b4/d2/38ff920762f2247c3af5cbbbbc40756f575d9692d381d7c520f45deb9b8f/markupsafe-3.0.1.tar.gz", hash = "sha256:3e683ee4f5d0fa2dde4db77ed8dd8a876686e3fc417655c2ece9a90576905344", size = 20249 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/a2/0482d1a157f5f10f72fc4fe8c3be9ffa3651c1f7a12b60a3ab71b2635e13/MarkupSafe-3.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:db842712984e91707437461930e6011e60b39136c7331e971952bb30465bc1a1", size = 14391 }, + { url = "https://files.pythonhosted.org/packages/3b/25/5ea6500d200fd2dc3ea25c765f69dea0a1a8d42ec80a38cd896ad47cb85d/MarkupSafe-3.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3ffb4a8e7d46ed96ae48805746755fadd0909fea2306f93d5d8233ba23dda12a", size = 12414 }, + { url = 
"https://files.pythonhosted.org/packages/92/41/cf5397dd6bb18895d148aa402cafa71018f2ffc5f6e9d6e90d85b523c741/MarkupSafe-3.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67c519635a4f64e495c50e3107d9b4075aec33634272b5db1cde839e07367589", size = 21787 }, + { url = "https://files.pythonhosted.org/packages/2e/0d/5d91ef2b4f30afa87483a3a7c108c777d144b1c42d7113459296a8a2bfa0/MarkupSafe-3.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48488d999ed50ba8d38c581d67e496f955821dc183883550a6fbc7f1aefdc170", size = 20954 }, + { url = "https://files.pythonhosted.org/packages/f6/de/12a4110c2c7c7b502fe0e6f911367726dbb7a37e03e207495135d064bb48/MarkupSafe-3.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f31ae06f1328595d762c9a2bf29dafd8621c7d3adc130cbb46278079758779ca", size = 21086 }, + { url = "https://files.pythonhosted.org/packages/96/55/59389babc6e8ed206849a9958de9da7c23f3a75d294f46e99624fa38fb79/MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80fcbf3add8790caddfab6764bde258b5d09aefbe9169c183f88a7410f0f6dea", size = 21685 }, + { url = "https://files.pythonhosted.org/packages/3d/cb/cbad5f093e12cd79ceea3e2957ba5bd4c2706810f333d0a3422ab2aef358/MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3341c043c37d78cc5ae6e3e305e988532b072329639007fd408a476642a89fd6", size = 21348 }, + { url = "https://files.pythonhosted.org/packages/8e/70/e19c4f39d68a52406012ee118667b57efb0bbe6e950be21187cd7a1b4b80/MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cb53e2a99df28eee3b5f4fea166020d3ef9116fdc5764bc5117486e6d1211b25", size = 21098 }, + { url = "https://files.pythonhosted.org/packages/30/95/ca809c01624428d427e9b3a4500f9068eca941e0c520328954ce84ad966a/MarkupSafe-3.0.1-cp310-cp310-win32.whl", hash = "sha256:db15ce28e1e127a0013dfb8ac243a8e392db8c61eae113337536edb28bdc1f97", size = 15075 }, + { url = 
"https://files.pythonhosted.org/packages/23/41/decb99ab07793656821a86f827a394700ce28402ebb02dc6d003210d9859/MarkupSafe-3.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:4ffaaac913c3f7345579db4f33b0020db693f302ca5137f106060316761beea9", size = 15535 }, + { url = "https://files.pythonhosted.org/packages/ce/af/2f5d88a7fc7226bd34c6e15f6061246ad8cff979da9f19d11bdd0addd8e2/MarkupSafe-3.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:26627785a54a947f6d7336ce5963569b5d75614619e75193bdb4e06e21d447ad", size = 14387 }, + { url = "https://files.pythonhosted.org/packages/8d/43/fd588ef5d192308c5e05974bac659bf6ae29c202b7ea2c4194bcf01eacee/MarkupSafe-3.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b954093679d5750495725ea6f88409946d69cfb25ea7b4c846eef5044194f583", size = 12410 }, + { url = "https://files.pythonhosted.org/packages/58/26/78f161d602fb03804118905e5faacafc0ec592bbad71aaee62537529813a/MarkupSafe-3.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:973a371a55ce9ed333a3a0f8e0bcfae9e0d637711534bcb11e130af2ab9334e7", size = 24006 }, + { url = "https://files.pythonhosted.org/packages/ae/1d/7d5ec8bcfd9c2db235d720fa51d818b7e2abc45250ce5f53dd6cb60409ca/MarkupSafe-3.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:244dbe463d5fb6d7ce161301a03a6fe744dac9072328ba9fc82289238582697b", size = 23303 }, + { url = "https://files.pythonhosted.org/packages/26/ce/703ca3b03a709e3bd1fbffa407789e56b9fa664456538092617dd665fc1d/MarkupSafe-3.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d98e66a24497637dd31ccab090b34392dddb1f2f811c4b4cd80c230205c074a3", size = 23205 }, + { url = "https://files.pythonhosted.org/packages/88/60/40be0493decabc2344b12d3a709fd6ccdd15a5ebaee1e8d878315d107ad3/MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ad91738f14eb8da0ff82f2acd0098b6257621410dcbd4df20aaa5b4233d75a50", size = 23684 }, + { url = 
"https://files.pythonhosted.org/packages/6d/f8/8fd52a66e8f62a9add62b4a0b5a3ab4092027437f2ef027f812d94ae91cf/MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7044312a928a66a4c2a22644147bc61a199c1709712069a344a3fb5cfcf16915", size = 23472 }, + { url = "https://files.pythonhosted.org/packages/d4/0b/998b17b9e06ea45ad1646fea586f1b83d02dfdb14d47dd2fd81fba5a08c9/MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a4792d3b3a6dfafefdf8e937f14906a51bd27025a36f4b188728a73382231d91", size = 23388 }, + { url = "https://files.pythonhosted.org/packages/5a/57/b6b7aa23b2e26d68d601718f8ce3161fbdaf967b31752c7dec52bef828c9/MarkupSafe-3.0.1-cp311-cp311-win32.whl", hash = "sha256:fa7d686ed9883f3d664d39d5a8e74d3c5f63e603c2e3ff0abcba23eac6542635", size = 15106 }, + { url = "https://files.pythonhosted.org/packages/fc/b5/20cb1d714596acb553c810009c8004c809823947da63e13c19a7decfcb6c/MarkupSafe-3.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:9ba25a71ebf05b9bb0e2ae99f8bc08a07ee8e98c612175087112656ca0f5c8bf", size = 15542 }, + { url = "https://files.pythonhosted.org/packages/45/6d/72ed58d42a12bd9fc288dbff6dd8d03ea973a232ac0538d7f88d105b5251/MarkupSafe-3.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8ae369e84466aa70f3154ee23c1451fda10a8ee1b63923ce76667e3077f2b0c4", size = 14322 }, + { url = "https://files.pythonhosted.org/packages/86/f5/241238f89cdd6461ac9f521af8389f9a48fab97e4f315c69e9e0d52bc919/MarkupSafe-3.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40f1e10d51c92859765522cbd79c5c8989f40f0419614bcdc5015e7b6bf97fc5", size = 12380 }, + { url = "https://files.pythonhosted.org/packages/27/94/79751928bca5841416d8ca02e22198672e021d5c7120338e2a6e3771f8fc/MarkupSafe-3.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a4cb365cb49b750bdb60b846b0c0bc49ed62e59a76635095a179d440540c346", size = 24099 }, + { url = 
"https://files.pythonhosted.org/packages/10/6e/1b8070bbfc467429c7983cd5ffd4ec57e1d501763d974c7caaa0a9a79f4c/MarkupSafe-3.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee3941769bd2522fe39222206f6dd97ae83c442a94c90f2b7a25d847d40f4729", size = 23249 }, + { url = "https://files.pythonhosted.org/packages/66/50/9389ae6cdff78d7481a2a2641830b5eb1d1f62177550e73355a810a889c9/MarkupSafe-3.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62fada2c942702ef8952754abfc1a9f7658a4d5460fabe95ac7ec2cbe0d02abc", size = 23149 }, + { url = "https://files.pythonhosted.org/packages/16/02/5dddff5366fde47133186efb847fa88bddef85914bbe623e25cfeccb3517/MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4c2d64fdba74ad16138300815cfdc6ab2f4647e23ced81f59e940d7d4a1469d9", size = 23864 }, + { url = "https://files.pythonhosted.org/packages/f3/f1/700ee6655561cfda986e03f7afc309e3738918551afa7dedd99225586227/MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fb532dd9900381d2e8f48172ddc5a59db4c445a11b9fab40b3b786da40d3b56b", size = 23440 }, + { url = "https://files.pythonhosted.org/packages/fb/3e/d26623ac7f16709823b4c80e0b4a1c9196eeb46182a6c1d47b5e0c8434f4/MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0f84af7e813784feb4d5e4ff7db633aba6c8ca64a833f61d8e4eade234ef0c38", size = 23610 }, + { url = "https://files.pythonhosted.org/packages/51/04/1f8da0810c39cb9fcff96b6baed62272c97065e9cf11471965a161439e20/MarkupSafe-3.0.1-cp312-cp312-win32.whl", hash = "sha256:cbf445eb5628981a80f54087f9acdbf84f9b7d862756110d172993b9a5ae81aa", size = 15113 }, + { url = "https://files.pythonhosted.org/packages/eb/24/a36dc37365bdd358b1e583cc40475593e36ab02cb7da6b3d0b9c05b0da7a/MarkupSafe-3.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:a10860e00ded1dd0a65b83e717af28845bb7bd16d8ace40fe5531491de76b79f", size = 15611 }, + { url = 
"https://files.pythonhosted.org/packages/b1/60/4572a8aa1beccbc24b133aa0670781a5d2697f4fa3fecf0a87b46383174b/MarkupSafe-3.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e81c52638315ff4ac1b533d427f50bc0afc746deb949210bc85f05d4f15fd772", size = 14325 }, + { url = "https://files.pythonhosted.org/packages/38/42/849915b99a765ec104bfd07ee933de5fc9c58fa9570efa7db81717f495d8/MarkupSafe-3.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:312387403cd40699ab91d50735ea7a507b788091c416dd007eac54434aee51da", size = 12373 }, + { url = "https://files.pythonhosted.org/packages/ef/82/4caaebd963c6d60b28e4445f38841d24f8b49bc10594a09956c9d73bfc08/MarkupSafe-3.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ae99f31f47d849758a687102afdd05bd3d3ff7dbab0a8f1587981b58a76152a", size = 24059 }, + { url = "https://files.pythonhosted.org/packages/20/15/6b319be2f79fcfa3173f479d69f4e950b5c9b642db4f22cf73ae5ade745f/MarkupSafe-3.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c97ff7fedf56d86bae92fa0a646ce1a0ec7509a7578e1ed238731ba13aabcd1c", size = 23211 }, + { url = "https://files.pythonhosted.org/packages/9d/3f/8963bdf4962feb2154475acb7dc350f04217b5e0be7763a39b432291e229/MarkupSafe-3.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7420ceda262dbb4b8d839a4ec63d61c261e4e77677ed7c66c99f4e7cb5030dd", size = 23095 }, + { url = "https://files.pythonhosted.org/packages/af/93/f770bc70953d32de0c6ce4bcb76271512123a1ead91aaef625a020c5bfaf/MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45d42d132cff577c92bfba536aefcfea7e26efb975bd455db4e6602f5c9f45e7", size = 23901 }, + { url = "https://files.pythonhosted.org/packages/11/92/1e5a33aa0a1190161238628fb68eb1bc5e67b56a5c89f0636328704b463a/MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4c8817557d0de9349109acb38b9dd570b03cc5014e8aabf1cbddc6e81005becd", size = 23463 }, + { url 
= "https://files.pythonhosted.org/packages/0d/fe/657efdfe385d2a3a701f2c4fcc9577c63c438aeefdd642d0d956c4ecd225/MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6a54c43d3ec4cf2a39f4387ad044221c66a376e58c0d0e971d47c475ba79c6b5", size = 23569 }, + { url = "https://files.pythonhosted.org/packages/cf/24/587dea40304046ace60f846cedaebc0d33d967a3ce46c11395a10e7a78ba/MarkupSafe-3.0.1-cp313-cp313-win32.whl", hash = "sha256:c91b394f7601438ff79a4b93d16be92f216adb57d813a78be4446fe0f6bc2d8c", size = 15117 }, + { url = "https://files.pythonhosted.org/packages/32/8f/d8961d633f26a011b4fe054f3bfff52f673423b8c431553268741dfb089e/MarkupSafe-3.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:fe32482b37b4b00c7a52a07211b479653b7fe4f22b2e481b9a9b099d8a430f2f", size = 15613 }, + { url = "https://files.pythonhosted.org/packages/9e/93/d6367ffbcd0c5c371370767f768eaa32af60bc411245b8517e383c6a2b12/MarkupSafe-3.0.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:17b2aea42a7280db02ac644db1d634ad47dcc96faf38ab304fe26ba2680d359a", size = 14563 }, + { url = "https://files.pythonhosted.org/packages/4a/37/f813c3835747dec08fe19ac9b9eced01fdf93a4b3e626521675dc7f423a9/MarkupSafe-3.0.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:852dc840f6d7c985603e60b5deaae1d89c56cb038b577f6b5b8c808c97580f1d", size = 12505 }, + { url = "https://files.pythonhosted.org/packages/72/bf/800b4d1580298ca91ccd6c95915bbd147142dad1b8cf91d57b93b28670dd/MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0778de17cff1acaeccc3ff30cd99a3fd5c50fc58ad3d6c0e0c4c58092b859396", size = 25358 }, + { url = "https://files.pythonhosted.org/packages/fd/78/26e209abc8f0a379f031f0acc151231974e5b153d7eda5759d17d8f329f2/MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:800100d45176652ded796134277ecb13640c1a537cad3b8b53da45aa96330453", size = 23797 }, + { url = 
"https://files.pythonhosted.org/packages/09/e1/918496a9390891756efee818880e71c1bbaf587f4dc8ede3f3852357310a/MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d06b24c686a34c86c8c1fba923181eae6b10565e4d80bdd7bc1c8e2f11247aa4", size = 23743 }, + { url = "https://files.pythonhosted.org/packages/cd/c6/26f576cd58d6c2decd9045e4e3f3c5dbc01ea6cb710916e7bbb6ebd95b6b/MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:33d1c36b90e570ba7785dacd1faaf091203d9942bc036118fab8110a401eb1a8", size = 25076 }, + { url = "https://files.pythonhosted.org/packages/b5/fa/10b24fb3b0e15fe5389dc88ecc6226ede08297e0ba7130610efbe0cdfb27/MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:beeebf760a9c1f4c07ef6a53465e8cfa776ea6a2021eda0d0417ec41043fe984", size = 24037 }, + { url = "https://files.pythonhosted.org/packages/c8/81/4b3f5537d9f6cc4f5c80d6c4b78af9a5247fd37b5aba95807b2cbc336b9a/MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bbde71a705f8e9e4c3e9e33db69341d040c827c7afa6789b14c6e16776074f5a", size = 24015 }, + { url = "https://files.pythonhosted.org/packages/5f/07/8e8dcecd53216c5e01a51e84c32a2bce166690ed19c184774b38cd41921d/MarkupSafe-3.0.1-cp313-cp313t-win32.whl", hash = "sha256:82b5dba6eb1bcc29cc305a18a3c5365d2af06ee71b123216416f7e20d2a84e5b", size = 15213 }, + { url = "https://files.pythonhosted.org/packages/0d/87/4c364e0f109eea2402079abecbe33fef4f347b551a11423d1f4e187ea497/MarkupSafe-3.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:730d86af59e0e43ce277bb83970530dd223bf7f2a838e086b50affa6ec5f9295", size = 15741 }, + { url = "https://files.pythonhosted.org/packages/6f/4f/420741fb39fa3d40396fb1731a1ca78e6f9fbb225dcf15e5185b1fa954bc/MarkupSafe-3.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4935dd7883f1d50e2ffecca0aa33dc1946a94c8f3fdafb8df5c330e48f71b132", size = 14376 }, + { url = 
"https://files.pythonhosted.org/packages/91/71/0c4782b9ce7fb68b140b94e1eb9d2b6292990bda91dc3d3b5a34e8bd41f3/MarkupSafe-3.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e9393357f19954248b00bed7c56f29a25c930593a77630c719653d51e7669c2a", size = 12408 }, + { url = "https://files.pythonhosted.org/packages/3e/3c/cbf30bf7ac1da2e013e3d338e1582db85fc3b27bf9f8863137423ad4b0b6/MarkupSafe-3.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40621d60d0e58aa573b68ac5e2d6b20d44392878e0bfc159012a5787c4e35bc8", size = 21654 }, + { url = "https://files.pythonhosted.org/packages/0b/28/229e797b8727427845b79cbd58019f598e478f974730fa705fa23904b18e/MarkupSafe-3.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f94190df587738280d544971500b9cafc9b950d32efcb1fba9ac10d84e6aa4e6", size = 20817 }, + { url = "https://files.pythonhosted.org/packages/e8/b4/1121f3b2614de93cbb3deec7f44df283df44c2258ea9368bb1302b4a0b45/MarkupSafe-3.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6a387d61fe41cdf7ea95b38e9af11cfb1a63499af2759444b99185c4ab33f5b", size = 20956 }, + { url = "https://files.pythonhosted.org/packages/a8/8b/b4d57bafca01c8b1e1fbb037660869fa4f6725983c4105a02bd1242f0066/MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8ad4ad1429cd4f315f32ef263c1342166695fad76c100c5d979c45d5570ed58b", size = 21548 }, + { url = "https://files.pythonhosted.org/packages/83/87/04806f7096ba1d4f1b8c61f35c1d7c0b507c6a3cf7ed495393bf97eb5af7/MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e24bfe89c6ac4c31792793ad9f861b8f6dc4546ac6dc8f1c9083c7c4f2b335cd", size = 21222 }, + { url = "https://files.pythonhosted.org/packages/e9/96/1ecb2bb5ee7298e628cff95833beba7da6a774df7fe890a6d2f0ec460590/MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2a4b34a8d14649315c4bc26bbfa352663eb51d146e35eef231dd739d54a5430a", size = 20952 }, + { url = 
"https://files.pythonhosted.org/packages/fd/70/b937a12df7bbff14e1ca3385929f464c7af2ca72c8183c95dad26c3bf754/MarkupSafe-3.0.1-cp39-cp39-win32.whl", hash = "sha256:242d6860f1fd9191aef5fae22b51c5c19767f93fb9ead4d21924e0bcb17619d8", size = 15075 }, + { url = "https://files.pythonhosted.org/packages/e3/c4/262fac0328552da9a75a7786d7c0f43adaba4afb5f295979d33fa0f324c7/MarkupSafe-3.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:93e8248d650e7e9d49e8251f883eed60ecbc0e8ffd6349e18550925e31bd029b", size = 15527 }, +] + [[package]] name = "packaging" version = "24.1" @@ -65,6 +330,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] +[[package]] +name = "pygments" +version = "2.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/62/8336eff65bcbc8e4cb5d05b55faf041285951b6e80f33e2bff2024788f31/pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199", size = 4891905 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", size = 1205513 }, +] + [[package]] name = "pytest" version = "8.3.3" @@ -82,6 +356,159 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6b/77/7440a06a8ead44c7757a64362dd22df5760f9b12dc5f11b6188cd2fc27a0/pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2", size = 342341 }, ] +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = 
"urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, +] + +[[package]] +name = "ruff" +version = "0.6.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/0d/6148a48dab5662ca1d5a93b7c0d13c03abd3cc7e2f35db08410e47cef15d/ruff-0.6.9.tar.gz", hash = "sha256:b076ef717a8e5bc819514ee1d602bbdca5b4420ae13a9cf61a0c0a4f53a2baa2", size = 3095355 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/8f/f7a0a0ef1818662efb32ed6df16078c95da7a0a3248d64c2410c1e27799f/ruff-0.6.9-py3-none-linux_armv6l.whl", hash = "sha256:064df58d84ccc0ac0fcd63bc3090b251d90e2a372558c0f057c3f75ed73e1ccd", size = 10440526 }, + { url = "https://files.pythonhosted.org/packages/8b/69/b179a5faf936a9e2ab45bb412a668e4661eded964ccfa19d533f29463ef6/ruff-0.6.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:140d4b5c9f5fc7a7b074908a78ab8d384dd7f6510402267bc76c37195c02a7ec", size = 10034612 }, + { url = "https://files.pythonhosted.org/packages/c7/ef/fd1b4be979c579d191eeac37b5cfc0ec906de72c8bcd8595e2c81bb700c1/ruff-0.6.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:53fd8ca5e82bdee8da7f506d7b03a261f24cd43d090ea9db9a1dc59d9313914c", size = 9706197 }, + { url = "https://files.pythonhosted.org/packages/29/61/b376d775deb5851cb48d893c568b511a6d3625ef2c129ad5698b64fb523c/ruff-0.6.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645d7d8761f915e48a00d4ecc3686969761df69fb561dd914a773c1a8266e14e", size = 10751855 }, + { url = 
"https://files.pythonhosted.org/packages/13/d7/def9e5f446d75b9a9c19b24231a3a658c075d79163b08582e56fa5dcfa38/ruff-0.6.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eae02b700763e3847595b9d2891488989cac00214da7f845f4bcf2989007d577", size = 10200889 }, + { url = "https://files.pythonhosted.org/packages/6c/d6/7f34160818bcb6e84ce293a5966cba368d9112ff0289b273fbb689046047/ruff-0.6.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d5ccc9e58112441de8ad4b29dcb7a86dc25c5f770e3c06a9d57e0e5eba48829", size = 11038678 }, + { url = "https://files.pythonhosted.org/packages/13/34/a40ff8ae62fb1b26fb8e6fa7e64bc0e0a834b47317880de22edd6bfb54fb/ruff-0.6.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:417b81aa1c9b60b2f8edc463c58363075412866ae4e2b9ab0f690dc1e87ac1b5", size = 11808682 }, + { url = "https://files.pythonhosted.org/packages/2e/6d/25a4386ae4009fc798bd10ba48c942d1b0b3e459b5403028f1214b6dd161/ruff-0.6.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c866b631f5fbce896a74a6e4383407ba7507b815ccc52bcedabb6810fdb3ef7", size = 11330446 }, + { url = "https://files.pythonhosted.org/packages/f7/f6/bdf891a9200d692c94ebcd06ae5a2fa5894e522f2c66c2a12dd5d8cb2654/ruff-0.6.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b118afbb3202f5911486ad52da86d1d52305b59e7ef2031cea3425142b97d6f", size = 12483048 }, + { url = "https://files.pythonhosted.org/packages/a7/86/96f4252f41840e325b3fa6c48297e661abb9f564bd7dcc0572398c8daa42/ruff-0.6.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a67267654edc23c97335586774790cde402fb6bbdb3c2314f1fc087dee320bfa", size = 10936855 }, + { url = "https://files.pythonhosted.org/packages/45/87/801a52d26c8dbf73424238e9908b9ceac430d903c8ef35eab1b44fcfa2bd/ruff-0.6.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:3ef0cc774b00fec123f635ce5c547dac263f6ee9fb9cc83437c5904183b55ceb", size = 10713007 }, + { url = 
"https://files.pythonhosted.org/packages/be/27/6f7161d90320a389695e32b6ebdbfbedde28ccbf52451e4b723d7ce744ad/ruff-0.6.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:12edd2af0c60fa61ff31cefb90aef4288ac4d372b4962c2864aeea3a1a2460c0", size = 10274594 }, + { url = "https://files.pythonhosted.org/packages/00/52/dc311775e7b5f5b19831563cb1572ecce63e62681bccc609867711fae317/ruff-0.6.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:55bb01caeaf3a60b2b2bba07308a02fca6ab56233302406ed5245180a05c5625", size = 10608024 }, + { url = "https://files.pythonhosted.org/packages/98/b6/be0a1ddcbac65a30c985cf7224c4fce786ba2c51e7efeb5178fe410ed3cf/ruff-0.6.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:925d26471fa24b0ce5a6cdfab1bb526fb4159952385f386bdcc643813d472039", size = 10982085 }, + { url = "https://files.pythonhosted.org/packages/bb/a4/c84bc13d0b573cf7bb7d17b16d6d29f84267c92d79b2f478d4ce322e8e72/ruff-0.6.9-py3-none-win32.whl", hash = "sha256:eb61ec9bdb2506cffd492e05ac40e5bc6284873aceb605503d8494180d6fc84d", size = 8522088 }, + { url = "https://files.pythonhosted.org/packages/74/be/fc352bd8ca40daae8740b54c1c3e905a7efe470d420a268cd62150248c91/ruff-0.6.9-py3-none-win_amd64.whl", hash = "sha256:785d31851c1ae91f45b3d8fe23b8ae4b5170089021fbb42402d811135f0b7117", size = 9359275 }, + { url = "https://files.pythonhosted.org/packages/3e/14/fd026bc74ded05e2351681545a5f626e78ef831f8edce064d61acd2e6ec7/ruff-0.6.9-py3-none-win_arm64.whl", hash = "sha256:a9641e31476d601f83cd602608739a0840e348bda93fec9f1ee816f8b6798b93", size = 8679879 }, +] + +[[package]] +name = "snowballstemmer" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/7b/af302bebf22c749c56c9c3e8ae13190b5b5db37a33d9068652e8f73b7089/snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", size = 86699 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/ed/dc/c02e01294f7265e63a7315fe086dd1df7dacb9f840a804da846b96d01b96/snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a", size = 93002 }, +] + +[[package]] +name = "soupsieve" +version = "2.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 }, +] + +[[package]] +name = "sphinx" +version = "7.4.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "alabaster" }, + { name = "babel" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "docutils" }, + { name = "imagesize" }, + { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, + { name = "jinja2" }, + { name = "packaging" }, + { name = "pygments" }, + { name = "requests" }, + { name = "snowballstemmer" }, + { name = "sphinxcontrib-applehelp" }, + { name = "sphinxcontrib-devhelp" }, + { name = "sphinxcontrib-htmlhelp" }, + { name = "sphinxcontrib-jsmath" }, + { name = "sphinxcontrib-qthelp" }, + { name = "sphinxcontrib-serializinghtml" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/be/50e50cb4f2eff47df05673d361095cafd95521d2a22521b920c67a372dcb/sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe", size = 8067911 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/0d/ef/153f6803c5d5f8917dbb7f7fcf6d34a871ede3296fa89c2c703f5f8a6c8e/sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239", size = 3401624 }, +] + +[[package]] +name = "sphinx-basic-ng" +version = "1.0.0b2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/0b/a866924ded68efec7a1759587a4e478aec7559d8165fac8b2ad1c0e774d6/sphinx_basic_ng-1.0.0b2.tar.gz", hash = "sha256:9ec55a47c90c8c002b5960c57492ec3021f5193cb26cebc2dc4ea226848651c9", size = 20736 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/dd/018ce05c532a22007ac58d4f45232514cd9d6dd0ee1dc374e309db830983/sphinx_basic_ng-1.0.0b2-py3-none-any.whl", hash = "sha256:eb09aedbabfb650607e9b4b68c9d240b90b1e1be221d6ad71d61c52e29f7932b", size = 22496 }, +] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300 }, +] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530 }, +] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 }, +] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071 }, +] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743 }, +] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072 }, +] + [[package]] name = "tomli" version = "2.0.2" @@ -91,6 +518,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/db/ce8eda256fa131af12e0a76d481711abe4681b6923c27efb9a255c9e4594/tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38", size = 13237 }, ] +[[package]] +name = "urllib3" +version = "2.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, +] + [[package]] name = "wcwidth" version = "0.2.13" @@ -99,3 +535,12 @@ sdist = { url = 
"https://files.pythonhosted.org/packages/6c/63/53559446a878410fc wheels = [ { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, ] + +[[package]] +name = "zipp" +version = "3.20.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/bf/5c0000c44ebc80123ecbdddba1f5dcd94a5ada602a9c225d84b5aaa55e86/zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29", size = 24199 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/8b/5ba542fa83c90e09eac972fc9baca7a88e7e7ca4b221a89251954019308b/zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350", size = 9200 }, +] From a0a0f4567df6e3316b9b8ac6d14481025b009856 Mon Sep 17 00:00:00 2001 From: arborelia Date: Wed, 9 Oct 2024 19:22:17 -0400 Subject: [PATCH 13/34] fix Latvian test case --- tests/test_cases.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cases.json b/tests/test_cases.json index ddbc8d3..3204d94 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -756,7 +756,7 @@ }, { "label": "Latvian UTF-8 / Windows-1257 mojibake", - "original": "Å-veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", + "original": "Å veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", "fixed": "Šveices baņķieri gaida konkrētus investīciju projektus", "expect": "pass" }, From 18f6dbf0025da578f1d13a2f2646941d6706265f Mon Sep 17 00:00:00 2001 From: arborelia Date: Wed, 9 Oct 2024 19:39:23 -0400 Subject: [PATCH 14/34] notice mid-word start punctuation (fixes #152) --- ftfy/badness.py | 6 +++++- tests/test_cases.json | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ftfy/badness.py 
b/ftfy/badness.py index 52ed7ef..ad93853 100644 --- a/ftfy/badness.py +++ b/ftfy/badness.py @@ -302,13 +302,17 @@ | [{box}] [{end_punctuation}] | - [{lower_accented}{upper_accented}] [{end_punctuation}] \w + [{lower_accented}{upper_accented}] [{start_punctuation}{end_punctuation}] \w | # The ligature œ when not followed by an unaccented Latin letter [Œœ][^A-Za-z] | + # Degree signs after capital letters + [{upper_accented}]° + | + # Common Windows-1252 2-character mojibake that isn't covered by the cases above [ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´] | diff --git a/tests/test_cases.json b/tests/test_cases.json index 3204d94..7897483 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -166,6 +166,18 @@ "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", "expect": "pass" }, + { + "label": "Latin-1 / Windows-1252 mixup in İstanbul (issue #192)", + "original": "İstanbul", + "fixed": "İstanbul", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", + "original": "RÄ«ga", + "fixed": "Rīga", + "expect": "pass" + }, { "label": "UTF-8 / Windows-1251 mixed up twice in Russian", "original": "приятности. 
РІСњВ¤", From 3d04558ce8559cb91c82de3d4acd83129d14842d Mon Sep 17 00:00:00 2001 From: arborelia Date: Wed, 9 Oct 2024 19:46:32 -0400 Subject: [PATCH 15/34] expand heuristic to cover issue #188 --- ftfy/badness.py | 2 +- tests/test_cases.json | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ftfy/badness.py b/ftfy/badness.py index ad93853..6ac4a76 100644 --- a/ftfy/badness.py +++ b/ftfy/badness.py @@ -314,7 +314,7 @@ | # Common Windows-1252 2-character mojibake that isn't covered by the cases above - [ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´] + [ÂÃÎÐ][€œŠš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´] | × [²³] | diff --git a/tests/test_cases.json b/tests/test_cases.json index 7897483..83f1d4b 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -172,6 +172,12 @@ "fixed": "İstanbul", "expect": "pass" }, + { + "label": "Latin-1 / Windows-1252 mixup in German (issue #188)", + "original": "RUF MICH ZURÜCK", + "fixed": "RUF MICH ZURÜCK", + "expect": "pass" + }, { "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", "original": "RÄ«ga", From d15ac27a2794fd87c8d66285d8b2b755a72ef6e5 Mon Sep 17 00:00:00 2001 From: arborelia Date: Wed, 9 Oct 2024 22:24:06 -0400 Subject: [PATCH 16/34] update docs for windows-1257 --- docs/encodings.rst | 13 +++++++------ ftfy/chardata.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/encodings.rst b/docs/encodings.rst index 13a892f..69fa91d 100644 --- a/docs/encodings.rst +++ b/docs/encodings.rst @@ -6,11 +6,12 @@ ftfy can't fix all possible mix-ups. 
Its goal is to cover the most common encodi ftfy can understand text that was decoded as any of these single-byte encodings: - Latin-1 (ISO-8859-1) -- Windows-1252 (cp1252 -- used in Microsoft products) -- Windows-1251 (cp1251 -- the Russian version of cp1252) -- Windows-1250 (cp1250 -- the Eastern European version of cp1252) -- Windows-1253 (cp1253 -- the Greek version of cp1252) -- Windows-1254 (cp1254 -- the Turkish version of cp1252) +- Windows-1250 (cp1250 -- used in Microsoft products in Eastern Europe) +- Windows-1251 (cp1251 -- used in Microsoft products in Russia) +- Windows-1252 (cp1252 -- used in Microsoft products in Western Europe and the Americas) +- Windows-1253 (cp1253 -- used in Microsoft products in Greece) +- Windows-1254 (cp1254 -- used in Microsoft products in Türkiye) +- Windows-1257 (cp1257 -- used in Microsoft products in Baltic countries) - ISO-8859-2 (which is not quite the same as Windows-1250) - MacRoman (used on Mac OS 9 and earlier) - cp437 (used in MS-DOS and some versions of the Windows command prompt) @@ -26,6 +27,6 @@ However, ftfy cannot understand other mixups between single-byte encodings, beca We also can't handle the legacy encodings used for Chinese, Japanese, and Korean, such as ``shift-jis`` and ``gb18030``. See `issue #34`_ for why this is so hard. -.. _`issue #34`: https://github.com/LuminosoInsight/python-ftfy/issues/34 +.. _`issue #34`: https://github.com/rspeer/python-ftfy/issues/34 Remember that the input to ftfy is Unicode, so it handles actual CJK *text* just fine. It just can't discover that a CJK *encoding* introduced mojibake into the text. 
diff --git a/ftfy/chardata.py b/ftfy/chardata.py index 60d1c2b..dd33869 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -313,6 +313,6 @@ def _build_width_map() -> dict[int, str]: | [{utf8_first_of_4}] [{utf8_continuation}]{{3}} )+ -""".format(**UTF8_CLUES), + """.format(**UTF8_CLUES), re.VERBOSE, ) From 76fa9e537a8b80848d2f756eec5b3451d2ed06f5 Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 12:51:50 -0400 Subject: [PATCH 17/34] try to configure readthedocs for uv --- .readthedocs.yaml | 17 ++++++++--------- docs/conf.py | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index b6d16ee..6fe1099 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,15 +10,14 @@ build: os: ubuntu-24.04 tools: python: "3.11" - jobs: - post_create_environment: - # Install poetry - # https://python-poetry.org/docs/#installing-manually - - python -m pip install poetry - post_install: - # Install only dependencies - # https://python-poetry.org/docs/managing-dependencies/#dependency-groups - - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install + commands: + - asdf plugin add uv + - asdf install uv latest + - asdf global uv latest + - uv venv + - uv sync + - .venv/bin/python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html + # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py diff --git a/docs/conf.py b/docs/conf.py index 1bff5e9..822883e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,7 +48,7 @@ # The short X.Y version. version = "6.3" # The full version, including alpha/beta/rc tags. -release = "6.3.0" +release = "6.3.0rc1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 6760b44c9959a92d78e63e9671d7a6a5dc325000 Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 13:55:21 -0400 Subject: [PATCH 18/34] fix link --- docs/index.rst | 2 +- scripts/char_data_table.py | 77 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 scripts/char_data_table.py diff --git a/docs/index.rst b/docs/index.rst index a62feed..1ba7ed0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,7 +6,7 @@ ftfy: fixes text for you “Assume all external input is the result of (a series of) bugs.” — `RFC 9225`_: Software Defects Considered Harmful -.. _`RFC 9225`: https://www.rfc-editor.org/rfc/rfc9225.html#confirmed +.. _`RFC 9225`: https://www.rfc-editor.org/rfc/rfc9225.html **ftfy** fixes Unicode that's broken in various ways. diff --git a/scripts/char_data_table.py b/scripts/char_data_table.py new file mode 100644 index 0000000..cc174a4 --- /dev/null +++ b/scripts/char_data_table.py @@ -0,0 +1,77 @@ +from ftfy.chardata import UTF8_CLUES +from dataclasses import dataclass +import unicodedata + + +@dataclass +class CharData: + name: str + codept: int + encodings: list[tuple[str, int]] + + def sort_key(self) -> tuple[int, str, int]: + if self.name.startswith("LATIN "): + return (0, self.name, self.codept) + else: + return (1, "", self.codept) + + +SAFE_ENCODINGS = [ + "latin-1", + "windows-1252", + "windows-1251", + "windows-1250", + "windows-1253", + "windows-1254", + "windows-1257", +] + + +def show_char_table(chars: str, byte_min: int = 0, byte_max: int = 0xFF) -> None: + char_data: list[CharData] = [] + for char in chars: + name = unicodedata.name(char, "") + codept = ord(char) + encodings: list[tuple[str, int]] = [] + for encoding in SAFE_ENCODINGS: + try: + encoded: bytes = char.encode(encoding) + byte: int = encoded[0] + encodings.append((encoding, byte)) + except UnicodeEncodeError: + pass + if encodings: + char_data.append(CharData(name=name, codept=codept, encodings=encodings)) + 
else: + print(f"No relevant encoding for {codept=}, {name=}") + char_data.sort(key=CharData.sort_key) + for cd in char_data: + encoding_info: list[str] = [] + for encoding, byte in cd.encodings: + if byte_min <= byte <= byte_max: + info_str = f"{encoding}:{byte:X}" + encoding_info.append(info_str) + if encoding_info: + encoding_explanation = encoding_info[0] + else: + encoding_explanation = "???" + print(f' "\\N{{{cd.name}}}" # {encoding_explanation}') + + +def run() -> None: + print("# utf8_first_of_2") + show_char_table(UTF8_CLUES["utf8_first_of_2"], 0xC2, 0xDF) + print("# utf8_first_of_3") + show_char_table(UTF8_CLUES["utf8_first_of_3"], 0xE0, 0xEF) + print("# utf8_first_of_4") + show_char_table(UTF8_CLUES["utf8_first_of_4"], 0xF0, 0xF3) + print("# utf8_continuation") + print(r' "\x80-\xbf"') + show_char_table(UTF8_CLUES["utf8_continuation"][3:], 0x80, 0xBF) + print("# utf8_continuation_strict") + print(r' "\x80-\xbf"') + show_char_table(UTF8_CLUES["utf8_continuation_strict"][3:], 0x80, 0xBF) + + +if __name__ == "__main__": + run() From 87afc6fccddc5cffac66f1f51138f30573e2c974 Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 13:55:25 -0400 Subject: [PATCH 19/34] make chardata.py more readable --- ftfy/chardata.py | 404 +++++++++++++++++++++++++++++++++++-- scripts/char_data_table.py | 9 +- 2 files changed, 396 insertions(+), 17 deletions(-) diff --git a/ftfy/chardata.py b/ftfy/chardata.py index dd33869..934fc76 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -252,37 +252,411 @@ def _build_width_map() -> dict[int, str]: # Character classes that help us pinpoint embedded mojibake. These can # include common characters, because we'll also check them for 'badness'. -UTF8_CLUES = { +# +# Though they go on for many lines, the members of this dictionary are +# single concatenated strings. +# +# This code is generated using scripts/char_data_table.py. 
+UTF8_CLUES: dict[str, str] = { # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding "utf8_first_of_2": ( - "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ" - "ĀĒŹĖĢĶĪĻŠŅŌŲŁŚŪŻŽ" - "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" + "\N{LATIN CAPITAL LETTER A WITH BREVE}" # windows-1250:C3 + "\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}" # latin-1:C2 + "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}" # latin-1:C4 + "\N{LATIN CAPITAL LETTER A WITH MACRON}" # windows-1257:C2 + "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}" # latin-1:C5 + "\N{LATIN CAPITAL LETTER A WITH TILDE}" # latin-1:C3 + "\N{LATIN CAPITAL LETTER AE}" # latin-1:C6 + "\N{LATIN CAPITAL LETTER C WITH ACUTE}" # windows-1250:C6 + "\N{LATIN CAPITAL LETTER C WITH CARON}" # windows-1250:C8 + "\N{LATIN CAPITAL LETTER C WITH CEDILLA}" # latin-1:C7 + "\N{LATIN CAPITAL LETTER D WITH CARON}" # windows-1250:CF + "\N{LATIN CAPITAL LETTER D WITH STROKE}" # windows-1250:D0 + "\N{LATIN CAPITAL LETTER E WITH ACUTE}" # latin-1:C9 + "\N{LATIN CAPITAL LETTER E WITH CARON}" # windows-1250:CC + "\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}" # latin-1:CA + "\N{LATIN CAPITAL LETTER E WITH DIAERESIS}" # latin-1:CB + "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}" # windows-1257:CB + "\N{LATIN CAPITAL LETTER E WITH GRAVE}" # latin-1:C8 + "\N{LATIN CAPITAL LETTER E WITH MACRON}" # windows-1257:C7 + "\N{LATIN CAPITAL LETTER E WITH OGONEK}" # windows-1250:CA + "\N{LATIN CAPITAL LETTER ETH}" # latin-1:D0 + "\N{LATIN CAPITAL LETTER G WITH BREVE}" # windows-1254:D0 + "\N{LATIN CAPITAL LETTER G WITH CEDILLA}" # windows-1257:CC + "\N{LATIN CAPITAL LETTER I WITH ACUTE}" # latin-1:CD + "\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}" # latin-1:CE + "\N{LATIN CAPITAL LETTER I WITH DIAERESIS}" # latin-1:CF + "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" # windows-1254:DD + "\N{LATIN CAPITAL LETTER I WITH GRAVE}" # latin-1:CC + "\N{LATIN CAPITAL LETTER I WITH MACRON}" # windows-1257:CE + "\N{LATIN CAPITAL LETTER K WITH CEDILLA}" # 
windows-1257:CD + "\N{LATIN CAPITAL LETTER L WITH ACUTE}" # windows-1250:C5 + "\N{LATIN CAPITAL LETTER L WITH CEDILLA}" # windows-1257:CF + "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1257:D9 + "\N{LATIN CAPITAL LETTER N WITH ACUTE}" # windows-1250:D1 + "\N{LATIN CAPITAL LETTER N WITH CARON}" # windows-1250:D2 + "\N{LATIN CAPITAL LETTER N WITH CEDILLA}" # windows-1257:D2 + "\N{LATIN CAPITAL LETTER N WITH TILDE}" # latin-1:D1 + "\N{LATIN CAPITAL LETTER O WITH ACUTE}" # latin-1:D3 + "\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}" # latin-1:D4 + "\N{LATIN CAPITAL LETTER O WITH DIAERESIS}" # latin-1:D6 + "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" # windows-1250:D5 + "\N{LATIN CAPITAL LETTER O WITH GRAVE}" # latin-1:D2 + "\N{LATIN CAPITAL LETTER O WITH MACRON}" # windows-1257:D4 + "\N{LATIN CAPITAL LETTER O WITH STROKE}" # latin-1:D8 + "\N{LATIN CAPITAL LETTER O WITH TILDE}" # latin-1:D5 + "\N{LATIN CAPITAL LETTER R WITH CARON}" # windows-1250:D8 + "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1257:DA + "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1257:D0 + "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1254:DE + "\N{LATIN CAPITAL LETTER T WITH CEDILLA}" # windows-1250:DE + "\N{LATIN CAPITAL LETTER THORN}" # latin-1:DE + "\N{LATIN CAPITAL LETTER U WITH ACUTE}" # latin-1:DA + "\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}" # latin-1:DB + "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" # latin-1:DC + "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}" # windows-1250:DB + "\N{LATIN CAPITAL LETTER U WITH GRAVE}" # latin-1:D9 + "\N{LATIN CAPITAL LETTER U WITH MACRON}" # windows-1257:DB + "\N{LATIN CAPITAL LETTER U WITH OGONEK}" # windows-1257:D8 + "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}" # windows-1250:D9 + "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" # latin-1:DD + "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1257:CA + "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1257:DE + "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1257:DD + "\N{LATIN SMALL 
LETTER SHARP S}" # latin-1:DF + "\N{MULTIPLICATION SIGN}" # latin-1:D7 + "\N{GREEK CAPITAL LETTER BETA}" # windows-1253:C2 + "\N{GREEK CAPITAL LETTER GAMMA}" # windows-1253:C3 + "\N{GREEK CAPITAL LETTER DELTA}" # windows-1253:C4 + "\N{GREEK CAPITAL LETTER EPSILON}" # windows-1253:C5 + "\N{GREEK CAPITAL LETTER ZETA}" # windows-1253:C6 + "\N{GREEK CAPITAL LETTER ETA}" # windows-1253:C7 + "\N{GREEK CAPITAL LETTER THETA}" # windows-1253:C8 + "\N{GREEK CAPITAL LETTER IOTA}" # windows-1253:C9 + "\N{GREEK CAPITAL LETTER KAPPA}" # windows-1253:CA + "\N{GREEK CAPITAL LETTER LAMDA}" # windows-1253:CB + "\N{GREEK CAPITAL LETTER MU}" # windows-1253:CC + "\N{GREEK CAPITAL LETTER NU}" # windows-1253:CD + "\N{GREEK CAPITAL LETTER XI}" # windows-1253:CE + "\N{GREEK CAPITAL LETTER OMICRON}" # windows-1253:CF + "\N{GREEK CAPITAL LETTER PI}" # windows-1253:D0 + "\N{GREEK CAPITAL LETTER RHO}" # windows-1253:D1 + "\N{GREEK CAPITAL LETTER SIGMA}" # windows-1253:D3 + "\N{GREEK CAPITAL LETTER TAU}" # windows-1253:D4 + "\N{GREEK CAPITAL LETTER UPSILON}" # windows-1253:D5 + "\N{GREEK CAPITAL LETTER PHI}" # windows-1253:D6 + "\N{GREEK CAPITAL LETTER CHI}" # windows-1253:D7 + "\N{GREEK CAPITAL LETTER PSI}" # windows-1253:D8 + "\N{GREEK CAPITAL LETTER OMEGA}" # windows-1253:D9 + "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}" # windows-1253:DA + "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}" # windows-1253:DB + "\N{GREEK SMALL LETTER ALPHA WITH TONOS}" # windows-1253:DC + "\N{GREEK SMALL LETTER EPSILON WITH TONOS}" # windows-1253:DD + "\N{GREEK SMALL LETTER ETA WITH TONOS}" # windows-1253:DE + "\N{GREEK SMALL LETTER IOTA WITH TONOS}" # windows-1253:DF + "\N{CYRILLIC CAPITAL LETTER VE}" # windows-1251:C2 + "\N{CYRILLIC CAPITAL LETTER GHE}" # windows-1251:C3 + "\N{CYRILLIC CAPITAL LETTER DE}" # windows-1251:C4 + "\N{CYRILLIC CAPITAL LETTER IE}" # windows-1251:C5 + "\N{CYRILLIC CAPITAL LETTER ZHE}" # windows-1251:C6 + "\N{CYRILLIC CAPITAL LETTER ZE}" # windows-1251:C7 + "\N{CYRILLIC CAPITAL 
LETTER I}" # windows-1251:C8 + "\N{CYRILLIC CAPITAL LETTER SHORT I}" # windows-1251:C9 + "\N{CYRILLIC CAPITAL LETTER KA}" # windows-1251:CA + "\N{CYRILLIC CAPITAL LETTER EL}" # windows-1251:CB + "\N{CYRILLIC CAPITAL LETTER EM}" # windows-1251:CC + "\N{CYRILLIC CAPITAL LETTER EN}" # windows-1251:CD + "\N{CYRILLIC CAPITAL LETTER O}" # windows-1251:CE + "\N{CYRILLIC CAPITAL LETTER PE}" # windows-1251:CF + "\N{CYRILLIC CAPITAL LETTER ER}" # windows-1251:D0 + "\N{CYRILLIC CAPITAL LETTER ES}" # windows-1251:D1 + "\N{CYRILLIC CAPITAL LETTER TE}" # windows-1251:D2 + "\N{CYRILLIC CAPITAL LETTER U}" # windows-1251:D3 + "\N{CYRILLIC CAPITAL LETTER EF}" # windows-1251:D4 + "\N{CYRILLIC CAPITAL LETTER HA}" # windows-1251:D5 + "\N{CYRILLIC CAPITAL LETTER TSE}" # windows-1251:D6 + "\N{CYRILLIC CAPITAL LETTER CHE}" # windows-1251:D7 + "\N{CYRILLIC CAPITAL LETTER SHA}" # windows-1251:D8 + "\N{CYRILLIC CAPITAL LETTER SHCHA}" # windows-1251:D9 + "\N{CYRILLIC CAPITAL LETTER HARD SIGN}" # windows-1251:DA + "\N{CYRILLIC CAPITAL LETTER YERU}" # windows-1251:DB + "\N{CYRILLIC CAPITAL LETTER SOFT SIGN}" # windows-1251:DC + "\N{CYRILLIC CAPITAL LETTER E}" # windows-1251:DD + "\N{CYRILLIC CAPITAL LETTER YU}" # windows-1251:DE + "\N{CYRILLIC CAPITAL LETTER YA}" # windows-1251:DF ), # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding - "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕąįāęēźėģķīļΰαβγδεζηθικλμνξοабвгдежзийклмноп"), + "utf8_first_of_3": ( + "\N{LATIN SMALL LETTER A WITH ACUTE}" # latin-1:E1 + "\N{LATIN SMALL LETTER A WITH BREVE}" # windows-1250:E3 + "\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}" # latin-1:E2 + "\N{LATIN SMALL LETTER A WITH DIAERESIS}" # latin-1:E4 + "\N{LATIN SMALL LETTER A WITH GRAVE}" # latin-1:E0 + "\N{LATIN SMALL LETTER A WITH MACRON}" # windows-1257:E2 + "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1257:E0 + "\N{LATIN SMALL LETTER A WITH RING ABOVE}" # latin-1:E5 + "\N{LATIN SMALL LETTER A WITH TILDE}" # latin-1:E3 + "\N{LATIN SMALL LETTER AE}" # 
latin-1:E6 + "\N{LATIN SMALL LETTER C WITH ACUTE}" # windows-1250:E6 + "\N{LATIN SMALL LETTER C WITH CARON}" # windows-1250:E8 + "\N{LATIN SMALL LETTER C WITH CEDILLA}" # latin-1:E7 + "\N{LATIN SMALL LETTER D WITH CARON}" # windows-1250:EF + "\N{LATIN SMALL LETTER E WITH ACUTE}" # latin-1:E9 + "\N{LATIN SMALL LETTER E WITH CARON}" # windows-1250:EC + "\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}" # latin-1:EA + "\N{LATIN SMALL LETTER E WITH DIAERESIS}" # latin-1:EB + "\N{LATIN SMALL LETTER E WITH DOT ABOVE}" # windows-1257:EB + "\N{LATIN SMALL LETTER E WITH GRAVE}" # latin-1:E8 + "\N{LATIN SMALL LETTER E WITH MACRON}" # windows-1257:E7 + "\N{LATIN SMALL LETTER E WITH OGONEK}" # windows-1250:EA + "\N{LATIN SMALL LETTER E WITH OGONEK}" # windows-1250:EA + "\N{LATIN SMALL LETTER G WITH CEDILLA}" # windows-1257:EC + "\N{LATIN SMALL LETTER I WITH ACUTE}" # latin-1:ED + "\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}" # latin-1:EE + "\N{LATIN SMALL LETTER I WITH DIAERESIS}" # latin-1:EF + "\N{LATIN SMALL LETTER I WITH GRAVE}" # latin-1:EC + "\N{LATIN SMALL LETTER I WITH MACRON}" # windows-1257:EE + "\N{LATIN SMALL LETTER I WITH OGONEK}" # windows-1257:E1 + "\N{LATIN SMALL LETTER K WITH CEDILLA}" # windows-1257:ED + "\N{LATIN SMALL LETTER L WITH ACUTE}" # windows-1250:E5 + "\N{LATIN SMALL LETTER L WITH CEDILLA}" # windows-1257:EF + "\N{LATIN SMALL LETTER R WITH ACUTE}" # windows-1250:E0 + "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1257:EA + "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}" # windows-1253:E0 + "\N{GREEK SMALL LETTER ALPHA}" # windows-1253:E1 + "\N{GREEK SMALL LETTER BETA}" # windows-1253:E2 + "\N{GREEK SMALL LETTER GAMMA}" # windows-1253:E3 + "\N{GREEK SMALL LETTER DELTA}" # windows-1253:E4 + "\N{GREEK SMALL LETTER EPSILON}" # windows-1253:E5 + "\N{GREEK SMALL LETTER ZETA}" # windows-1253:E6 + "\N{GREEK SMALL LETTER ETA}" # windows-1253:E7 + "\N{GREEK SMALL LETTER THETA}" # windows-1253:E8 + "\N{GREEK SMALL LETTER IOTA}" # windows-1253:E9 + "\N{GREEK 
SMALL LETTER KAPPA}" # windows-1253:EA + "\N{GREEK SMALL LETTER LAMDA}" # windows-1253:EB + "\N{GREEK SMALL LETTER MU}" # windows-1253:EC + "\N{GREEK SMALL LETTER NU}" # windows-1253:ED + "\N{GREEK SMALL LETTER XI}" # windows-1253:EE + "\N{GREEK SMALL LETTER OMICRON}" # windows-1253:EF + "\N{CYRILLIC SMALL LETTER A}" # windows-1251:E0 + "\N{CYRILLIC SMALL LETTER BE}" # windows-1251:E1 + "\N{CYRILLIC SMALL LETTER VE}" # windows-1251:E2 + "\N{CYRILLIC SMALL LETTER GHE}" # windows-1251:E3 + "\N{CYRILLIC SMALL LETTER DE}" # windows-1251:E4 + "\N{CYRILLIC SMALL LETTER IE}" # windows-1251:E5 + "\N{CYRILLIC SMALL LETTER ZHE}" # windows-1251:E6 + "\N{CYRILLIC SMALL LETTER ZE}" # windows-1251:E7 + "\N{CYRILLIC SMALL LETTER I}" # windows-1251:E8 + "\N{CYRILLIC SMALL LETTER SHORT I}" # windows-1251:E9 + "\N{CYRILLIC SMALL LETTER KA}" # windows-1251:EA + "\N{CYRILLIC SMALL LETTER EL}" # windows-1251:EB + "\N{CYRILLIC SMALL LETTER EM}" # windows-1251:EC + "\N{CYRILLIC SMALL LETTER EN}" # windows-1251:ED + "\N{CYRILLIC SMALL LETTER O}" # windows-1251:EE + "\N{CYRILLIC SMALL LETTER PE}" # windows-1251:EF + ), # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding. 
# (Other leading bytes correspond only to unassigned codepoints) - "utf8_first_of_4": ("ðóđğπσруš"), + "utf8_first_of_4": ( + "\N{LATIN SMALL LETTER D WITH STROKE}" # windows-1250:F0 + "\N{LATIN SMALL LETTER ETH}" # latin-1:F0 + "\N{LATIN SMALL LETTER G WITH BREVE}" # windows-1254:F0 + "\N{LATIN SMALL LETTER O WITH ACUTE}" # latin-1:F3 + "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1257:F0 + "\N{GREEK SMALL LETTER PI}" # windows-1253:F0 + "\N{GREEK SMALL LETTER SIGMA}" # windows-1253:F3 + "\N{CYRILLIC SMALL LETTER ER}" # windows-1251:F0 + "\N{CYRILLIC SMALL LETTER U}" # windows-1251:F3 + ), # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, # including a space standing in for 0xA0 "utf8_continuation": ( "\x80-\xbf" - "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" - "ØŖÆøŗæ" - "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" - "–—―‘’‚“”„†‡•…‰‹›€№™" - " " + "\N{SPACE}" # modification of latin-1:A0, NO-BREAK SPACE + "\N{LATIN CAPITAL LETTER A WITH OGONEK}" # windows-1250:A5 + "\N{LATIN CAPITAL LETTER AE}" # windows-1257:AF + "\N{LATIN CAPITAL LETTER L WITH CARON}" # windows-1250:BC + "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1250:A3 + "\N{LATIN CAPITAL LETTER O WITH STROKE}" # windows-1257:A8 + "\N{LATIN CAPITAL LETTER R WITH CEDILLA}" # windows-1257:AA + "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1250:8C + "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1252:8A + "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1250:AA + "\N{LATIN CAPITAL LETTER T WITH CARON}" # windows-1250:8D + "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" # windows-1252:9F + "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1250:8F + "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1252:8E + "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1250:AF + "\N{LATIN CAPITAL LIGATURE OE}" # windows-1252:8C + "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1250:B9 + "\N{LATIN SMALL LETTER AE}" # windows-1257:BF + "\N{LATIN SMALL LETTER F WITH HOOK}" # windows-1252:83 + "\N{LATIN SMALL 
LETTER L WITH CARON}" # windows-1250:BE + "\N{LATIN SMALL LETTER L WITH STROKE}" # windows-1250:B3 + "\N{LATIN SMALL LETTER O WITH STROKE}" # windows-1257:B8 + "\N{LATIN SMALL LETTER R WITH CEDILLA}" # windows-1257:BA + "\N{LATIN SMALL LETTER S WITH ACUTE}" # windows-1250:9C + "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1252:9A + "\N{LATIN SMALL LETTER S WITH CEDILLA}" # windows-1250:BA + "\N{LATIN SMALL LETTER T WITH CARON}" # windows-1250:9D + "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1250:9F + "\N{LATIN SMALL LETTER Z WITH CARON}" # windows-1252:9E + "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" # windows-1250:BF + "\N{LATIN SMALL LIGATURE OE}" # windows-1252:9C + "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # windows-1252:88 + "\N{CARON}" # windows-1250:A1 + "\N{BREVE}" # windows-1250:A2 + "\N{OGONEK}" # windows-1250:B2 + "\N{SMALL TILDE}" # windows-1252:98 + "\N{DOUBLE ACUTE ACCENT}" # windows-1250:BD + "\N{GREEK TONOS}" # windows-1253:B4 + "\N{GREEK DIALYTIKA TONOS}" # windows-1253:A1 + "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" # windows-1253:A2 + "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" # windows-1253:B8 + "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" # windows-1253:B9 + "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" # windows-1253:BA + "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" # windows-1253:BC + "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" # windows-1253:BE + "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" # windows-1253:BF + "\N{CYRILLIC CAPITAL LETTER IO}" # windows-1251:A8 + "\N{CYRILLIC CAPITAL LETTER DJE}" # windows-1251:80 + "\N{CYRILLIC CAPITAL LETTER GJE}" # windows-1251:81 + "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}" # windows-1251:AA + "\N{CYRILLIC CAPITAL LETTER DZE}" # windows-1251:BD + "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B2 + "\N{CYRILLIC CAPITAL LETTER YI}" # windows-1251:AF + "\N{CYRILLIC CAPITAL LETTER JE}" # windows-1251:A3 + "\N{CYRILLIC CAPITAL LETTER LJE}" # windows-1251:8A + "\N{CYRILLIC CAPITAL 
LETTER NJE}" # windows-1251:8C + "\N{CYRILLIC CAPITAL LETTER TSHE}" # windows-1251:8E + "\N{CYRILLIC CAPITAL LETTER KJE}" # windows-1251:8D + "\N{CYRILLIC CAPITAL LETTER SHORT U}" # windows-1251:A1 + "\N{CYRILLIC CAPITAL LETTER DZHE}" # windows-1251:8F + "\N{CYRILLIC SMALL LETTER IO}" # windows-1251:B8 + "\N{CYRILLIC SMALL LETTER DJE}" # windows-1251:90 + "\N{CYRILLIC SMALL LETTER GJE}" # windows-1251:83 + "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}" # windows-1251:BA + "\N{CYRILLIC SMALL LETTER DZE}" # windows-1251:BE + "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B3 + "\N{CYRILLIC SMALL LETTER YI}" # windows-1251:BF + "\N{CYRILLIC SMALL LETTER JE}" # windows-1251:BC + "\N{CYRILLIC SMALL LETTER LJE}" # windows-1251:9A + "\N{CYRILLIC SMALL LETTER NJE}" # windows-1251:9C + "\N{CYRILLIC SMALL LETTER TSHE}" # windows-1251:9E + "\N{CYRILLIC SMALL LETTER KJE}" # windows-1251:9D + "\N{CYRILLIC SMALL LETTER SHORT U}" # windows-1251:A2 + "\N{CYRILLIC SMALL LETTER DZHE}" # windows-1251:9F + "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" # windows-1251:A5 + "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" # windows-1251:B4 + "\N{EN DASH}" # windows-1252:96 + "\N{EM DASH}" # windows-1252:97 + "\N{HORIZONTAL BAR}" # windows-1253:AF + "\N{LEFT SINGLE QUOTATION MARK}" # windows-1252:91 + "\N{RIGHT SINGLE QUOTATION MARK}" # windows-1252:92 + "\N{SINGLE LOW-9 QUOTATION MARK}" # windows-1252:82 + "\N{LEFT DOUBLE QUOTATION MARK}" # windows-1252:93 + "\N{RIGHT DOUBLE QUOTATION MARK}" # windows-1252:94 + "\N{DOUBLE LOW-9 QUOTATION MARK}" # windows-1252:84 + "\N{DAGGER}" # windows-1252:86 + "\N{DOUBLE DAGGER}" # windows-1252:87 + "\N{BULLET}" # windows-1252:95 + "\N{HORIZONTAL ELLIPSIS}" # windows-1252:85 + "\N{PER MILLE SIGN}" # windows-1252:89 + "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" # windows-1252:8B + "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" # windows-1252:9B + "\N{EURO SIGN}" # windows-1252:80 + "\N{NUMERO SIGN}" # windows-1251:B9 + "\N{TRADE MARK 
SIGN}" # windows-1252:99 ), # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, # and don't usually stand for themselves when adjacent to mojibake. # This excludes spaces, dashes, quotation marks, and ellipses. "utf8_continuation_strict": ( "\x80-\xbf" - "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" - "ØŖÆøŗæ" - "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" - "†‡•‰‹›€№™" + "\N{LATIN CAPITAL LETTER A WITH OGONEK}" # windows-1250:A5 + "\N{LATIN CAPITAL LETTER AE}" # windows-1257:AF + "\N{LATIN CAPITAL LETTER L WITH CARON}" # windows-1250:BC + "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1250:A3 + "\N{LATIN CAPITAL LETTER O WITH STROKE}" # windows-1257:A8 + "\N{LATIN CAPITAL LETTER R WITH CEDILLA}" # windows-1257:AA + "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1250:8C + "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1252:8A + "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1250:AA + "\N{LATIN CAPITAL LETTER T WITH CARON}" # windows-1250:8D + "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" # windows-1252:9F + "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1250:8F + "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1252:8E + "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1250:AF + "\N{LATIN CAPITAL LIGATURE OE}" # windows-1252:8C + "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1250:B9 + "\N{LATIN SMALL LETTER AE}" # windows-1257:BF + "\N{LATIN SMALL LETTER F WITH HOOK}" # windows-1252:83 + "\N{LATIN SMALL LETTER L WITH CARON}" # windows-1250:BE + "\N{LATIN SMALL LETTER L WITH STROKE}" # windows-1250:B3 + "\N{LATIN SMALL LETTER O WITH STROKE}" # windows-1257:B8 + "\N{LATIN SMALL LETTER R WITH CEDILLA}" # windows-1257:BA + "\N{LATIN SMALL LETTER S WITH ACUTE}" # windows-1250:9C + "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1252:9A + "\N{LATIN SMALL LETTER S WITH CEDILLA}" # windows-1250:BA + "\N{LATIN SMALL LETTER T WITH CARON}" # windows-1250:9D + "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1250:9F + "\N{LATIN SMALL LETTER Z WITH CARON}" 
# windows-1252:9E + "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" # windows-1250:BF + "\N{LATIN SMALL LIGATURE OE}" # windows-1252:9C + "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # windows-1252:88 + "\N{CARON}" # windows-1250:A1 + "\N{BREVE}" # windows-1250:A2 + "\N{OGONEK}" # windows-1250:B2 + "\N{SMALL TILDE}" # windows-1252:98 + "\N{DOUBLE ACUTE ACCENT}" # windows-1250:BD + "\N{GREEK TONOS}" # windows-1253:B4 + "\N{GREEK DIALYTIKA TONOS}" # windows-1253:A1 + "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" # windows-1253:A2 + "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" # windows-1253:B8 + "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" # windows-1253:B9 + "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" # windows-1253:BA + "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" # windows-1253:BC + "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" # windows-1253:BE + "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" # windows-1253:BF + "\N{CYRILLIC CAPITAL LETTER IO}" # windows-1251:A8 + "\N{CYRILLIC CAPITAL LETTER DJE}" # windows-1251:80 + "\N{CYRILLIC CAPITAL LETTER GJE}" # windows-1251:81 + "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}" # windows-1251:AA + "\N{CYRILLIC CAPITAL LETTER DZE}" # windows-1251:BD + "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B2 + "\N{CYRILLIC CAPITAL LETTER YI}" # windows-1251:AF + "\N{CYRILLIC CAPITAL LETTER JE}" # windows-1251:A3 + "\N{CYRILLIC CAPITAL LETTER LJE}" # windows-1251:8A + "\N{CYRILLIC CAPITAL LETTER NJE}" # windows-1251:8C + "\N{CYRILLIC CAPITAL LETTER TSHE}" # windows-1251:8E + "\N{CYRILLIC CAPITAL LETTER KJE}" # windows-1251:8D + "\N{CYRILLIC CAPITAL LETTER SHORT U}" # windows-1251:A1 + "\N{CYRILLIC CAPITAL LETTER DZHE}" # windows-1251:8F + "\N{CYRILLIC SMALL LETTER IO}" # windows-1251:B8 + "\N{CYRILLIC SMALL LETTER DJE}" # windows-1251:90 + "\N{CYRILLIC SMALL LETTER GJE}" # windows-1251:83 + "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}" # windows-1251:BA + "\N{CYRILLIC SMALL LETTER DZE}" # windows-1251:BE + "\N{CYRILLIC SMALL LETTER 
BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B3 + "\N{CYRILLIC SMALL LETTER YI}" # windows-1251:BF + "\N{CYRILLIC SMALL LETTER JE}" # windows-1251:BC + "\N{CYRILLIC SMALL LETTER LJE}" # windows-1251:9A + "\N{CYRILLIC SMALL LETTER NJE}" # windows-1251:9C + "\N{CYRILLIC SMALL LETTER TSHE}" # windows-1251:9E + "\N{CYRILLIC SMALL LETTER KJE}" # windows-1251:9D + "\N{CYRILLIC SMALL LETTER SHORT U}" # windows-1251:A2 + "\N{CYRILLIC SMALL LETTER DZHE}" # windows-1251:9F + "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" # windows-1251:A5 + "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" # windows-1251:B4 + "\N{DAGGER}" # windows-1252:86 + "\N{DOUBLE DAGGER}" # windows-1252:87 + "\N{BULLET}" # windows-1252:95 + "\N{PER MILLE SIGN}" # windows-1252:89 + "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" # windows-1252:8B + "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" # windows-1252:9B + "\N{EURO SIGN}" # windows-1252:80 + "\N{NUMERO SIGN}" # windows-1251:B9 + "\N{TRADE MARK SIGN}" # windows-1252:99 ), } diff --git a/scripts/char_data_table.py b/scripts/char_data_table.py index cc174a4..78a5957 100644 --- a/scripts/char_data_table.py +++ b/scripts/char_data_table.py @@ -1,6 +1,11 @@ -from ftfy.chardata import UTF8_CLUES -from dataclasses import dataclass +""" +Used to regenerate character tables in ftfy/chardata.py with explanatory comments. 
+""" + import unicodedata +from dataclasses import dataclass + +from ftfy.chardata import UTF8_CLUES @dataclass From 7ff0548ad3e9270285a44abd64b4a8aca4393121 Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 13:55:58 -0400 Subject: [PATCH 20/34] BULLET doesn't belong in utf8_continuation_strict --- ftfy/chardata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ftfy/chardata.py b/ftfy/chardata.py index 934fc76..b45fb36 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -650,7 +650,6 @@ def _build_width_map() -> dict[int, str]: "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" # windows-1251:B4 "\N{DAGGER}" # windows-1252:86 "\N{DOUBLE DAGGER}" # windows-1252:87 - "\N{BULLET}" # windows-1252:95 "\N{PER MILLE SIGN}" # windows-1252:89 "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" # windows-1252:8B "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" # windows-1252:9B From aa434cfce7a9980dfc1fe4a069fe8fdc44a3e2bc Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 14:38:18 -0400 Subject: [PATCH 21/34] support cp850 mojibake --- docs/encodings.rst | 3 ++- ftfy/chardata.py | 3 ++- tests/test_cases.json | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/encodings.rst b/docs/encodings.rst index 69fa91d..b2d9804 100644 --- a/docs/encodings.rst +++ b/docs/encodings.rst @@ -14,7 +14,8 @@ ftfy can understand text that was decoded as any of these single-byte encodings: - Windows-1257 (cp1257 -- used in Microsoft products in Baltic countries) - ISO-8859-2 (which is not quite the same as Windows-1250) - MacRoman (used on Mac OS 9 and earlier) -- cp437 (used in MS-DOS and some versions of the Windows command prompt) +- cp437 (used in MS-DOS, and some versions of the Windows command prompt, in the Americas) +- cp850 (used in MS-DOS, and some versions of the Windows command prompt, in Western Europe) when it was actually intended to be decoded as one of these variable-length encodings: diff --git 
a/ftfy/chardata.py b/ftfy/chardata.py index b45fb36..0da271a 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -23,6 +23,7 @@ "iso-8859-2", "macroman", "cp437", + "cp850", ] SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]") @@ -570,7 +571,7 @@ def _build_width_map() -> dict[int, str]: ), # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, # and don't usually stand for themselves when adjacent to mojibake. - # This excludes spaces, dashes, quotation marks, and ellipses. + # This excludes spaces, dashes, 'bullet', quotation marks, and ellipses. "utf8_continuation_strict": ( "\x80-\xbf" "\N{LATIN CAPITAL LETTER A WITH OGONEK}" # windows-1250:A5 diff --git a/tests/test_cases.json b/tests/test_cases.json index 83f1d4b..e22994b 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -111,8 +111,27 @@ "fixed": "한국어", "expect": "pass" }, + { + "label": "Synthetic: Messy language name in cp850: Czech", + "original": "─îe┼ítina", + "fixed": "Čeština", + "expect": "pass" + }, + { + "label": "Synthetic: Messy language name in cp850: Vietnamese", + "original": "Tiß║┐ng Viß╗çt", + "fixed": "Tiếng Việt", + "expect": "pass" + }, + { + "label": "Synthetic: Messy language name in cp850: Japanese", + "original": "µùѵ£¼Þ¬×", + "fixed": "日本語", + "expect": "pass" + }, { "label": "Low-codepoint emoji", + "comment": "From the ancient era before widespread emoji support on Twitter", "original": "He's Justinâ\u009d¤", "fixed": "He's Justin❤", "expect": "pass" From 1c185ef088f6eaeacff58a9a6706c8c063bcbbeb Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 16:41:12 -0400 Subject: [PATCH 22/34] Add more test cases --- CHANGELOG.md | 8 +++++++- ftfy/badness.py | 17 +++++++++------- tests/test_cases.json | 46 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab85e10..083261c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,13 @@ -## Version 6.3.0 
(Octuber 8, 2024) +## Version 6.3.0 (October 8, 2024) - Switched packaging from poetry to uv. - Uses modern Python packaging exclusively (no setup.py). +- Added support for mojibake in Windows-1257 (Baltic) and codepage 850 (MS-DOS in Western Europe). +- Detects mojibake for "Ü" in an uppercase word, such as "ZURÜCK". +- Expanded a heuristic that notices improbable punctuation. +- Fixed a false positive involving two concatenated strings, one of which began with the § sign. +- Rewrote `chardata.py` to be more human-readable and debuggable, instead of being full of + keysmash-like character sets. ## Version 6.2.3 (August 5, 2024) diff --git a/ftfy/badness.py b/ftfy/badness.py index 6ac4a76..38ec1f4 100644 --- a/ftfy/badness.py +++ b/ftfy/badness.py @@ -42,8 +42,6 @@ "\N{DIAERESIS}" "\N{NOT SIGN}" "\N{MACRON}" - "\N{PILCROW SIGN}" - "\N{SECTION SIGN}" "\N{CEDILLA}" "\N{LATIN SMALL LETTER F WITH HOOK}" "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier @@ -62,6 +60,11 @@ "\N{FEMININE ORDINAL INDICATOR}" "\N{MASCULINE ORDINAL INDICATOR}" ), + # Characters used in legalese + "law": ( + "\N{PILCROW SIGN}" + "\N{SECTION SIGN}" + ), "currency": ( "\N{CENT SIGN}" "\N{POUND SIGN}" @@ -272,11 +275,11 @@ r""" [{c1}] | - [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}] + [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] [{bad}] | [a-zA-Z] [{lower_common}{upper_common}] [{bad}] | - [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] + [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] | [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}] | @@ -286,11 +289,11 @@ | \s [{upper_accented}] [{currency}] | - [{upper_accented}{box}] [{numeric}] + [{upper_accented}{box}] [{numeric}{law}] | 
[{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}] | - [{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}] + [{lower_accented}{upper_accented}{currency}{numeric}{box}{law}] [{end_punctuation}] [{start_punctuation}] | [{currency}{numeric}{box}] [{start_punctuation}] | @@ -298,7 +301,7 @@ | [{box}] [{kaomoji}] | - [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}] + [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}{law}] [{box}] | [{box}] [{end_punctuation}] | diff --git a/tests/test_cases.json b/tests/test_cases.json index e22994b..9c58572 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -311,6 +311,13 @@ "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", "expect": "pass" }, + { + "label": "UTF-8 / ISO-8859-2 mixup in Czech", + "comment": "This says 'I've had enough of the third millennium', which is great because it involves software decisions made in the second", + "original": "MĂĄm dost tĹ\u0099etĂ\u00adho tisĂ\u00adciletĂ\u00ad", + "fixed": "Mám dost třetího tisíciletí", + "expect": "pass" + }, { "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", "comment": "A difficult test case that can depend on the order that steps are applied", @@ -759,6 +766,20 @@ "fixed": "ongeëvenaard", "expect": "pass" }, + { + "label": "HTML entity on top of UTF-8 / Latin-1", + "original": "10μs", + "fixed-encoding": "10μs", + "fixed": "10μs", + "expect": "pass" + }, + { + "label": "Negative: Two concatenated strings", + "comment": "Should not turn into 'fratarak᧠141'", + "original": "Oborzos, per. Vahbarz, frataraká§ 141", + "fixed": "Oborzos, per. 
Vahbarz, frataraká§ 141", + "expect": "pass" + }, { "label": "Negative: Indonesian leetspeak", "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", @@ -797,6 +818,24 @@ "fixed": "Šveices baņķieri gaida konkrētus investīciju projektus", "expect": "pass" }, + { + "label": "Latvian UTF-8 / MacRoman mojibake", + "original": "SaeimƒÅ ievƒìlƒìtƒÅs partijas \"Progresƒ´vie\" lƒ´dzvadƒ´tƒÅja Anto≈Üina ≈Öena≈°eva atbild uz ≈æurnƒÅlistu jautƒÅjumiem pƒìc partijas tik≈°anƒÅs ar Valsts prezidentu Rƒ´gas pilƒ´,", + "fixed": "Saeimā ievēlētās partijas \"Progresīvie\" līdzvadītāja Antoņina Ņenaševa atbild uz žurnālistu jautājumiem pēc partijas tikšanās ar Valsts prezidentu Rīgas pilī,", + "expect": "pass" + }, + { + "label": "Lithuanian UTF-8 / Windows-1257 mojibake", + "original": "Å iaip ÄÆdomu, kaip ÄÆsivaizduoji. Visų pirma tam reikia laiko.", + "fixed": "Šiaip įdomu, kaip įsivaizduoji. Visų pirma tam reikia laiko.", + "expect": "pass" + }, + { + "label": "Lithuanian UTF-8 / Windows-1250 mojibake", + "original": "Lietuva pagrÄŻstai gali paklausti: Ĺ˝inoma, kad ne.", + "fixed": "Lietuva pagrįstai gali paklausti: Žinoma, kad ne.", + "expect": "pass" + }, { "label": "Hebrew UTF-8 / Windows-1252 mojibake", "comment": "reported by SuperIRabbit as issue #158", @@ -804,6 +843,13 @@ "fixed": "בהודעה", "expect": "pass" }, + { + "label": "Wide comma in UTF-8 / Windows-1252", + "original": "Ningbo,China", + "fixed-encoding": "Ningbo,China", + "fixed": "Ningbo,China", + "expect": "pass" + }, { "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", "original": "בהודעה", From 8b791317cab335628bb216d5f3ae5e97cf54dcbf Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 17:02:56 -0400 Subject: [PATCH 23/34] back out support for cp850, it doesn't work well --- CHANGELOG.md | 2 +- docs/encodings.rst | 5 +++-- ftfy/chardata.py | 1 - notes/mysteries.txt | 10 ++++++++++ tests/test_cases.json | 10 ++-------- 5 files 
changed, 16 insertions(+), 12 deletions(-) create mode 100644 notes/mysteries.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 083261c..dd63183 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ - Switched packaging from poetry to uv. - Uses modern Python packaging exclusively (no setup.py). -- Added support for mojibake in Windows-1257 (Baltic) and codepage 850 (MS-DOS in Western Europe). +- Added support for mojibake in Windows-1257 (Baltic). - Detects mojibake for "Ü" in an uppercase word, such as "ZURÜCK". - Expanded a heuristic that notices improbable punctuation. - Fixed a false positive involving two concatenated strings, one of which began with the § sign. diff --git a/docs/encodings.rst b/docs/encodings.rst index b2d9804..b0513a7 100644 --- a/docs/encodings.rst +++ b/docs/encodings.rst @@ -14,8 +14,7 @@ ftfy can understand text that was decoded as any of these single-byte encodings: - Windows-1257 (cp1257 -- used in Microsoft products in Baltic countries) - ISO-8859-2 (which is not quite the same as Windows-1250) - MacRoman (used on Mac OS 9 and earlier) -- cp437 (used in MS-DOS, and some versions of the Windows command prompt, in the Americas) -- cp850 (used in MS-DOS, and some versions of the Windows command prompt, in Western Europe) +- cp437 (it's the "text mode" in your video card firmware) when it was actually intended to be decoded as one of these variable-length encodings: @@ -28,6 +27,8 @@ However, ftfy cannot understand other mixups between single-byte encodings, beca We also can't handle the legacy encodings used for Chinese, Japanese, and Korean, such as ``shift-jis`` and ``gb18030``. See `issue #34`_ for why this is so hard. +I tried adding support for cp850, the cp437-workalike that supported European languages, but I couldn't find any real examples that it fixed, and it introduced some false positives. + .. 
_`issue #34`: https://github.com/rspeer/python-ftfy/issues/34 Remember that the input to ftfy is Unicode, so it handles actual CJK *text* just fine. It just can't discover that a CJK *encoding* introduced mojibake into the text. diff --git a/ftfy/chardata.py b/ftfy/chardata.py index 0da271a..afcc767 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -23,7 +23,6 @@ "iso-8859-2", "macroman", "cp437", - "cp850", ] SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]") diff --git a/notes/mysteries.txt b/notes/mysteries.txt new file mode 100644 index 0000000..23e4a9e --- /dev/null +++ b/notes/mysteries.txt @@ -0,0 +1,10 @@ +on https://www.nipette.com/article-6358031.html, a comment is signed 'MÃ\x83©Ã\x82¬Ã\x82¡nie'. +This happens to be triple-UTF-8 for 'M鬡nie', but that's probably not the name they meant. + +What exactly did https://www.horoskopy-horoskop.cz/clanek/431-numerologicky-vyznam-jmena-jaromir +mean when they said 'TadeÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂáÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂá' ? + +https://mtlurb.com/tags/arbres/ +'montrã©al' probably isn't in cp850, but what is it? 
+ + diff --git a/tests/test_cases.json b/tests/test_cases.json index 9c58572..005dab2 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -112,23 +112,17 @@ "expect": "pass" }, { - "label": "Synthetic: Messy language name in cp850: Czech", + "label": "Synthetic: Messy language name in cp437: Czech", "original": "─îe┼ítina", "fixed": "Čeština", "expect": "pass" }, { - "label": "Synthetic: Messy language name in cp850: Vietnamese", + "label": "Synthetic: Messy language name in cp437: Vietnamese", "original": "Tiß║┐ng Viß╗çt", "fixed": "Tiếng Việt", "expect": "pass" }, - { - "label": "Synthetic: Messy language name in cp850: Japanese", - "original": "µùѵ£¼Þ¬×", - "fixed": "日本語", - "expect": "pass" - }, { "label": "Low-codepoint emoji", "comment": "From the ancient era before widespread emoji support on Twitter", From 8ac691ce209543d98082911f609bcb5dcd9b6026 Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 17:20:33 -0400 Subject: [PATCH 24/34] try to fix github publishing --- .github/workflows/publish.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 5488842..c092f2e 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -35,5 +35,3 @@ jobs: - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - with: - packages_dir: artifact/ From 38b966756ffd348954621be1817faa152a8b7851 Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Thu, 10 Oct 2024 17:31:47 -0400 Subject: [PATCH 25/34] replace my publish config with the guide --- .github/workflows/publish.yml | 121 ++++++++++++++++++++++++++++------ 1 file changed, 102 insertions(+), 19 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index c092f2e..fd2da55 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,37 +1,120 @@ -name: Publish +name: Publish Python 🐍 distribution 📦 to PyPI and 
TestPyPI + on: push: tags: - "v*" + jobs: build: + name: Build distribution 📦 runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 with: - python-version: 3.11 + name: python-package-distributions + path: dist/ - - run: | - pip install hatchling - hatchling build + publish-to-pypi: + name: >- + Publish Python 🐍 distribution 📦 to PyPI + if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + needs: + - build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/ftfy + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing - - uses: actions/upload-artifact@v3 + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 with: - path: ./dist - - pypi-publish: - needs: ["build"] - environment: "publish" + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 - name: upload release to PyPI + github-release: + name: >- + Sign the Python 🐍 distribution 📦 with Sigstore + and upload them to GitHub Release + needs: + - publish-to-pypi runs-on: ubuntu-latest + permissions: - # IMPORTANT: this permission is mandatory for trusted publishing - id-token: write + contents: write # IMPORTANT: mandatory for making GitHub Releases + id-token: write # IMPORTANT: mandatory for sigstore + steps: - - uses: actions/download-artifact@v4.1.7 + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Sign the dists with Sigstore + uses: 
sigstore/gh-action-sigstore-python@v2.1.1 + with: + inputs: >- + ./dist/*.tar.gz + ./dist/*.whl + - name: Create GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release create + '${{ github.ref_name }}' + --repo '${{ github.repository }}' + --notes "" + - name: Upload artifact signatures to GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + # Upload to GitHub Release using the `gh` CLI. + # `dist/` contains the built packages, and the + # sigstore-produced signatures and certificates. + run: >- + gh release upload + '${{ github.ref_name }}' dist/** + --repo '${{ github.repository }}' + + publish-to-testpypi: + name: Publish Python 🐍 distribution 📦 to TestPyPI + needs: + - build + runs-on: ubuntu-latest + + environment: + name: testpypi + url: https://test.pypi.org/p/ftfy - - name: Publish package distributions to PyPI + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ From cbb28da8484d68192018e2f5dcb2eb104e4e46dc Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Fri, 11 Oct 2024 00:47:47 -0400 Subject: [PATCH 26/34] fix outdated/irrelevant parts of publish.yml --- .github/workflows/publish.yml | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index fd2da55..ede12d9 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,4 +1,4 @@ -name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI +name: Publish Python distribution 📦 to PyPI on: push: @@ -32,7 +32,7 @@ jobs: publish-to-pypi: name: >- - Publish Python 🐍 distribution 📦 to PyPI + Publish Python distribution 📦 to PyPI if: 
startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes needs: - build @@ -54,7 +54,7 @@ jobs: github-release: name: >- - Sign the Python 🐍 distribution 📦 with Sigstore + Sign the Python distribution 📦 with Sigstore and upload them to GitHub Release needs: - publish-to-pypi @@ -71,7 +71,7 @@ jobs: name: python-package-distributions path: dist/ - name: Sign the dists with Sigstore - uses: sigstore/gh-action-sigstore-python@v2.1.1 + uses: sigstore/gh-action-sigstore-python@v3.0.0 with: inputs: >- ./dist/*.tar.gz @@ -94,27 +94,3 @@ jobs: gh release upload '${{ github.ref_name }}' dist/** --repo '${{ github.repository }}' - - publish-to-testpypi: - name: Publish Python 🐍 distribution 📦 to TestPyPI - needs: - - build - runs-on: ubuntu-latest - - environment: - name: testpypi - url: https://test.pypi.org/p/ftfy - - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - - steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - repository-url: https://test.pypi.org/legacy/ From 5961126af6334b6bc47c4808a97cdd7889f931f1 Mon Sep 17 00:00:00 2001 From: arborelia Date: Fri, 25 Oct 2024 20:44:40 -0400 Subject: [PATCH 27/34] packaging updates for v6.3.1 --- CHANGELOG.md | 5 +++++ README.md | 2 +- ftfy/__init__.py | 22 ++++++++++++++++------ pyproject.toml | 26 +++++++++++--------------- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd63183..28f6c20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## Version 6.3.1 (October 25, 2024) + +- Fixed `license` metadata field in pyproject.toml. +- Removed extraneous files from the `hatchling` sdist output. + ## Version 6.3.0 (October 8, 2024) - Switched packaging from poetry to uv. 
diff --git a/README.md b/README.md index 7b5de26..a138cfc 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ The following text could be encoded in Windows-1252 and decoded in UTF-8, and it ## Installing -ftfy is a Python 3 package that can be installed using `pip`: +ftfy is a Python 3 package that can be installed using `pip` or `uv pip`: pip install ftfy diff --git a/ftfy/__init__.py b/ftfy/__init__.py index 15d280e..cc0a120 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -24,7 +24,7 @@ from ftfy.badness import is_bad from ftfy.formatting import display_ljust -__version__ = "6.3.0" +__version__ = "6.3.1" # Though this function does nothing, it lets linters know that we're using @@ -227,7 +227,9 @@ class TextFixerConfig(NamedTuple): explain: bool = True -def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> TextFixerConfig: +def _config_from_kwargs( + config: TextFixerConfig, kwargs: dict[str, Any] +) -> TextFixerConfig: """ Handle parameters provided as keyword arguments to ftfy's top-level functions, converting them into a TextFixerConfig. @@ -463,7 +465,9 @@ def fix_encoding_and_explain( return ExplainedText(text, plan_so_far) -def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> ExplainedText: +def _fix_encoding_one_step_and_explain( + text: str, config: TextFixerConfig +) -> ExplainedText: """ Perform one step of fixing the encoding of text. 
""" @@ -509,7 +513,9 @@ def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> Ex ): replaced_bytes = fixes.restore_byte_a0(encoded_bytes) if replaced_bytes != encoded_bytes: - transcode_steps.append(ExplanationStep("transcode", "restore_byte_a0")) + transcode_steps.append( + ExplanationStep("transcode", "restore_byte_a0") + ) encoded_bytes = replaced_bytes # Replace sequences where information has been lost @@ -577,7 +583,9 @@ def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> Ex return ExplainedText(text, []) -def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: +def fix_encoding( + text: str, config: TextFixerConfig | None = None, **kwargs: Any +) -> str: """ Apply just the encoding-fixing steps of ftfy to this text. Returns the fixed text, discarding the explanation. @@ -598,7 +606,9 @@ def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any ftfy = fix_text -def fix_text_segment(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: +def fix_text_segment( + text: str, config: TextFixerConfig | None = None, **kwargs: Any +) -> str: """ Fix text as a single segment, with a consistent sequence of steps that are applied to fix the text. Discard the explanation. 
diff --git a/pyproject.toml b/pyproject.toml index b325bad..9588f31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,27 +1,20 @@ [project] name = "ftfy" -version = "6.3.0" +version = "6.3.1" description = "Fixes mojibake and other problems with Unicode, after the fact" -homepage = "https://ftfy.readthedocs.io/en/latest/" -documentation = "https://ftfy.readthedocs.io/en/latest/" -repository = "https://github.com/rspeer/python-ftfy" -authors = [{ name = "Robyn Speer", email = "rspeer@arborelia.net"}] -license = "Apache-2.0" -include = [ - { path = "README.md", format = "sdist" }, - { path = "CHANGELOG.md", format = "sdist" }, - { path = "tests", format = "sdist" }, -] +authors = [{ name = "Robyn Speer", email = "rspeer@arborelia.net" }] +license = { text = "Apache-2.0" } readme = "README.md" -dependencies = [ - "wcwidth" -] +dependencies = ["wcwidth"] requires-python = ">=3.9" [project.scripts] ftfy = "ftfy.cli:main" [project.urls] +Homepage = "https://ftfy.readthedocs.io/en/latest/" +Documentation = "https://ftfy.readthedocs.io/en/latest/" +Repository = "https://github.com/rspeer/python-ftfy" Issues = "https://github.com/rspeer/python-ftfy/issues/" Changelog = "https://github.com/rspeer/python-ftfy/blob/main/CHANGELOG.md" Blog = "https://posts.arborelia.net" @@ -30,12 +23,15 @@ Blog = "https://posts.arborelia.net" requires = ["hatchling"] build-backend = "hatchling.build" +[tool.hatch.build.targets.sdist] +exclude = ["^.github/", "scripts/", ".readthedocs.yaml", "notes/", "notebook/"] + [tool.uv] dev-dependencies = [ "Sphinx >=7, <8", "furo >= 2024.7.18", "pytest >= 8.3.2, < 9", - "ruff" + "ruff", ] [tool.ruff] From 5340af6746ff655a9cd7cb2b50c2fd0b35bb91d3 Mon Sep 17 00:00:00 2001 From: arborelia Date: Fri, 25 Oct 2024 20:47:58 -0400 Subject: [PATCH 28/34] version number updates --- docs/conf.py | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 822883e..0aef57b 100644 --- a/docs/conf.py +++ 
b/docs/conf.py @@ -48,7 +48,7 @@ # The short X.Y version. version = "6.3" # The full version, including alpha/beta/rc tags. -release = "6.3.0rc1" +release = "6.3.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/uv.lock b/uv.lock index 6991a31..438359f 100644 --- a/uv.lock +++ b/uv.lock @@ -153,7 +153,7 @@ wheels = [ [[package]] name = "ftfy" -version = "6.3.0" +version = "6.3.1" source = { editable = "." } dependencies = [ { name = "wcwidth" }, From 67ab5ee51a599fee508c883f5b7f8d0b4b1749e4 Mon Sep 17 00:00:00 2001 From: arborelia Date: Sat, 26 Oct 2024 01:39:31 -0400 Subject: [PATCH 29/34] enable more Ruff code suggestions --- ftfy/__init__.py | 2 +- ftfy/chardata.py | 6 ++++-- pyproject.toml | 9 +++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ftfy/__init__.py b/ftfy/__init__.py index cc0a120..880fb05 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -531,7 +531,7 @@ def _fix_encoding_one_step_and_explain( decoding = "utf-8-variants" decode_step = ExplanationStep("decode", decoding) - steps = [encode_step] + transcode_steps + [decode_step] + steps = [encode_step, *transcode_steps, decode_step] fixed = encoded_bytes.decode(decoding) return ExplainedText(fixed, steps) diff --git a/ftfy/chardata.py b/ftfy/chardata.py index afcc767..ea199d0 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -43,7 +43,7 @@ def _build_regexes() -> dict[str, re.Pattern[str]]: # Make a sequence of characters that bytes \x80 to \xFF decode to # in each encoding, as well as byte \x1A, which is used to represent # the replacement character � in the sloppy-* encodings. 
- byte_range = bytes(list(range(0x80, 0x100)) + [0x1A]) + byte_range = bytes([*range(0x80, 0x100), 0x1A]) charlist = byte_range.decode(encoding) # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B @@ -686,6 +686,8 @@ def _build_width_map() -> dict[int, str]: | [{utf8_first_of_4}] [{utf8_continuation}]{{3}} )+ - """.format(**UTF8_CLUES), + """.format( + **UTF8_CLUES + ), re.VERBOSE, ) diff --git a/pyproject.toml b/pyproject.toml index 9588f31..7b65007 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,8 +40,13 @@ line-length = 100 target-version = "py39" [tool.ruff.lint] -select = ["B", "F", "I", "N", "ANN", "UP"] -ignore = ["ANN101", "ANN401"] +select = ["B", "F", "I", "N", "ANN", "UP", "RUF"] +ignore = [ + "ANN101", + "ANN401", + "RUF001", # complains about Unicode characters that belong in my docstrings + "RUF002", # complains about Unicode characters that belong in my docstrings +] [tool.ruff.lint.per-file-ignores] "tests/*" = ["ANN"] From 4ecbb33eda53b67426b6f712334f23b899faf20e Mon Sep 17 00:00:00 2001 From: arborelia Date: Sat, 26 Oct 2024 01:42:59 -0400 Subject: [PATCH 30/34] mypy checks and formatting --- ftfy/__init__.py | 20 +++++--------------- ftfy/bad_codecs/sloppy.py | 2 +- ftfy/bad_codecs/utf8_variants.py | 2 +- ftfy/chardata.py | 4 +--- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/ftfy/__init__.py b/ftfy/__init__.py index 880fb05..4d48fba 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -227,9 +227,7 @@ class TextFixerConfig(NamedTuple): explain: bool = True -def _config_from_kwargs( - config: TextFixerConfig, kwargs: dict[str, Any] -) -> TextFixerConfig: +def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> TextFixerConfig: """ Handle parameters provided as keyword arguments to ftfy's top-level functions, converting them into a TextFixerConfig. 
@@ -465,9 +463,7 @@ def fix_encoding_and_explain( return ExplainedText(text, plan_so_far) -def _fix_encoding_one_step_and_explain( - text: str, config: TextFixerConfig -) -> ExplainedText: +def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> ExplainedText: """ Perform one step of fixing the encoding of text. """ @@ -513,9 +509,7 @@ def _fix_encoding_one_step_and_explain( ): replaced_bytes = fixes.restore_byte_a0(encoded_bytes) if replaced_bytes != encoded_bytes: - transcode_steps.append( - ExplanationStep("transcode", "restore_byte_a0") - ) + transcode_steps.append(ExplanationStep("transcode", "restore_byte_a0")) encoded_bytes = replaced_bytes # Replace sequences where information has been lost @@ -583,9 +577,7 @@ def _fix_encoding_one_step_and_explain( return ExplainedText(text, []) -def fix_encoding( - text: str, config: TextFixerConfig | None = None, **kwargs: Any -) -> str: +def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: """ Apply just the encoding-fixing steps of ftfy to this text. Returns the fixed text, discarding the explanation. @@ -606,9 +598,7 @@ def fix_encoding( ftfy = fix_text -def fix_text_segment( - text: str, config: TextFixerConfig | None = None, **kwargs: Any -) -> str: +def fix_text_segment(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: """ Fix text as a single segment, with a consistent sequence of steps that are applied to fix the text. Discard the explanation. 
diff --git a/ftfy/bad_codecs/sloppy.py b/ftfy/bad_codecs/sloppy.py index 656f01c..8c65e4f 100644 --- a/ftfy/bad_codecs/sloppy.py +++ b/ftfy/bad_codecs/sloppy.py @@ -143,7 +143,7 @@ class StreamReader(Codec, codecs.StreamReader): return codecs.CodecInfo( name="sloppy-" + encoding, encode=Codec().encode, - decode=Codec().decode, + decode=Codec().decode, # type: ignore[arg-type] incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, diff --git a/ftfy/bad_codecs/utf8_variants.py b/ftfy/bad_codecs/utf8_variants.py index c15a3cf..57807cc 100644 --- a/ftfy/bad_codecs/utf8_variants.py +++ b/ftfy/bad_codecs/utf8_variants.py @@ -248,7 +248,7 @@ def decode(input: bytes, errors: str = "strict") -> tuple[str, int]: CODEC_INFO = codecs.CodecInfo( name=NAME, encode=StreamWriter.encode, - decode=StreamReader.decode, + decode=StreamReader.decode, # type: ignore[arg-type] incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, diff --git a/ftfy/chardata.py b/ftfy/chardata.py index ea199d0..43d117c 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -686,8 +686,6 @@ def _build_width_map() -> dict[int, str]: | [{utf8_first_of_4}] [{utf8_continuation}]{{3}} )+ - """.format( - **UTF8_CLUES - ), + """.format(**UTF8_CLUES), re.VERBOSE, ) From c033c1ace1e92c934f98d59b6e454a663ba683ca Mon Sep 17 00:00:00 2001 From: arborelia Date: Sat, 26 Oct 2024 02:10:44 -0400 Subject: [PATCH 31/34] incorporate some ruff code suggestions --- ftfy/__init__.py | 19 ++++++---- ftfy/bad_codecs/utf8_variants.py | 60 ++++++++++++++------------------ ftfy/cli.py | 10 +++--- ftfy/formatting.py | 9 +++-- pyproject.toml | 3 +- scripts/char_data_table.py | 8 ++--- tests/test_cli.py | 7 ++-- tests/test_examples_in_json.py | 8 ++--- 8 files changed, 61 insertions(+), 63 deletions(-) diff --git a/ftfy/__init__.py b/ftfy/__init__.py index 4d48fba..fb66698 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -9,8 
+9,8 @@ import unicodedata import warnings -from collections.abc import Iterator from typing import ( + TYPE_CHECKING, Any, BinaryIO, Callable, @@ -24,6 +24,9 @@ from ftfy.badness import is_bad from ftfy.formatting import display_ljust +if TYPE_CHECKING: + from collections.abc import Iterator + __version__ = "6.3.1" @@ -241,8 +244,7 @@ def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> Text kwargs = kwargs.copy() kwargs["unescape_html"] = kwargs["fix_entities"] del kwargs["fix_entities"] - config = config._replace(**kwargs) - return config + return config._replace(**kwargs) BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode. @@ -669,12 +671,13 @@ def guess_bytes(bstring: bytes) -> tuple[str, str]: single-byte encoding. """ if isinstance(bstring, str): - raise UnicodeError( + msg = ( "This string was already decoded as Unicode. You should pass " "bytes to guess_bytes, not Unicode." ) + raise UnicodeError(msg) - if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"): + if bstring.startswith((b"\xfe\xff", b"\xff\xfe")): return bstring.decode("utf-16"), "utf-16" byteset = set(bstring) @@ -748,9 +751,11 @@ def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: if encoding in FIXERS: obj = FIXERS[encoding](obj) else: - raise ValueError(f"Unknown function to apply: {encoding}") + msg = f"Unknown function to apply: {encoding}" + raise ValueError(msg) else: - raise ValueError(f"Unknown plan step: {operation}") + msg = f"Unknown plan step: {operation}" + raise ValueError(msg) return obj diff --git a/ftfy/bad_codecs/utf8_variants.py b/ftfy/bad_codecs/utf8_variants.py index 57807cc..eaac3c1 100644 --- a/ftfy/bad_codecs/utf8_variants.py +++ b/ftfy/bad_codecs/utf8_variants.py @@ -166,17 +166,14 @@ def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> tup if len(input) > 1: # Decode the two-byte sequence 0xc0 0x80. return "\u0000", 2 - else: - if final: - # We hit the end of the stream. 
Let the superclass method - # handle it. - return sup(input, errors, True) - else: - # Wait to see another byte. - return "", 0 - else: - # Decode a possible six-byte sequence starting with 0xed. - return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final) + if final: + # We hit the end of the stream. Let the superclass method + # handle it. + return sup(input, errors, True) + # Wait to see another byte. + return "", 0 + # Decode a possible six-byte sequence starting with 0xed. + return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final) @staticmethod def _buffer_decode_surrogates( @@ -205,28 +202,25 @@ def _buffer_decode_surrogates( # handle it as normal UTF-8. It might be a Hangul character # or an error. return sup(input, errors, final) - else: - # We found a surrogate, the stream isn't over yet, and we don't - # know enough of the following bytes to decode anything, so - # consume zero bytes and wait. - return "", 0 - else: - if CESU8_RE.match(input): - # Given this is a CESU-8 sequence, do some math to pull out - # the intended 20-bit value, and consume six bytes. - codepoint = ( - ((input[1] & 0x0F) << 16) - + ((input[2] & 0x3F) << 10) - + ((input[4] & 0x0F) << 6) - + (input[5] & 0x3F) - + 0x10000 - ) - return chr(codepoint), 6 - else: - # This looked like a CESU-8 sequence, but it wasn't one. - # 0xed indicates the start of a three-byte sequence, so give - # three bytes to the superclass to decode as usual. - return sup(input[:3], errors, False) + # We found a surrogate, the stream isn't over yet, and we don't + # know enough of the following bytes to decode anything, so + # consume zero bytes and wait. + return "", 0 + if CESU8_RE.match(input): + # Given this is a CESU-8 sequence, do some math to pull out + # the intended 20-bit value, and consume six bytes. 
+ codepoint = ( + ((input[1] & 0x0F) << 16) + + ((input[2] & 0x3F) << 10) + + ((input[4] & 0x0F) << 6) + + (input[5] & 0x3F) + + 0x10000 + ) + return chr(codepoint), 6 + # This looked like a CESU-8 sequence, but it wasn't one. + # 0xed indicates the start of a three-byte sequence, so give + # three bytes to the superclass to decode as usual. + return sup(input[:3], errors, False) # The encoder is identical to UTF-8. diff --git a/ftfy/cli.py b/ftfy/cli.py index 2807a86..16f3296 100644 --- a/ftfy/cli.py +++ b/ftfy/cli.py @@ -4,6 +4,7 @@ import os import sys +from pathlib import Path from typing import Union from ftfy import TextFixerConfig, __version__, fix_file @@ -101,7 +102,7 @@ def main() -> None: # whatever encoding is necessary. file = sys.stdin.buffer else: - file = open(args.filename, "rb") + file = Path(args.filename).open("rb") if args.output == "-": outfile = sys.stdout @@ -109,17 +110,14 @@ def main() -> None: if os.path.realpath(args.output) == os.path.realpath(args.filename): sys.stderr.write(SAME_FILE_ERROR_TEXT) sys.exit(1) - outfile = open(args.output, "w", encoding="utf-8") + outfile = Path(args.output).open("w", encoding="utf-8") normalization = args.normalization if normalization.lower() == "none": normalization = None unescape_html: Union[str, bool] - if args.preserve_entities: - unescape_html = False - else: - unescape_html = "auto" + unescape_html = False if args.preserve_entities else "auto" config = TextFixerConfig(unescape_html=unescape_html, normalization=normalization) diff --git a/ftfy/formatting.py b/ftfy/formatting.py index 18df64b..4295558 100644 --- a/ftfy/formatting.py +++ b/ftfy/formatting.py @@ -99,7 +99,8 @@ def display_ljust(text: str, width: int, fillchar: str = " ") -> str: correct if you're viewing this code or documentation in a Web browser. 
""" if character_width(fillchar) != 1: - raise ValueError("The padding character must have display width 1") + msg = "The padding character must have display width 1" + raise ValueError(msg) text_width = monospaced_width(text) if text_width == -1: @@ -129,7 +130,8 @@ def display_rjust(text: str, width: int, fillchar: str = " ") -> str: ▒▒▒▒▒▒▒▒ちゃぶ台返し """ if character_width(fillchar) != 1: - raise ValueError("The padding character must have display width 1") + msg = "The padding character must have display width 1" + raise ValueError(msg) text_width = monospaced_width(text) if text_width == -1: @@ -154,7 +156,8 @@ def display_center(text: str, width: int, fillchar: str = " ") -> str: ▒▒▒▒ちゃぶ台返し▒▒▒▒ """ if character_width(fillchar) != 1: - raise ValueError("The padding character must have display width 1") + msg = "The padding character must have display width 1" + raise ValueError(msg) text_width = monospaced_width(text) if text_width == -1: diff --git a/pyproject.toml b/pyproject.toml index 7b65007..13aeee9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,12 +40,13 @@ line-length = 100 target-version = "py39" [tool.ruff.lint] -select = ["B", "F", "I", "N", "ANN", "UP", "RUF"] +select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH"] ignore = [ "ANN101", "ANN401", "RUF001", # complains about Unicode characters that belong in my docstrings "RUF002", # complains about Unicode characters that belong in my docstrings + "PIE808", # explicitly starting ranges at 0 sometimes helps with readability ] [tool.ruff.lint.per-file-ignores] diff --git a/scripts/char_data_table.py b/scripts/char_data_table.py index 78a5957..d063d1a 100644 --- a/scripts/char_data_table.py +++ b/scripts/char_data_table.py @@ -17,8 +17,7 @@ class CharData: def sort_key(self) -> tuple[int, str, int]: if self.name.startswith("LATIN "): return (0, self.name, self.codept) - else: - return (1, "", self.codept) + return (1, "", self.codept) SAFE_ENCODINGS = [ @@ 
-56,10 +55,7 @@ def show_char_table(chars: str, byte_min: int = 0, byte_max: int = 0xFF) -> None if byte_min <= byte <= byte_max: info_str = f"{encoding}:{byte:X}" encoding_info.append(info_str) - if encoding_info: - encoding_explanation = encoding_info[0] - else: - encoding_explanation = "???" + encoding_explanation = encoding_info[0] if encoding_info else "???" print(f' "\\N{{{cd.name}}}" # {encoding_explanation}') diff --git a/tests/test_cli.py b/tests/test_cli.py index 0b3d107..a862e31 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,11 +1,12 @@ import os import subprocess +from pathlib import Path import pytest # Get the filename of 'face.txt', an example of mojibake -THIS_DIR = os.path.dirname(__file__) -TEST_FILENAME = os.path.join(THIS_DIR, "face.txt") +THIS_DIR = Path(__file__).parent +TEST_FILENAME = THIS_DIR / "face.txt" CORRECT_OUTPUT = os.linesep.join(["┒(⌣˛⌣)┎", ""]) FAILED_OUTPUT = os.linesep.join( [ @@ -61,6 +62,6 @@ def test_same_file(): def test_stdin(): - with open(TEST_FILENAME, "rb") as infile: + with TEST_FILENAME.open("rb") as infile: output = get_command_output(["ftfy"], stdin=infile) assert output == CORRECT_OUTPUT diff --git a/tests/test_examples_in_json.py b/tests/test_examples_in_json.py index cf99e27..83dcb8e 100644 --- a/tests/test_examples_in_json.py +++ b/tests/test_examples_in_json.py @@ -25,15 +25,15 @@ """ import json -import os +from pathlib import Path import pytest from ftfy import apply_plan, fix_and_explain, fix_encoding_and_explain, fix_text -THIS_DIR = os.path.dirname(__file__) -TEST_FILENAME = os.path.join(THIS_DIR, "test_cases.json") -TEST_DATA = json.load(open(TEST_FILENAME, encoding="utf-8")) +THIS_DIR = Path(__file__).parent +TEST_FILENAME = THIS_DIR / "test_cases.json" +TEST_DATA = json.load(TEST_FILENAME.open(encoding="utf-8")) TESTS_THAT_PASS = [test for test in TEST_DATA if test["expect"] == "pass"] TESTS_THAT_FAIL = [test for test in TEST_DATA if test["expect"] == "fail"] From 
8a55920c5eb1ea00c80ffd8ce107015ed1c0a80e Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Wed, 30 Oct 2024 16:43:44 -0400 Subject: [PATCH 32/34] reorganize test cases --- tests/test-cases/in-the-wild.json | 451 +++++++++++ tests/test-cases/known-failures.json | 70 ++ tests/test-cases/language-names.json | 127 +++ tests/test-cases/negative.json | 216 ++++++ tests/test-cases/synthetic.json | 208 +++++ tests/test_cases.json | 1061 -------------------------- 6 files changed, 1072 insertions(+), 1061 deletions(-) create mode 100644 tests/test-cases/in-the-wild.json create mode 100644 tests/test-cases/known-failures.json create mode 100644 tests/test-cases/language-names.json create mode 100644 tests/test-cases/negative.json create mode 100644 tests/test-cases/synthetic.json delete mode 100644 tests/test_cases.json diff --git a/tests/test-cases/in-the-wild.json b/tests/test-cases/in-the-wild.json new file mode 100644 index 0000000..b40c838 --- /dev/null +++ b/tests/test-cases/in-the-wild.json @@ -0,0 +1,451 @@ +[ + { + "label": "Low-codepoint emoji", + "comment": "From the ancient era before widespread emoji support on Twitter", + "original": "He's Justinâ\u009d¤", + "fixed": "He's Justin❤", + "expect": "pass" + }, + { + "label": "UTF-8 / MacRoman mix-up about smurfs", + "original": "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.", + "fixed": "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.", + "expect": "pass" + }, + { + "label": "Checkmark that almost looks okay as mojibake", + "original": "✔ No problems", + "fixed": "✔ No problems", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 Russian mixup about futbol", + "original": "РґРѕСЂРѕРіРµ Р\u0098Р·-РїРѕРґ #футбол", + "fixed": "дороге Из-под #футбол", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in German", + "original": "\u0084Handwerk bringt dich überall hin\u0093: Von der YOU bis nach 
Monaco", + "fixed-encoding": "„Handwerk bringt dich überall hin“: Von der YOU bis nach Monaco", + "fixed": "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup of the replacement character", + "original": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", + "fixed": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", + "expect": "pass" + }, + { + "label": "CESU-8 / Windows-1252 emoji", + "original": "Hi guys í ½í¸\u008d", + "fixed": "Hi guys 😍", + "expect": "pass" + }, + { + "label": "CESU-8 / Latin-1 emoji", + "original": "hihi RT username: â\u0098ºí ½í¸\u0098", + "fixed": "hihi RT username: ☺😘", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in Turkish", + "original": "Beta Haber: Hırsızı Büyü Korkuttu", + "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in İstanbul (issue #192)", + "original": "İstanbul", + "fixed": "İstanbul", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in German (issue #188)", + "original": "RUF MICH ZURÜCK", + "fixed": "RUF MICH ZURÜCK", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", + "original": "RÄ«ga", + "fixed": "Rīga", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 mixed up twice in Russian", + "original": "приятности. РІСњВ¤", + "fixed": "приятности. 
❤", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up twice in Malay", + "original": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â\u009d.", + "fixed-encoding": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romance”.", + "fixed": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up twice in naming Iggy Pop", + "original": "Iggy Pop (né Jim Osterberg)", + "fixed": "Iggy Pop (né Jim Osterberg)", + "expect": "pass" + }, + { + "label": "Left quote is UTF-8, right quote is Latin-1, both encoded in Windows-1252", + "original": "Direzione Pd, ok â\u0080\u009csenza modifiche\u0094 all'Italicum.", + "fixed-encoding": "Direzione Pd, ok “senza modifiche” all'Italicum.", + "fixed": "Direzione Pd, ok \"senza modifiche\" all'Italicum.", + "expect": "pass" + }, + { + "label": "UTF-8 / sloppy Windows-1252 mixed up twice in a triumphant emoticon", + "original": "selamat berpuasa sob (Ã\u00a0¸‡'̀⌣'ÃŒÂ\u0081)Ã\u00a0¸‡", + "fixed": "selamat berpuasa sob (ง'̀⌣'́)ง", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up three times", + "original": "The Mona Lisa doesn’t have eyebrows.", + "fixed-encoding": "The Mona Lisa doesn’t have eyebrows.", + "fixed": "The Mona Lisa doesn't have eyebrows.", + "expect": "pass" + }, + { + "label": "UTF-8 / Codepage 437 mixup in Russian", + "original": "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡", + "fixed": "#правильноепитание", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in French", + "original": "Hôtel de Police", + "fixed": "Hôtel de Police", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1250 mixup in French", + "original": "Liège Avenue de l'HĂ´pital", + "fixed": "Liège Avenue de l'Hôpital", + "expect": "pass" + }, + { + "label": 
"UTF-8 / Windows-1252 mixup in Vietnamese", + "original": "Tại sao giá hạt sầu riêng lại lên giá?", + "fixed": "Tại sao giá hạt sầu riêng lại lên giá?", + "expect": "pass" + }, + { + "label": "Science! Mid-word Greek letter gets fixed correctly", + "original": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", + "fixed": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", + "expect": "pass" + }, + { + "label": "For goodness' sake. We can come close to fixing this, but fail in the last step", + "original": "ItÃ?¢â?¬â?¢s classic. ItÃ?¢â?¬â?¢s epic. ItÃ?¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?¢â?¬â?¢ sake!", + "fixed": "It�¢��s classic. It�¢��s epic. It�¢��s ELIZABETH BENNET for goodness�¢�� sake!", + "expect": "pass" + }, + { + "label": "lossy UTF-8 / Windows-1250 mixup in Spanish", + "original": "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂ­a", + "fixed": "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía", + "expect": "pass" + }, + { + "label": "UTF-8 / sloppy Windows-1250 mixup in English", + "original": "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", + "expect": "pass" + }, + { + "label": "The same text as above, but as a UTF-8 / ISO-8859-2 mixup", + "original": "It was namedÂ\u00a0â\u0080\u009escars´ stonesâ\u0080\u009c after the rock-climbers who got hurt while climbing on it.", + "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing 
on it.", + "expect": "pass" + }, + { + "label": "UTF-8 / ISO-8859-2 mixup in Czech", + "comment": "This says 'I've had enough of the third millennium', which is great because it involves software decisions made in the second", + "original": "MĂĄm dost tĹ\u0099etĂ\u00adho tisĂ\u00adciletĂ\u00ad", + "fixed": "Mám dost třetího tisíciletí", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", + "comment": "A difficult test case that can depend on the order that steps are applied", + "original": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "fixed-encoding": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "fixed": "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "expect": "pass" + }, + { + "label": "UTF-8 / sloppy Windows-1250 mixup in Romanian", + "original": "vedere Ă®nceĹŁoĹźatÄ\u0083", + "fixed": "vedere înceţoşată", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1250 mixup in Slovak", + "original": "NapĂ\u00adšte nám !", + "fixed": "Napíšte nám !", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in Spanish", + "original": "DOS AÑOS", + "fixed": "DOS AÑOS", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 followed by UTF-8 / Windows-1251", + "original": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", + "fixed": "a 
bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", + "expect": "pass" + }, + { + "label": "fancy Unicode crossing-out, but mojibaked", + "original": "hotel $49 $̶6̶3̶ updated 2018", + "fixed": "hotel $49 $̶6̶3̶ updated 2018", + "expect": "pass" + }, + { + "label": "A face with UTF-8 / sloppy Windows-1252 mixed up twice", + "original": "ââ€\u009d’(⌣˛⌣)ââ€\u009dŽ", + "fixed": "┒(⌣˛⌣)┎", + "expect": "pass" + }, + { + "label": "We can mostly decode the face above when we lose the character U+009D", + "original": "ââ€�’(⌣˛⌣)ââ€�Ž", + "fixed": "�(⌣˛⌣)�", + "expect": "pass" + }, + { + "label": "Lossy decoding can have plain ASCII question marks, as well", + "original": "The ICR has been upgraded to “bb+â€? from “bbâ€?", + "fixed-encoding": "The ICR has been upgraded to “bb+� from “bb�", + "fixed": "The ICR has been upgraded to \"bb+� from \"bb�", + "expect": "pass" + }, + { + "label": "CESU-8 / Latin-1 mixup over several emoji", + "comment": "You tried", + "original": "I just figured out how to tweet emojis! â\u009a½í\u00a0½í¸\u0080í\u00a0½í¸\u0081í\u00a0½í¸\u0082í\u00a0½í¸\u0086í\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008e", + "fixed": "I just figured out how to tweet emojis! 
⚽😀😁😂😆😎😎😎😎", + "expect": "pass" + }, + { + "label": "An absolutely hopeless garble", + "comment": "If we try too hard to decode this, we'll recursively apply `decode_inconsistent_utf8` until the characters turn into random Han and katakana characters.", + "original": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", + "fixed-encoding": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", + "fixed": "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â", + "expect": "pass" + }, + { + "label": "Inconsistent UTF-8 / Latin-1 mojibake", + "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085", + "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", + "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", + "expect": "pass" + }, + { + "label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set", + "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…", + "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", + "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", + "expect": "pass" + }, + { + "label": "Inconsistent mojibake in Portuguese", + "original": "Campeonatos > III Divisão - Série F > Jornadas Classificação", + "fixed": "Campeonatos > III Divisão - Série F > Jornadas Classificação", + "expect": "pass" + }, + { + "label": "Handle Afrikaans 'n character", + "original": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", + "fixed-encoding": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", + "fixed": "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.", + "expect": "pass" + }, + { + 
"label": "Handle Croatian single-codepoint digraphs", + "original": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "fixed-encoding": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "fixed": "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "expect": "pass" + }, + { + "label": "A with an acute accent, in isolation", + "original": "Nicolás", + "fixed": "Nicolás", + "expect": "pass" + }, + { + "label": "sharp S, in isolation, via MacRoman encoding", + "comment": "regression reported in issue #186", + "original": "wei√ü", + "fixed": "weiß", + "expect": "pass" + }, + { + "label": "French example containing non-breaking spaces", + "original": "ART TRIP Ã\u00a0 l'office de tourisme", + "fixed": "ART TRIP à l'office de tourisme", + "expect": "pass" + }, + { + "label": "English example in UTF-8 / Windows-1251 with a ligature", + "original": "This is signiп¬Ѓcantly lower than the respective share", + "fixed-encoding": "This is significantly lower than the respective share", + "fixed": "This is significantly lower than the respective share", + "expect": "pass" + }, + { + "label": "'à' remains its own word, even if spaces after it get coalesced into one", + "original": "à perturber la réflexion des théologiens jusqu'à nos jours", + "fixed": "à perturber la réflexion des théologiens jusqu'à nos jours", + "expect": "pass" + }, + { + "label": "Fix 'à' in inconsistent mojibake", + "original": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", + "fixed-encoding": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", + "fixed": "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation", + "expect": "pass" + }, + { + "label": "The Portuguese word 'às' does not become 'à 
s' due to the French fix", + "original": "com especial atenção à s crianças", + "fixed": "com especial atenção às crianças", + "expect": "pass" + }, + { + "label": "This is why we require a space after the 's' in 'às'", + "original": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", + "fixed": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", + "expect": "pass" + }, + { + "label": "We can fix 'à' in windows-1251 sometimes as well", + "original": "La rГ©gion de Dnepropetrovsk se trouve Г l’ouest de l’Ukraine", + "fixed-encoding": "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine", + "fixed": "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine", + "expect": "pass" + }, + { + "label": "'à quele' is the Portuguese word 'àquele', not 'à quele'", + "original": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante à quele observado nas lesões por imunocomplexo em excesso de anticorpos", + "fixed": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos", + "expect": "pass" + }, + { + "label": "A complex, lossy pile-up of mojibake in Portuguese", + "original": "â € ðŸ“� Regulamento: â € âš ï¸� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. âš ï¸� Serão válidos os comentários postados até 16h, do dia 31/03/2018. 
E o resultado será divulgado até à s 19h do mesmo dia em uma nova publicação em nosso instagram. â € Boa sorte!!! 😀ðŸ�°", + "fixed": "⠀ �\u00a0Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!!\u00a0😀�", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in Gaelic involving non-breaking spaces", + "original": "CÃ\u00a0nan nan GÃ\u00a0idheal", + "fixed": "Cànan nan Gàidheal", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 mixup in tweet spam", + "original": "Blog Traffic Tip 2 – Broadcast Email Your Blog", + "fixed": "Blog Traffic Tip 2 – Broadcast Email Your Blog", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 mixup", + "original": "S&P Confirms Ukrsotsbank’s “B-“ Rating", + "fixed-encoding": "S&P Confirms Ukrsotsbank’s “B-“ Rating", + "fixed": "S&P Confirms Ukrsotsbank's \"B-\" Rating", + "expect": "pass" + }, + { + "label": "Dutch example with ë", + "comment": "from issue reported by MicroJackson", + "original": "ongeëvenaard", + "fixed-encoding": "ongeëvenaard", + "fixed": "ongeëvenaard", + "expect": "pass" + }, + { + "label": "HTML entity on top of UTF-8 / Latin-1", + "original": "10μs", + "fixed-encoding": "10μs", + "fixed": "10μs", + "expect": "pass" + }, + { + "label": "Three layers of UTF-8 / MacRoman mixup in French", + "comment": "You're welcome", + "original": "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8", + "fixed": "Merci de télécharger le plug-in Flash Player 8", + "expect": "pass" + }, + { + "label": "UTF-8 / MacRoman mixup in French", + "original": "Merci de 
bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ķ", + "fixed": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…", + "expect": "pass" + }, + { + "label": "Italian UTF-8 / MacRoman example with ò", + "original": "Le Vigne di Zam√≤", + "fixed": "Le Vigne di Zamò", + "expect": "pass" + }, + { + "label": "Punctuation pile-up should actually be musical notes", + "original": "Engkau masih yg terindah, indah di dalam hatiku♫~", + "fixed": "Engkau masih yg terindah, indah di dalam hatiku♫~", + "expect": "pass" + }, + { + "label": "Latvian UTF-8 / Windows-1257 mojibake", + "original": "Å veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", + "fixed": "Šveices baņķieri gaida konkrētus investīciju projektus", + "expect": "pass" + }, + { + "label": "Latvian UTF-8 / MacRoman mojibake", + "original": "SaeimƒÅ ievƒìlƒìtƒÅs partijas \"Progresƒ´vie\" lƒ´dzvadƒ´tƒÅja Anto≈Üina ≈Öena≈°eva atbild uz ≈æurnƒÅlistu jautƒÅjumiem pƒìc partijas tik≈°anƒÅs ar Valsts prezidentu Rƒ´gas pilƒ´,", + "fixed": "Saeimā ievēlētās partijas \"Progresīvie\" līdzvadītāja Antoņina Ņenaševa atbild uz žurnālistu jautājumiem pēc partijas tikšanās ar Valsts prezidentu Rīgas pilī,", + "expect": "pass" + }, + { + "label": "Lithuanian UTF-8 / Windows-1257 mojibake", + "original": "Å iaip ÄÆdomu, kaip ÄÆsivaizduoji. Visų pirma tam reikia laiko.", + "fixed": "Šiaip įdomu, kaip įsivaizduoji. 
Visų pirma tam reikia laiko.", + "expect": "pass" + }, + { + "label": "Lithuanian UTF-8 / Windows-1250 mojibake", + "original": "Lietuva pagrÄŻstai gali paklausti: Ĺ˝inoma, kad ne.", + "fixed": "Lietuva pagrįstai gali paklausti: Žinoma, kad ne.", + "expect": "pass" + }, + { + "label": "Hebrew UTF-8 / Windows-1252 mojibake", + "comment": "reported by SuperIRabbit as issue #158", + "original": "בהודעה", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Wide comma in UTF-8 / Windows-1252", + "original": "Ningbo,China", + "fixed-encoding": "Ningbo,China", + "fixed": "Ningbo,China", + "expect": "pass" + } +] \ No newline at end of file diff --git a/tests/test-cases/known-failures.json b/tests/test-cases/known-failures.json new file mode 100644 index 0000000..2663d9f --- /dev/null +++ b/tests/test-cases/known-failures.json @@ -0,0 +1,70 @@ +[ + { + "label": "Misleading mix-up in Spanish", + "comment": "The original text has mojibake, but the sequence 'á \u0093' can decode as U+1813 MONGOLIAN DIGIT THREE, when the whole string should really just decode as a Latin-1/Windows-1252 mixup", + "original": "tiene demora y está \u0093próximo a resolverse\u0094", + "fixed": "tiene demora y está \"próximo a resolverse\"", + "expect": "fail" + }, + { + "label": "Two levels of inconsistent mojibake", + "comment": "The en-dash was mojibaked in UTF-8 / Windows-1252 as three characters, two of which were mojibaked again as Windows-1252 / Latin-1, and the third of which was mojibaked as UTF-8 / Latin-1. 
Unfortunately, if we fix this, we leave ourselves room to greedily 'decode' random Han characters in complex Latin-alphabet mojibake", + "original": "Arsenal v Wolfsburg: pre-season friendly â\u0080â\u0080\u009c live!", + "fixed": "Arsenal v Wolfsburg: pre-season friendly – live!", + "expect": "fail" + }, + { + "label": "A-with-grave in Vietnamese", + "comment": "Currently adds extra spaces that shouldn't be there", + "original": "Xem clip hĂ i, phim hĂ i má»›i hay nhất", + "fixed": "Xem clip hài, phim hài mới hay nhất", + "expect": "fail" + }, + { + "label": "Latin-1 / MacRoman mixup in Spanish", + "comment": "Requires something like encoding detection", + "original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.", + "fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.", + "expect": "fail" + }, + { + "label": "subtle UTF-8 / codepage 437 mixup in Spanish", + "original": "┬┐que diferencia hay?", + "fixed": "¿que diferencia hay?", + "expect": "fail" + }, + { + "label": "Latin-1 / MacRoman mixup in Spanish, 2 characters", + "comment": "Requires something like encoding detection", + "original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente", + "fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente", + "expect": "fail" + }, + { + "label": "An example with 'à' in windows-1251 where we need our heuristic to be bolder", + "original": "faites attention Г bien vous renseigner avant sur le mГ©dicament", + "fixed": "faites attention à bien vous renseigner avant sur le médicament", + "expect": "fail" + }, + { + "label": "Italian UTF-8 / MacRoman mojibake that looks like math", + "comment": "False negative: 'pi√π' is a bit too reasonable to fix", + "original": "Sarai ricontattato dal nostro Esperto al pi√π presto.", + "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", + "expect": "fail" + }, + { + "label": "Synthetic: Incomplete UTF-8 / Windows-1252 mixup in 
Arabic", + "comment": "I find text like this in OSCAR a fair amount, but couldn't isolate a good example that tested digits. The intended text means 'more than 100 countries'.", + "original": "أكثر من Ù Ù Ù¡ بلد", + "fixed": "أكثر من ٠٠١ بلد", + "expect": "fail" + }, + { + "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", + "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. Searching for similar real text yields a lot of examples that actually come out fine.", + "original": "MISUTÂ\u00a0AJIKKO", + "fixed": "MISUTÂ\u00a0AJIKKO", + "expect": "fail" + } +] \ No newline at end of file diff --git a/tests/test-cases/language-names.json b/tests/test-cases/language-names.json new file mode 100644 index 0000000..cdb8241 --- /dev/null +++ b/tests/test-cases/language-names.json @@ -0,0 +1,127 @@ +[ + { + "label": "Messy language names: Czech", + "comment": "This and several following examples came from the same language selector", + "original": "ÄŒeÅ¡tina", + "fixed": "Čeština", + "expect": "pass" + }, + { + "label": "Messy language names: Gaelic", + "comment": "note that if U+A0 is replaced by a space, it comes out slightly incorrectly as 'Gà idhlig'", + "original": "GÃ\u00a0idhlig", + "fixed": "Gàidhlig", + "expect": "pass" + }, + { + "label": "Messy language names: Lithuanian", + "original": "Lietuvių", + "fixed": "Lietuvių", + "expect": "pass" + }, + { + "label": "Messy language names: Slovak", + "original": "SlovenÄ�ina", + "fixed": "Sloven�ina", + "expect": "pass" + }, + { + "label": "Messy language names: Vietnamese", + "original": "Tiếng Việt", + "fixed": "Tiếng Việt", + "expect": "pass" + }, + { + "label": "Messy language names: Greek", + "original": "Ελληνικά", + "fixed": "Ελληνικά", + "expect": "pass" + }, + { + "label": "Messy language names: Bulgarian", + "original": "българÑ�ки език", + "fixed": "българ�ки език", + "expect": "pass" + }, + { + 
"label": "Messy language names: Russian", + "original": "РуÑ�Ñ�кий", + "fixed": "Ру��кий", + "expect": "pass" + }, + { + "label": "Messy language names: Serbian [Cyrillic]", + "original": "CрпÑ�ки [ћирилицом]", + "fixed": "Cрп�ки [ћирилицом]", + "expect": "pass" + }, + { + "label": "Messy language names: Hebrew", + "original": "עברית", + "fixed": "עברית", + "expect": "pass" + }, + { + "label": "Messy language names: Russian", + "original": "РуÑ�Ñ�кий", + "fixed": "Ру��кий", + "expect": "pass" + }, + { + "label": "Messy language names: Hindi", + "comment": "My terminal has difficulty rendering the mostly-fixed text", + "original": "हिनà¥�दी", + "fixed": "\u0939\u093f\u0928\ufffd\u0926\u0940", + "expect": "pass" + }, + { + "label": "Messy language names: Tamil", + "comment": "My terminal has difficulty rendering the mostly-fixed text", + "original": "தமிழà¯�", + "fixed": "\u0ba4\u0bae\u0bbf\u0bb4\ufffd", + "expect": "pass" + }, + { + "label": "Messy language names: Thai", + "original": "ภาษาไทย", + "fixed": "ภาษาไทย", + "expect": "pass" + }, + { + "label": "Messy language names: Simplified Chinese", + "original": "简体ä¸\u00adæ–‡", + "fixed": "简体中文", + "expect": "pass" + }, + { + "label": "Messy language names: Traditional Chinese", + "original": "æ\u00ad£é«”ä¸\u00adæ–‡", + "fixed": "正體中文", + "expect": "pass" + }, + { + "label": "Messy language names: Japanese", + "original": "日本語", + "fixed": "日本語", + "expect": "pass" + }, + { + "label": "Messy language names: Korean", + "original": "한êµ\u00adì–´", + "fixed": "한국어", + "expect": "pass" + }, + { + "label": "Messy language name in cp437: Czech", + "comment": "A synthetic example, I suppose, but goes with the other language name tests", + "original": "─îe┼ítina", + "fixed": "Čeština", + "expect": "pass" + }, + { + "label": "Messy language name in cp437: Vietnamese", + "original": "Tiß║┐ng Viß╗çt", + "fixed": "Tiếng Việt", + "expect": "pass" + } +] \ No newline at end of file diff --git a/tests/test-cases/negative.json 
b/tests/test-cases/negative.json new file mode 100644 index 0000000..dc1e36b --- /dev/null +++ b/tests/test-cases/negative.json @@ -0,0 +1,216 @@ +[ + { + "label": "Negative: Using diaereses as quotation marks in Greek", + "comment": "Examples in this file might be detected as mojibake-like, but should not be changed", + "original": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", + "fixed": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", + "expect": "pass" + }, + { + "label": "Negative: Don't fix a multiplication symbol in quotes", + "original": "higher values (“+” and “×” curves) in the superficial region", + "fixed-encoding": "higher values (“+” and “×” curves) in the superficial region", + "fixed": "higher values (\"+\" and \"×\" curves) in the superficial region", + "expect": "pass" + }, + { + "label": "Sort of negative: this inconsistent mojibake could be Latin-1 or MacRoman, and it was meant to be Latin-1, but it's safest to not decode it as either", + "comment": "issue #202", + "original": "Bremer/Mccoy – DrÃ¥ber", + "fixed": "Bremer/Mccoy – DrÃ¥ber", + "expect": "pass" + }, + { + "label": "Negative: 'è' preceded by a non-breaking space is not a small capital Y", + "original": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", + "fixed": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", + "expect": "pass" + }, + { + "label": "Negative: multiplication sign and ellipsis", + "comment": "Should not turn into a dot below", + "original": "4288×…", + "fixed": "4288×…", + "expect": "pass" + }, + { + "label": "Negative: accents are sometimes used as quotes", + "comment": "Under a previous heuristic, this tested the CESU-8 decoder, which would try to decode it and fail when it hit the end of the string", + "original": "``toda produzida pronta pra assa aí´´", + "fixed": "``toda produzida pronta pra assa aí´´", + "expect": "pass" + }, + { + "label": "Negative: 'Õ' followed by an ellipsis", + 
"comment": "Should not turn into the Armenian letter Յ", + "original": "HUHLL Õ…", + "fixed": "HUHLL Õ…", + "expect": "pass" + }, + { + "label": "Negative: 'Ê' followed by an ellipsis", + "comment": "Should not turn into a squat reversed esh", + "original": "RETWEET SE VOCÊ…", + "fixed": "RETWEET SE VOCÊ…", + "expect": "pass" + }, + { + "label": "Negative: 'É' followed by an ellipsis", + "comment": "Should not turn into 'MARQUɅ'", + "original": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", + "fixed": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", + "expect": "pass" + }, + { + "label": "Negative: 'Ó' followed by an ellipsis", + "comment": "Should not turn into 'SӅ'", + "original": "TEM QUE SEGUIR, SDV SÓ…", + "fixed": "TEM QUE SEGUIR, SDV SÓ…", + "expect": "pass" + }, + { + "label": "Negative: 'É' followed by a curly apostrophe", + "comment": "Should not turn into 'ZZAJɒs'", + "original": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", + "fixed-encoding": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", + "fixed": "Join ZZAJÉ's Official Fan List and receive news, events, and more!", + "expect": "pass" + }, + { + "label": "Negative: 'é' preceded by curly apostrophe", + "comment": "Should not turn into 'LՎpisode'", + "original": "L’épisode 8 est trop fou ouahh", + "fixed-encoding": "L’épisode 8 est trop fou ouahh", + "fixed": "L'épisode 8 est trop fou ouahh", + "expect": "pass" + }, + { + "label": "Negative: three raised eyebrows or something?", + "comment": "Should not turn into private use character U+F659", + "original": "Ôôô VIDA MINHA", + "fixed": "Ôôô VIDA MINHA", + "expect": "pass" + }, + { + "label": "Negative: copyright sign preceded by non-breaking space", + "comment": "Should not turn into 'ʩ'", + "original": "[x]\u00a0©", + "fixed": "[x]\u00a0©", + "expect": "pass" + }, + { + "label": "Negative: en dash and infinity sign", + "comment": "Should not turn into '2012Ѱ'", + "original": "2012—∞", + "fixed": "2012—∞", + 
"expect": "pass" + }, + { + "label": "Negative: This Е is a Ukrainian letter, but nothing else is wrong", + "original": "SENSЕ - Oleg Tsedryk", + "fixed": "SENSЕ - Oleg Tsedryk", + "expect": "pass" + }, + { + "label": "Negative: angry face", + "comment": "The face should not turn into '`«'", + "original": "OK??:( `¬´ ):", + "fixed": "OK??:( `¬´ ):", + "expect": "pass" + }, + { + "label": "Negative, synthetic: face with glasses and a raised eyebrow", + "original": "( o¬ô )", + "fixed": "( o¬ô )", + "expect": "pass" + }, + { + "label": "Negative: triangle and degree sign", + "comment": "I'm not really sure what it *is* supposed to be, but it's not 'ơ'", + "original": "∆°", + "fixed": "∆°", + "expect": "pass" + }, + { + "label": "Negative: Portuguese with inverted question mark", + "comment": "Former false positive - it should not turn into 'QUEM ɿ'", + "original": "ESSE CARA AI QUEM É¿", + "fixed": "ESSE CARA AI QUEM É¿", + "expect": "pass" + }, + { + "label": "Negative: Portuguese with acute accents as quotation marks", + "comment": "Former false positive - the end should not turn into a superscript H", + "original": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", + "fixed": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", + "expect": "pass" + }, + { + "label": "Negative: Finnish Ä followed by a non-breaking space", + "comment": "Former false positive - should not become a G with a dot", + "original": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", + "fixed": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", + "expect": "pass" + }, + { + "label": "Negative: multiplying by currency", + "comment": "Former false positive - should not become the Hebrew letter 'final pe'", + "original": "Offering 5×£35 pin ups", + "fixed": "Offering 5×£35 pin ups", + "expect": "pass" + }, + { + "label": "Negative: registered chocolate brand name", + "comment": "Former false positive - should not become the IPA letter 'lezh'", + "original": "NESTLÉ® requiere contratar 
personal para diferentes areas a nivel nacional e internacional", + "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", + "expect": "pass" + }, + { + "label": "Negative: it looks like Windows-1257 mojibake but someone writes their name this way", + "comment": "Should not become a cedilla", + "original": "Connect with Āø on Facebook", + "fixed": "Connect with Āø on Facebook", + "expect": "pass" + }, + { + "label": "Mostly negative: we only need to fix C1 control characters", + "comment": "We should not decode 'é\u0085 ' as '酠'", + "original": "C'est vrai que nous n'en avons pas encore beaucoup parlé\u0085 Tu sais, ça fait de nombreuses années", + "fixed": "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années", + "expect": "pass" + }, + { + "label": "Negative: We don't fix à in all contexts", + "original": "C O N C L U S à O", + "fixed": "C O N C L U S à O", + "expect": "pass" + }, + { + "label": "Negative: Two concatenated strings", + "comment": "Should not turn into 'fratarak᧠141'", + "original": "Oborzos, per. Vahbarz, frataraká§ 141", + "fixed": "Oborzos, per. Vahbarz, frataraká§ 141", + "expect": "pass" + }, + { + "label": "Negative: Indonesian leetspeak", + "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", + "fixed": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", + "expect": "pass" + }, + { + "label": "Negative: math in Unicode", + "comment": "This isn't mojibake, it's an actual equation", + "original": "(-1/2)! = √π", + "fixed": "(-1/2)! 
= √π", + "expect": "pass" + }, + { + "label": "Negative: Leet line-art", + "comment": "The heuristic before v6 loved to 'fix' this and decode it as 'ôaſaſaſaſa'", + "original": "├┤a┼┐a┼┐a┼┐a┼┐a", + "fixed": "├┤a┼┐a┼┐a┼┐a┼┐a", + "expect": "pass" + } +] \ No newline at end of file diff --git a/tests/test-cases/synthetic.json b/tests/test-cases/synthetic.json new file mode 100644 index 0000000..a939311 --- /dev/null +++ b/tests/test-cases/synthetic.json @@ -0,0 +1,208 @@ +[ + { + "label": "Synthetic: we can recognize à in some cases when it's the only mojibake", + "comment": "Examples in this file were made up to test something, instead of found in the wild", + "original": "voilà le travail", + "fixed": "voilà le travail", + "expect": "pass" + }, + { + "label": "Synthetic: we can recognize à at the end of a word when it absorbs a following space", + "original": "voilà le travail", + "fixed": "voilà le travail", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", + "original": "בהודעה", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / MacRoman mojibake", + "original": "◊ë◊î◊ï◊ì◊¢◊î", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / Latin-1 mojibake", + "comment": "This example uses low-numbered codepoints to spell 'ABBA' in Hebrew, so that it falls into the range where Latin-1 is different from Windows-1252. 
As a bonus, this example looks right even if your RTL text rendering isn't working.", + "original": "×\u0090×\u0091×\u0091×\u0090", + "fixed": "אבבא", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / Windows-1252 mojibake", + "original": "رسالة", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / Windows-1250 mojibake", + "original": "رسالة", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / MacRoman mojibake", + "original": "ÿ±ÿ≥ÿߟÑÿ©", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Brontë's name does not end with a Korean syllable", + "comment": "The original example of why ftfy needs heuristics", + "original": "I'm not such a fan of Charlotte Brontë…”", + "fixed-encoding": "I'm not such a fan of Charlotte Brontë…”", + "fixed": "I'm not such a fan of Charlotte Brontë…\"", + "expect": "pass" + }, + { + "label": "Synthetic, negative: hypothetical Swedish product name", + "comment": "This used to be a constructed example of a false positive, until you added another symbol", + "original": "AHÅ™, the new sofa from IKEA", + "fixed": "AHÅ™, the new sofa from IKEA", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Ukrainian capital letters", + "comment": "We need to fix Windows-1251 conservatively, or else this decodes as '²ʲ'", + "original": "ВІКІ is Ukrainian for WIKI", + "fixed": "ВІКІ is Ukrainian for WIKI", + "expect": "pass" + }, + { + "label": "Synthetic, negative: don't leak our internal use of byte 0x1A", + "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", + "original": "These control characters \u001a are apparently intentional \u0081", + "fixed-encoding": "These control characters \u001a are apparently intentional \u0081", + "fixed": "These control characters are apparently intentional \u0081", + "expect": "pass" + }, + { + "label": "Synthetic, 
negative: U+1A on its own", + "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", + "original": "Here's a control character: \u001a", + "fixed-encoding": "Here's a control character: \u001a", + "fixed": "Here's a control character: ", + "expect": "pass" + }, + { + "label": "Synthetic, negative: A-with-circle as an Angstrom sign", + "comment": "Should not turn into '10 ŗ'", + "original": "a radius of 10 Å—", + "fixed": "a radius of 10 Å—", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Spanish with exclamation points on the wrong sides", + "original": "!YO SÉ¡", + "fixed": "!YO SÉ¡", + "expect": "pass" + }, + { + "label": "Synthetic: fix text with backslashes in it", + "comment": "Tests for a regression on a long-ago bug", + "original": "<40\\% vs \u00e2\u0089\u00a540\\%", + "fixed": "<40\\% vs ≥40\\%", + "expect": "pass" + }, + { + "label": "Synthetic: curly quotes with mismatched encoding glitches in Latin-1", + "original": "\u00e2\u0080\u009cmismatched quotes\u0085\u0094", + "fixed-encoding": "“mismatched quotes…”", + "fixed": "\"mismatched quotes…\"", + "expect": "pass" + }, + { + "label": "Synthetic: curly quotes with mismatched encoding glitches in Windows-1252", + "original": "“mismatched quotes…”", + "fixed-encoding": "“mismatched quotes…”", + "fixed": "\"mismatched quotes…\"", + "expect": "pass" + }, + { + "label": "Synthetic: lossy decoding in sloppy-windows-1252", + "original": "“lossy decodingâ€�", + "fixed-encoding": "“lossy decoding�", + "fixed": "\"lossy decoding�", + "expect": "pass" + }, + { + "label": "Synthetic: French word for August in windows-1252", + "original": "août", + "fixed-encoding": "août", + "fixed": "août", + "expect": "pass" + }, + { + "label": "Synthetic: French word for hotel in all-caps windows-1252", + "original": "HÔTEL", + "fixed-encoding": "HÔTEL", + "fixed": "HÔTEL", + "expect": "pass" + }, + { + "label": "Synthetic: Scottish 
Gaelic word for 'subject' in all-caps windows-1252", + "original": "CÙIS", + "fixed-encoding": "CÙIS", + "fixed": "CÙIS", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Romanian word before a non-breaking space", + "comment": "The word literally means 'not even once', which might be a good recommendation about fixing Romanian mojibake", + "original": "NICIODATĂ\u00a0", + "fixed": "NICIODATĂ\u00a0", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Be careful around curly apostrophes", + "comment": "It shouldn't end up saying 'a lot of Òs'", + "original": "There are a lot of Ã’s in mojibake text", + "fixed-encoding": "There are a lot of Ã’s in mojibake text", + "fixed": "There are a lot of Ã's in mojibake text", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Romanian word before a trademark sign", + "comment": "We would change 'DATÙ' to 'DATÙ' if it passed the badness heuristic", + "original": "NICIODATĂ™", + "fixed": "NICIODATĂ™", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Lithuanian word before a trademark sign", + "comment": "Similar to the above example. Shouldn't turn into U+0619 ARABIC SMALL DAMMA", + "original": "TRANSFORMATORIŲ™", + "fixed": "TRANSFORMATORIŲ™", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Norwegian capitalized nonsense", + "comment": "We're shouting that the island of Håøya is gullible. It should not turn into 'HŨYA ER BLŨYD'.", + "original": "HÅØYA ER BLÅØYD", + "fixed": "HÅØYA ER BLÅØYD", + "expect": "pass" + }, + { + "label": "Synthetic, negative: raised eyebrow kaomoji", + "original": "Ō¬o", + "fixed": "Ō¬o", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Camel-cased Serbian that looks like a UTF-8 / Windows-1251 mixup", + "comment": "I made this text up, but it seems like it means 'HelloDevil'. 
Could be a username or something.", + "original": "ПоздравЂаво", + "fixed": "ПоздравЂаво", + "expect": "pass" + }, + { + "label": "Synthetic: mojibake with trademark sign at the end of a word", + "comment": "I recall the correct version of this text from a sign in the movie Amélie. Now we can help her twin Amélie, who makes mojibaked signs.", + "original": "OÙ ET QUAND?", + "fixed": "OÙ ET QUAND?", + "expect": "pass" + } +] \ No newline at end of file diff --git a/tests/test_cases.json b/tests/test_cases.json deleted file mode 100644 index 005dab2..0000000 --- a/tests/test_cases.json +++ /dev/null @@ -1,1061 +0,0 @@ -[ - { - "label": "Messy language names: Czech", - "comment": "This and several following examples came from the same language selector", - "original": "ÄŒeÅ¡tina", - "fixed": "Čeština", - "expect": "pass" - }, - { - "label": "Messy language names: Gaelic", - "comment": "note that if U+A0 is replaced by a space, it comes out slightly incorrectly as 'Gà idhlig'", - "original": "GÃ\u00a0idhlig", - "fixed": "Gàidhlig", - "expect": "pass" - }, - { - "label": "Messy language names: Lithuanian", - "original": "Lietuvių", - "fixed": "Lietuvių", - "expect": "pass" - }, - { - "label": "Messy language names: Slovak", - "original": "SlovenÄ�ina", - "fixed": "Sloven�ina", - "expect": "pass" - }, - { - "label": "Messy language names: Vietnamese", - "original": "Tiếng Việt", - "fixed": "Tiếng Việt", - "expect": "pass" - }, - { - "label": "Messy language names: Greek", - "original": "Ελληνικά", - "fixed": "Ελληνικά", - "expect": "pass" - }, - { - "label": "Messy language names: Bulgarian", - "original": "българÑ�ки език", - "fixed": "българ�ки език", - "expect": "pass" - }, - { - "label": "Messy language names: Russian", - "original": "РуÑ�Ñ�кий", - "fixed": "Ру��кий", - "expect": "pass" - }, - { - "label": "Messy language names: Serbian [Cyrillic]", - "original": "CрпÑ�ки [ћирилицом]", - "fixed": "Cрп�ки [ћирилицом]", - "expect": "pass" - }, - { - "label": "Messy 
language names: Hebrew", - "original": "עברית", - "fixed": "עברית", - "expect": "pass" - }, - { - "label": "Messy language names: Russian", - "original": "РуÑ�Ñ�кий", - "fixed": "Ру��кий", - "expect": "pass" - }, - { - "label": "Messy language names: Hindi", - "comment": "My terminal has difficulty rendering the mostly-fixed text", - "original": "हिनà¥�दी", - "fixed": "\u0939\u093f\u0928\ufffd\u0926\u0940", - "expect": "pass" - }, - { - "label": "Messy language names: Tamil", - "comment": "My terminal has difficulty rendering the mostly-fixed text", - "original": "தமிழà¯�", - "fixed": "\u0ba4\u0bae\u0bbf\u0bb4\ufffd", - "expect": "pass" - }, - { - "label": "Messy language names: Thai", - "original": "ภาษาไทย", - "fixed": "ภาษาไทย", - "expect": "pass" - }, - { - "label": "Messy language names: Simplified Chinese", - "original": "简体ä¸\u00adæ–‡", - "fixed": "简体中文", - "expect": "pass" - }, - { - "label": "Messy language names: Traditional Chinese", - "original": "æ\u00ad£é«”ä¸\u00adæ–‡", - "fixed": "正體中文", - "expect": "pass" - }, - { - "label": "Messy language names: Japanese", - "original": "日本語", - "fixed": "日本語", - "expect": "pass" - }, - { - "label": "Messy language names: Korean", - "original": "한êµ\u00adì–´", - "fixed": "한국어", - "expect": "pass" - }, - { - "label": "Synthetic: Messy language name in cp437: Czech", - "original": "─îe┼ítina", - "fixed": "Čeština", - "expect": "pass" - }, - { - "label": "Synthetic: Messy language name in cp437: Vietnamese", - "original": "Tiß║┐ng Viß╗çt", - "fixed": "Tiếng Việt", - "expect": "pass" - }, - { - "label": "Low-codepoint emoji", - "comment": "From the ancient era before widespread emoji support on Twitter", - "original": "He's Justinâ\u009d¤", - "fixed": "He's Justin❤", - "expect": "pass" - }, - { - "label": "UTF-8 / MacRoman mix-up about smurfs", - "original": "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.", - "fixed": "Le Schtroumpf Docteur conseille gâteaux et baies 
schtroumpfantes pour un régime équilibré.", - "expect": "pass" - }, - { - "label": "Checkmark that almost looks okay as mojibake", - "original": "✔ No problems", - "fixed": "✔ No problems", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1251 Russian mixup about futbol", - "original": "РґРѕСЂРѕРіРµ Р\u0098Р·-РїРѕРґ #футбол", - "fixed": "дороге Из-под #футбол", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in German", - "original": "\u0084Handwerk bringt dich überall hin\u0093: Von der YOU bis nach Monaco", - "fixed-encoding": "„Handwerk bringt dich überall hin“: Von der YOU bis nach Monaco", - "fixed": "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup of the replacement character", - "original": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", - "fixed": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", - "expect": "pass" - }, - { - "label": "CESU-8 / Windows-1252 emoji", - "original": "Hi guys í ½í¸\u008d", - "fixed": "Hi guys 😍", - "expect": "pass" - }, - { - "label": "CESU-8 / Latin-1 emoji", - "original": "hihi RT username: â\u0098ºí ½í¸\u0098", - "fixed": "hihi RT username: ☺😘", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in Turkish", - "original": "Beta Haber: Hırsızı Büyü Korkuttu", - "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in İstanbul (issue #192)", - "original": "İstanbul", - "fixed": "İstanbul", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in German (issue #188)", - "original": "RUF MICH ZURÜCK", - "fixed": "RUF MICH ZURÜCK", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", - "original": "RÄ«ga", - "fixed": "Rīga", - "expect": "pass" - }, - { - 
"label": "UTF-8 / Windows-1251 mixed up twice in Russian", - "original": "приятности. РІСњВ¤", - "fixed": "приятности. ❤", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up twice in Malay", - "original": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â\u009d.", - "fixed-encoding": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romance”.", - "fixed": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up twice in naming Iggy Pop", - "original": "Iggy Pop (né Jim Osterberg)", - "fixed": "Iggy Pop (né Jim Osterberg)", - "expect": "pass" - }, - { - "label": "Left quote is UTF-8, right quote is Latin-1, both encoded in Windows-1252", - "original": "Direzione Pd, ok â\u0080\u009csenza modifiche\u0094 all'Italicum.", - "fixed-encoding": "Direzione Pd, ok “senza modifiche” all'Italicum.", - "fixed": "Direzione Pd, ok \"senza modifiche\" all'Italicum.", - "expect": "pass" - }, - { - "label": "UTF-8 / sloppy Windows-1252 mixed up twice in a triumphant emoticon", - "original": "selamat berpuasa sob (Ã\u00a0¸‡'̀⌣'ÃŒÂ\u0081)Ã\u00a0¸‡", - "fixed": "selamat berpuasa sob (ง'̀⌣'́)ง", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up three times", - "original": "The Mona Lisa doesn’t have eyebrows.", - "fixed-encoding": "The Mona Lisa doesn’t have eyebrows.", - "fixed": "The Mona Lisa doesn't have eyebrows.", - "expect": "pass" - }, - { - "label": "UTF-8 / Codepage 437 mixup in Russian", - "original": "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡", - "fixed": "#правильноепитание", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in French", - "original": "Hôtel de Police", - "fixed": "Hôtel de Police", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1250 mixup in French", - 
"original": "Liège Avenue de l'HĂ´pital", - "fixed": "Liège Avenue de l'Hôpital", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Vietnamese", - "original": "Tại sao giá hạt sầu riêng lại lên giá?", - "fixed": "Tại sao giá hạt sầu riêng lại lên giá?", - "expect": "pass" - }, - { - "label": "Negative: using diaereses as quotation marks in Greek", - "original": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", - "fixed": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", - "expect": "pass" - }, - { - "label": "Science! Mid-word Greek letter gets fixed correctly", - "original": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", - "fixed": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", - "expect": "pass" - }, - { - "label": "Negative: More science! Don't fix a multiplication symbol in quotes", - "original": "higher values (“+” and “×” curves) in the superficial region", - "fixed-encoding": "higher values (“+” and “×” curves) in the superficial region", - "fixed": "higher values (\"+\" and \"×\" curves) in the superficial region", - "expect": "pass" - }, - { - "label": "For goodness' sake. We can come close to fixing this, but fail in the last step", - "original": "ItÃ?¢â?¬â?¢s classic. ItÃ?¢â?¬â?¢s epic. ItÃ?¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?¢â?¬â?¢ sake!", - "fixed": "It�¢��s classic. It�¢��s epic. 
It�¢��s ELIZABETH BENNET for goodness�¢�� sake!", - "expect": "pass" - }, - { - "label": "lossy UTF-8 / Windows-1250 mixup in Spanish", - "original": "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂ­a", - "fixed": "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía", - "expect": "pass" - }, - { - "label": "UTF-8 / sloppy Windows-1250 mixup in English", - "original": "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", - "expect": "pass" - }, - { - "label": "The same text as above, but as a UTF-8 / ISO-8859-2 mixup", - "original": "It was namedÂ\u00a0â\u0080\u009escars´ stonesâ\u0080\u009c after the rock-climbers who got hurt while climbing on it.", - "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", - "expect": "pass" - }, - { - "label": "UTF-8 / ISO-8859-2 mixup in Czech", - "comment": "This says 'I've had enough of the third millennium', which is great because it involves software decisions made in the second", - "original": "MĂĄm dost tĹ\u0099etĂ\u00adho tisĂ\u00adciletĂ\u00ad", - "fixed": "Mám dost třetího tisíciletí", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", - "comment": "A difficult test case that can depend on the order that steps are applied", - "original": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - 
"fixed-encoding": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - "fixed": "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - "expect": "pass" - }, - { - "label": "Synthetic: Incomplete UTF-8 / Windows-1252 mixup in Arabic", - "comment": "I find text like this in OSCAR a fair amount, but couldn't isolate a good example that tested digits. The intended text means 'more than 100 countries'.", - "original": "أكثر من Ù Ù Ù¡ بلد", - "fixed": "أكثر من ٠٠١ بلد", - "expect": "fail" - }, - { - "label": "UTF-8 / sloppy Windows-1250 mixup in Romanian", - "original": "vedere Ă®nceĹŁoĹźatÄ\u0083", - "fixed": "vedere înceţoşată", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1250 mixup in Slovak", - "original": "NapĂ\u00adšte nám !", - "fixed": "Napíšte nám !", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Spanish", - "original": "DOS AÑOS", - "fixed": "DOS AÑOS", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 followed by UTF-8 / Windows-1251", - "original": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", - "fixed": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", - "expect": "pass" - }, - { - "label": "fancy Unicode crossing-out, but mojibaked", - "original": "hotel $49 $̶6̶3̶ updated 2018", - "fixed": "hotel $49 $̶6̶3̶ updated 2018", - "expect": "pass" - }, - { - "label": "A face with UTF-8 / sloppy Windows-1252 mixed up twice", - "original": "ââ€\u009d’(⌣˛⌣)ââ€\u009dŽ", - "fixed": "┒(⌣˛⌣)┎", - "expect": "pass" - }, - { - "label": "We can mostly 
decode the face above when we lose the character U+009D", - "original": "ââ€�’(⌣˛⌣)ââ€�Ž", - "fixed": "�(⌣˛⌣)�", - "expect": "pass" - }, - { - "label": "Lossy decoding can have plain ASCII question marks, as well", - "original": "The ICR has been upgraded to “bb+â€? from “bbâ€?", - "fixed-encoding": "The ICR has been upgraded to “bb+� from “bb�", - "fixed": "The ICR has been upgraded to \"bb+� from \"bb�", - "expect": "pass" - }, - { - "label": "CESU-8 / Latin-1 mixup over several emoji", - "comment": "You tried", - "original": "I just figured out how to tweet emojis! â\u009a½í\u00a0½í¸\u0080í\u00a0½í¸\u0081í\u00a0½í¸\u0082í\u00a0½í¸\u0086í\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008e", - "fixed": "I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎", - "expect": "pass" - }, - { - "label": "Two levels of inconsistent mojibake", - "comment": "The en-dash was mojibaked in UTF-8 / Windows-1252 as three characters, two of which were mojibaked again as Windows-1252 / Latin-1, and the third of which was mojibaked as UTF-8 / Latin-1. 
Unfortunately, if we fix this, we leave ourselves room to greedily 'decode' random Han characters in complex Latin-alphabet mojibake", - "original": "Arsenal v Wolfsburg: pre-season friendly â\u0080â\u0080\u009c live!", - "fixed": "Arsenal v Wolfsburg: pre-season friendly – live!", - "expect": "fail" - }, - { - "label": "An absolutely hopeless garble", - "comment": "If we try too hard to decode this, we'll recursively apply `decode_inconsistent_utf8` until the characters turn into random Han and katakana characters.", - "original": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", - "fixed-encoding": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", - "fixed": "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â", - "expect": "pass" - }, - { - "label": "Inconsistent UTF-8 / Latin-1 mojibake", - "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085", - "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", - "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", - "expect": "pass" - }, - { - "label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set", - "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…", - "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", - "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", - "expect": "pass" - }, - { - "label": "Inconsistent mojibake in Portuguese", - "original": "Campeonatos > III Divisão - Série F > Jornadas Classificação", - "fixed": "Campeonatos > III Divisão - Série F > Jornadas Classificação", - "expect": "pass" - }, - { - "label": "Handle Afrikaans 'n character", - "original": "ʼn 
Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", - "fixed-encoding": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", - "fixed": "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.", - "expect": "pass" - }, - { - "label": "Handle Croatian single-codepoint digraphs", - "original": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "fixed-encoding": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "fixed": "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "expect": "pass" - }, - { - "label": "A with an acute accent, in isolation", - "original": "Nicolás", - "fixed": "Nicolás", - "expect": "pass" - }, - { - "label": "sharp S, in isolation, via MacRoman encoding", - "comment": "regression reported in issue #186", - "original": "wei√ü", - "fixed": "weiß", - "expect": "pass" - }, - { - "label": "Sort of negative: this inconsistent mojibake could be Latin-1 or MacRoman, and it was meant to be Latin-1, but it's safest to not decode it as either", - "comment": "issue #202", - "original": "Bremer/Mccoy – DrÃ¥ber", - "fixed": "Bremer/Mccoy – DrÃ¥ber", - "expect": "pass" - }, - { - "label": "Negative: 'è' preceded by a non-breaking space is not a small capital Y", - "original": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", - "fixed": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", - "expect": "pass" - }, - { - "label": "Negative: multiplication sign and ellipsis", - "comment": "Should not turn into a dot below", - "original": "4288×…", - "fixed": "4288×…", - "expect": "pass" - }, - { - "label": "Negative: accents are sometimes used as quotes", - "comment": "Under a previous heuristic, this tested the CESU-8 decoder, which would try to decode it 
and fail when it hit the end of the string", - "original": "``toda produzida pronta pra assa aí´´", - "fixed": "``toda produzida pronta pra assa aí´´", - "expect": "pass" - }, - { - "label": "Negative: 'Õ' followed by an ellipsis", - "comment": "Should not turn into the Armenian letter Յ", - "original": "HUHLL Õ…", - "fixed": "HUHLL Õ…", - "expect": "pass" - }, - { - "label": "Negative: 'Ê' followed by an ellipsis", - "comment": "Should not turn into a squat reversed esh", - "original": "RETWEET SE VOCÊ…", - "fixed": "RETWEET SE VOCÊ…", - "expect": "pass" - }, - { - "label": "Negative: 'É' followed by an ellipsis", - "comment": "Should not turn into 'MARQUɅ'", - "original": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", - "fixed": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", - "expect": "pass" - }, - { - "label": "Negative: 'Ó' followed by an ellipsis", - "comment": "Should not turn into 'SӅ'", - "original": "TEM QUE SEGUIR, SDV SÓ…", - "fixed": "TEM QUE SEGUIR, SDV SÓ…", - "expect": "pass" - }, - { - "label": "Negative: 'É' followed by a curly apostrophe", - "comment": "Should not turn into 'ZZAJɒs'", - "original": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", - "fixed-encoding": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", - "fixed": "Join ZZAJÉ's Official Fan List and receive news, events, and more!", - "expect": "pass" - }, - { - "label": "Negative: 'é' preceded by curly apostrophe", - "comment": "Should not turn into 'LՎpisode'", - "original": "L’épisode 8 est trop fou ouahh", - "fixed-encoding": "L’épisode 8 est trop fou ouahh", - "fixed": "L'épisode 8 est trop fou ouahh", - "expect": "pass" - }, - { - "label": "Negative: three raised eyebrows or something?", - "comment": "Should not turn into private use character U+F659", - "original": "Ôôô VIDA MINHA", - "fixed": "Ôôô VIDA MINHA", - "expect": "pass" - }, - { - "label": "Negative: copyright sign preceded by non-breaking space", - "comment": "Should not turn into 
'ʩ'", - "original": "[x]\u00a0©", - "fixed": "[x]\u00a0©", - "expect": "pass" - }, - { - "label": "Negative: en dash and infinity sign", - "comment": "Should not turn into '2012Ѱ'", - "original": "2012—∞", - "fixed": "2012—∞", - "expect": "pass" - }, - { - "label": "Negative: This Е is a Ukrainian letter, but nothing else is wrong", - "original": "SENSЕ - Oleg Tsedryk", - "fixed": "SENSЕ - Oleg Tsedryk", - "expect": "pass" - }, - { - "label": "Negative: angry face", - "comment": "The face should not turn into '`«'", - "original": "OK??:( `¬´ ):", - "fixed": "OK??:( `¬´ ):", - "expect": "pass" - }, - { - "label": "Negative, synthetic: face with glasses and a raised eyebrow", - "original": "( o¬ô )", - "fixed": "( o¬ô )", - "expect": "pass" - }, - { - "label": "Negative: triangle and degree sign", - "comment": "I'm not really sure what it *is* supposed to be, but it's not 'ơ'", - "original": "∆°", - "fixed": "∆°", - "expect": "pass" - }, - { - "label": "Negative: Portuguese with inverted question mark", - "comment": "Former false positive - it should not turn into 'QUEM ɿ'", - "original": "ESSE CARA AI QUEM É¿", - "fixed": "ESSE CARA AI QUEM É¿", - "expect": "pass" - }, - { - "label": "Negative: Portuguese with acute accents as quotation marks", - "comment": "Former false positive - the end should not turn into a superscript H", - "original": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", - "fixed": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", - "expect": "pass" - }, - { - "label": "Negative: Finnish Ä followed by a non-breaking space", - "comment": "Former false positive - should not become a G with a dot", - "original": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", - "fixed": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", - "expect": "pass" - }, - { - "label": "Negative: multiplying by currency", - "comment": "Former false positive - should not become the Hebrew letter 'final pe'", - "original": "Offering 5×£35 pin ups", - "fixed": 
"Offering 5×£35 pin ups", - "expect": "pass" - }, - { - "label": "Negative: registered chocolate brand name", - "comment": "Former false positive - should not become the IPA letter 'lezh'", - "original": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", - "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", - "expect": "pass" - }, - { - "label": "Negative: it looks like Windows-1257 mojibake but someone writes their name this way", - "comment": "Should not become a cedilla", - "original": "Connect with Āø on Facebook", - "fixed": "Connect with Āø on Facebook", - "expect": "pass" - }, - { - "label": "Mostly negative: we only need to fix C1 control characters", - "comment": "We should not decode 'é\u0085 ' as '酠'", - "original": "C'est vrai que nous n'en avons pas encore beaucoup parlé\u0085 Tu sais, ça fait de nombreuses années", - "fixed": "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années", - "expect": "pass" - }, - { - "label": "French example containing non-breaking spaces", - "original": "ART TRIP Ã\u00a0 l'office de tourisme", - "fixed": "ART TRIP à l'office de tourisme", - "expect": "pass" - }, - { - "label": "English example in UTF-8 / Windows-1251 with a ligature", - "original": "This is signiп¬Ѓcantly lower than the respective share", - "fixed-encoding": "This is significantly lower than the respective share", - "fixed": "This is significantly lower than the respective share", - "expect": "pass" - }, - { - "label": "Synthetic: we can recognize à in some cases when it's the only mojibake", - "original": "voilà le travail", - "fixed": "voilà le travail", - "expect": "pass" - }, - { - "label": "Synthetic: we can recognize à at the end of a word when it absorbs a following space", - "original": "voilà le travail", - "fixed": "voilà le travail", - "expect": "pass" - }, - { - "label": "Negative: We don't fix à in all 
contexts", - "original": "C O N C L U S à O", - "fixed": "C O N C L U S à O", - "expect": "pass" - }, - { - "label": "'à' remains its own word, even if spaces after it get coalesced into one", - "original": "à perturber la réflexion des théologiens jusqu'à nos jours", - "fixed": "à perturber la réflexion des théologiens jusqu'à nos jours", - "expect": "pass" - }, - { - "label": "Fix 'à' in inconsistent mojibake", - "original": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", - "fixed-encoding": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", - "fixed": "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation", - "expect": "pass" - }, - { - "label": "The Portuguese word 'às' does not become 'à s' due to the French fix", - "original": "com especial atenção à s crianças", - "fixed": "com especial atenção às crianças", - "expect": "pass" - }, - { - "label": "This is why we require a space after the 's' in 'às'", - "original": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", - "fixed": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", - "expect": "pass" - }, - { - "label": "We can fix 'à' in windows-1251 sometimes as well", - "original": "La rГ©gion de Dnepropetrovsk se trouve Г l’ouest de l’Ukraine", - "fixed-encoding": "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine", - "fixed": "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine", - "expect": "pass" - }, - { - "label": "'à quele' is the Portuguese word 'àquele', not 'à quele'", - "original": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante à quele 
observado nas lesões por imunocomplexo em excesso de anticorpos", - "fixed": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos", - "expect": "pass" - }, - { - "label": "A complex, lossy pile-up of mojibake in Portuguese", - "original": "â € ðŸ“� Regulamento: â € âš ï¸� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. âš ï¸� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até à s 19h do mesmo dia em uma nova publicação em nosso instagram. â € Boa sorte!!! 😀ðŸ�°", - "fixed": "⠀ �\u00a0Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. 
⠀ Boa sorte!!!\u00a0😀�", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Gaelic involving non-breaking spaces", - "original": "CÃ\u00a0nan nan GÃ\u00a0idheal", - "fixed": "Cànan nan Gàidheal", - "expect": "pass" - }, - { - "label": "Misleading mix-up in Spanish", - "comment": "The original text has mojibake, but the sequence 'á \u0093' can decode as U+1813 MONGOLIAN DIGIT THREE, when the whole string should really just decode as a Latin-1/Windows-1252 mixup", - "original": "tiene demora y está \u0093próximo a resolverse\u0094", - "fixed": "tiene demora y está \"próximo a resolverse\"", - "expect": "fail" - }, - { - "label": "A-with-grave in Vietnamese", - "comment": "Currently adds extra spaces that shouldn't be there", - "original": "Xem clip hĂ i, phim hĂ i má»›i hay nhất", - "fixed": "Xem clip hài, phim hài mới hay nhất", - "expect": "fail" - }, - { - "label": "Punctuation pile-up should actually be musical notes", - "original": "Engkau masih yg terindah, indah di dalam hatiku♫~", - "fixed": "Engkau masih yg terindah, indah di dalam hatiku♫~", - "expect": "pass" - }, - { - "label": "Latin-1 / MacRoman mixup in Spanish", - "comment": "Requires something like encoding detection", - "original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.", - "fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.", - "expect": "fail" - }, - { - "label": "subtle UTF-8 / codepage 437 mixup in Spanish", - "original": "┬┐que diferencia hay?", - "fixed": "¿que diferencia hay?", - "expect": "fail" - }, - { - "label": "Latin-1 / MacRoman mixup in Spanish, 2 characters", - "comment": "Requires something like encoding detection", - "original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente", - "fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente", - "expect": "fail" - }, - { - "label": "An example with 'à' in windows-1251 where we need our heuristic to be bolder", - 
"original": "faites attention Г bien vous renseigner avant sur le mГ©dicament", - "fixed": "faites attention à bien vous renseigner avant sur le médicament", - "expect": "fail" - }, - { - "label": "UTF-8 / Windows-1251 mixup in tweet spam", - "original": "Blog Traffic Tip 2 – Broadcast Email Your Blog", - "fixed": "Blog Traffic Tip 2 – Broadcast Email Your Blog", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1251 mixup", - "original": "S&P Confirms Ukrsotsbank’s “B-“ Rating", - "fixed-encoding": "S&P Confirms Ukrsotsbank’s “B-“ Rating", - "fixed": "S&P Confirms Ukrsotsbank's \"B-\" Rating", - "expect": "pass" - }, - { - "label": "Dutch example with ë", - "comment": "from issue reported by MicroJackson", - "original": "ongeëvenaard", - "fixed-encoding": "ongeëvenaard", - "fixed": "ongeëvenaard", - "expect": "pass" - }, - { - "label": "HTML entity on top of UTF-8 / Latin-1", - "original": "10μs", - "fixed-encoding": "10μs", - "fixed": "10μs", - "expect": "pass" - }, - { - "label": "Negative: Two concatenated strings", - "comment": "Should not turn into 'fratarak᧠141'", - "original": "Oborzos, per. Vahbarz, frataraká§ 141", - "fixed": "Oborzos, per. 
Vahbarz, frataraká§ 141", - "expect": "pass" - }, - { - "label": "Negative: Indonesian leetspeak", - "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", - "fixed": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", - "expect": "pass" - }, - { - "label": "Three layers of UTF-8 / MacRoman mixup in French", - "comment": "You're welcome", - "original": "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8", - "fixed": "Merci de télécharger le plug-in Flash Player 8", - "expect": "pass" - }, - { - "label": "UTF-8 / MacRoman mixup in French", - "original": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ķ", - "fixed": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…", - "expect": "pass" - }, - { - "label": "Italian UTF-8 / MacRoman example with ò", - "original": "Le Vigne di Zam√≤", - "fixed": "Le Vigne di Zamò", - "expect": "pass" - }, - { - "label": "Italian UTF-8 / MacRoman mojibake that looks like math", - "comment": "False negative: 'pi√π' is a bit too reasonable to fix", - "original": "Sarai ricontattato dal nostro Esperto al pi√π presto.", - "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", - "expect": "fail" - }, - { - "label": "Latvian UTF-8 / Windows-1257 mojibake", - "original": "Å veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", - "fixed": "Šveices baņķieri gaida konkrētus investīciju projektus", - "expect": "pass" - }, - { - "label": "Latvian UTF-8 / MacRoman mojibake", - "original": "SaeimƒÅ ievƒìlƒìtƒÅs partijas \"Progresƒ´vie\" lƒ´dzvadƒ´tƒÅja Anto≈Üina ≈Öena≈°eva atbild uz ≈æurnƒÅlistu jautƒÅjumiem pƒìc partijas tik≈°anƒÅs ar Valsts prezidentu Rƒ´gas pilƒ´,", - "fixed": "Saeimā ievēlētās partijas \"Progresīvie\" līdzvadītāja Antoņina Ņenaševa atbild uz žurnālistu jautājumiem pēc partijas tikšanās ar Valsts prezidentu Rīgas pilī,", 
- "expect": "pass" - }, - { - "label": "Lithuanian UTF-8 / Windows-1257 mojibake", - "original": "Å iaip ÄÆdomu, kaip ÄÆsivaizduoji. Visų pirma tam reikia laiko.", - "fixed": "Šiaip įdomu, kaip įsivaizduoji. Visų pirma tam reikia laiko.", - "expect": "pass" - }, - { - "label": "Lithuanian UTF-8 / Windows-1250 mojibake", - "original": "Lietuva pagrÄŻstai gali paklausti: Ĺ˝inoma, kad ne.", - "fixed": "Lietuva pagrįstai gali paklausti: Žinoma, kad ne.", - "expect": "pass" - }, - { - "label": "Hebrew UTF-8 / Windows-1252 mojibake", - "comment": "reported by SuperIRabbit as issue #158", - "original": "בהודעה", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Wide comma in UTF-8 / Windows-1252", - "original": "Ningbo,China", - "fixed-encoding": "Ningbo,China", - "fixed": "Ningbo,China", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", - "original": "בהודעה", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / MacRoman mojibake", - "original": "◊ë◊î◊ï◊ì◊¢◊î", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / Latin-1 mojibake", - "comment": "This example uses low-numbered codepoints to spell 'ABBA' in Hebrew, so that it falls into the range where Latin-1 is different from Windows-1252. 
As a bonus, this example looks right even if your RTL text rendering isn't working.", - "original": "×\u0090×\u0091×\u0091×\u0090", - "fixed": "אבבא", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / Windows-1252 mojibake", - "original": "رسالة", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / Windows-1250 mojibake", - "original": "رسالة", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / MacRoman mojibake", - "original": "ÿ±ÿ≥ÿߟÑÿ©", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Negative: math in Unicode", - "comment": "This isn't mojibake, it's an actual equation", - "original": "(-1/2)! = √π", - "fixed": "(-1/2)! = √π", - "expect": "pass" - }, - { - "label": "Negative: Leet line-art", - "comment": "The heuristic before v6 loved to 'fix' this and decode it as 'ôaſaſaſaſa'", - "original": "├┤a┼┐a┼┐a┼┐a┼┐a", - "fixed": "├┤a┼┐a┼┐a┼┐a┼┐a", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Brontë's name does not end with a Korean syllable", - "comment": "The original example of why ftfy needs heuristics", - "original": "I'm not such a fan of Charlotte Brontë…”", - "fixed-encoding": "I'm not such a fan of Charlotte Brontë…”", - "fixed": "I'm not such a fan of Charlotte Brontë…\"", - "expect": "pass" - }, - { - "label": "Synthetic, negative: hypothetical Swedish product name", - "comment": "This used to be a constructed example of a false positive, until you added another symbol", - "original": "AHÅ™, the new sofa from IKEA", - "fixed": "AHÅ™, the new sofa from IKEA", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Ukrainian capital letters", - "comment": "We need to fix Windows-1251 conservatively, or else this decodes as '²ʲ'", - "original": "ВІКІ is Ukrainian for WIKI", - "fixed": "ВІКІ is Ukrainian for WIKI", - "expect": "pass" - }, - { - "label": "Synthetic, negative: don't leak our internal use of byte 0x1A", - "comment": "We use byte 
0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", - "original": "These control characters \u001a are apparently intentional \u0081", - "fixed-encoding": "These control characters \u001a are apparently intentional \u0081", - "fixed": "These control characters are apparently intentional \u0081", - "expect": "pass" - }, - { - "label": "Synthetic, negative: U+1A on its own", - "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", - "original": "Here's a control character: \u001a", - "fixed-encoding": "Here's a control character: \u001a", - "fixed": "Here's a control character: ", - "expect": "pass" - }, - { - "label": "Synthetic, negative: A-with-circle as an Angstrom sign", - "comment": "Should not turn into '10 ŗ'", - "original": "a radius of 10 Å—", - "fixed": "a radius of 10 Å—", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Spanish with exclamation points on the wrong sides", - "original": "!YO SÉ¡", - "fixed": "!YO SÉ¡", - "expect": "pass" - }, - { - "label": "Synthetic: fix text with backslashes in it", - "comment": "Tests for a regression on a long-ago bug", - "original": "<40\\% vs \u00e2\u0089\u00a540\\%", - "fixed": "<40\\% vs ≥40\\%", - "expect": "pass" - }, - { - "label": "Synthetic: curly quotes with mismatched encoding glitches in Latin-1", - "original": "\u00e2\u0080\u009cmismatched quotes\u0085\u0094", - "fixed-encoding": "“mismatched quotes…”", - "fixed": "\"mismatched quotes…\"", - "expect": "pass" - }, - { - "label": "Synthetic: curly quotes with mismatched encoding glitches in Windows-1252", - "original": "“mismatched quotes…”", - "fixed-encoding": "“mismatched quotes…”", - "fixed": "\"mismatched quotes…\"", - "expect": "pass" - }, - { - "label": "Synthetic: lossy decoding in sloppy-windows-1252", - "original": "“lossy decodingâ€�", - "fixed-encoding": "“lossy decoding�", - "fixed": "\"lossy 
decoding�", - "expect": "pass" - }, - { - "label": "Synthetic: French word for August in windows-1252", - "original": "août", - "fixed-encoding": "août", - "fixed": "août", - "expect": "pass" - }, - { - "label": "Synthetic: French word for hotel in all-caps windows-1252", - "original": "HÔTEL", - "fixed-encoding": "HÔTEL", - "fixed": "HÔTEL", - "expect": "pass" - }, - { - "label": "Synthetic: Scottish Gaelic word for 'subject' in all-caps windows-1252", - "original": "CÙIS", - "fixed-encoding": "CÙIS", - "fixed": "CÙIS", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Romanian word before a non-breaking space", - "comment": "The word literally means 'not even once', which might be a good recommendation about fixing Romanian mojibake", - "original": "NICIODATĂ\u00a0", - "fixed": "NICIODATĂ\u00a0", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Be careful around curly apostrophes", - "comment": "It shouldn't end up saying 'a lot of Òs'", - "original": "There are a lot of Ã’s in mojibake text", - "fixed-encoding": "There are a lot of Ã’s in mojibake text", - "fixed": "There are a lot of Ã's in mojibake text", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Romanian word before a trademark sign", - "comment": "We would change 'DATÙ' to 'DATÙ' if it passed the badness heuristic", - "original": "NICIODATĂ™", - "fixed": "NICIODATĂ™", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Lithuanian word before a trademark sign", - "comment": "Similar to the above example. Shouldn't turn into U+0619 ARABIC SMALL DAMMA", - "original": "TRANSFORMATORIŲ™", - "fixed": "TRANSFORMATORIŲ™", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Norwegian capitalized sentence", - "comment": "We're shouting that the island of Håøya is gullible. 
It should not turn into 'HŨYA ER BLŨYD'.", - "original": "HÅØYA ER BLÅØYD", - "fixed": "HÅØYA ER BLÅØYD", - "expect": "pass" - }, - { - "label": "Synthetic, negative: raised eyebrow kaomoji", - "original": "Ō¬o", - "fixed": "Ō¬o", - "expect": "pass" - }, - { - "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", - "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. Searching for similar real text yields a lot of examples that actually come out fine.", - "original": "MISUTÂ\u00a0AJIKKO", - "fixed": "MISUTÂ\u00a0AJIKKO", - "expect": "fail" - }, - { - "label": "Synthetic, negative: Camel-cased Serbian that looks like a UTF-8 / Windows-1251 mixup", - "comment": "I made this text up, but it seems like it means 'HelloDevil'. Could be a username or something.", - "original": "ПоздравЂаво", - "fixed": "ПоздравЂаво", - "expect": "pass" - }, - { - "label": "Synthetic: mojibake with trademark sign at the end of a word", - "comment": "I recall the correct version of this text from a sign in the movie Amélie. 
Now we can help her twin Amélie, who makes mojibaked signs.", - "original": "OÙ ET QUAND?", - "fixed": "OÙ ET QUAND?", - "expect": "pass" - } -] \ No newline at end of file From 8f8ee2c41b69157ac1dc1e46b9135da4eb3a03e9 Mon Sep 17 00:00:00 2001 From: arborelia Date: Wed, 30 Oct 2024 16:45:36 -0400 Subject: [PATCH 33/34] more ruff checks --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 13aeee9..130dec2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ line-length = 100 target-version = "py39" [tool.ruff.lint] -select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH"] +select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH", "FURB"] ignore = [ "ANN101", "ANN401", From 74dd0452b48286a3770013b3a02755313bd5575e Mon Sep 17 00:00:00 2001 From: Elia Robyn Lake Date: Wed, 30 Oct 2024 17:00:25 -0400 Subject: [PATCH 34/34] load test data from a directory --- tests/test-cases/README.md | 20 ++++++++++++++++++++ tests/test_examples_in_json.py | 13 +++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 tests/test-cases/README.md diff --git a/tests/test-cases/README.md b/tests/test-cases/README.md new file mode 100644 index 0000000..673bd5f --- /dev/null +++ b/tests/test-cases/README.md @@ -0,0 +1,20 @@ +# ftfy test cases + +This directory contains JSON files with test cases for ftfy. Many of them are real mojibake found in the wild, such as by listening to the Twitter firehose (when that existed), searching through the OSCAR web crawl, or in issue reports from users. + +Cases labeled "synthetic" were not found in the wild, but were instead constructed to test a particular edge case. + +Cases labeled "negative" are not mojibake but look like they could be. We're testing that ftfy does not alter the text (except for its usual processing such as un-curling quotes).
+ +`known-failures.json` contains cases that we would do better at with an improved heuristic. Most of these are false negatives, where ftfy does not figure out how to fix the text. ftfy aims to have no false positives, but there is one synthetic false positive in `known-failures.json`. + +## Structure of a test case + +A test case contains the following fields: + +- `label`: A description of the test case, shown when pytest runs in verbose mode. +- `comment`: Further details on the test case because JSON doesn't have comments. +- `original`: The text to run through ftfy. +- `fixed-encoding` (optional): the expected result of `ftfy.fix_encoding(original)`. If unspecified, uses the value from `fixed`. +- `fixed`: the expected result of `ftfy.fix_text(original)`. +- `expect`: "pass" for test cases that should pass, or "fail" for known failures. \ No newline at end of file diff --git a/tests/test_examples_in_json.py b/tests/test_examples_in_json.py index 83dcb8e..2be9eb4 100644 --- a/tests/test_examples_in_json.py +++ b/tests/test_examples_in_json.py @@ -32,8 +32,17 @@ from ftfy import apply_plan, fix_and_explain, fix_encoding_and_explain, fix_text THIS_DIR = Path(__file__).parent -TEST_FILENAME = THIS_DIR / "test_cases.json" -TEST_DATA = json.load(TEST_FILENAME.open(encoding="utf-8")) +TEST_CASE_DIR = THIS_DIR / "test-cases" + + +def load_test_data() -> list[dict]: + test_data = [] + for filepath in sorted(TEST_CASE_DIR.glob("*.json")): + test_data.extend(json.loads(filepath.read_text(encoding="utf-8"))) + return test_data + + +TEST_DATA = load_test_data() TESTS_THAT_PASS = [test for test in TEST_DATA if test["expect"] == "pass"] TESTS_THAT_FAIL = [test for test in TEST_DATA if test["expect"] == "fail"]