diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index ede12d90..00000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,96 +0,0 @@ -name: Publish Python distribution 📦 to PyPI - -on: - push: - tags: - - "v*" - -jobs: - build: - name: Build distribution 📦 - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.x" - - name: Install pypa/build - run: >- - python3 -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: python3 -m build - - name: Store the distribution packages - uses: actions/upload-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - publish-to-pypi: - name: >- - Publish Python distribution 📦 to PyPI - if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes - needs: - - build - runs-on: ubuntu-latest - environment: - name: pypi - url: https://pypi.org/p/ftfy - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - - steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - - github-release: - name: >- - Sign the Python distribution 📦 with Sigstore - and upload them to GitHub Release - needs: - - publish-to-pypi - runs-on: ubuntu-latest - - permissions: - contents: write # IMPORTANT: mandatory for making GitHub Releases - id-token: write # IMPORTANT: mandatory for sigstore - - steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Sign the dists with Sigstore - uses: sigstore/gh-action-sigstore-python@v3.0.0 - with: - inputs: >- - ./dist/*.tar.gz - ./dist/*.whl - - name: Create GitHub Release - env: - GITHUB_TOKEN: ${{ github.token }} - run: >- - gh release create - '${{ github.ref_name }}' - --repo '${{ github.repository }}' - --notes "" - - name: Upload artifact signatures to GitHub Release - env: - GITHUB_TOKEN: ${{ github.token }} - # Upload to GitHub Release using the `gh` CLI. - # `dist/` contains the built packages, and the - # sigstore-produced signatures and certificates. - run: >- - gh release upload - '${{ github.ref_name }}' dist/** - --repo '${{ github.repository }}' diff --git a/.gitignore b/.gitignore index a05d30d0..f9a3c19a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,5 +10,3 @@ twitterlogs .pytest_cache .tox specimens -.vscode -.python-version diff --git a/.mailmap b/.mailmap index 7897e2d1..c0a85804 100644 --- a/.mailmap +++ b/.mailmap @@ -1,4 +1,6 @@ # Robyn has used different names and e-mail addresses in the course of this project. Map them all to her current name and e-mail. -Robyn Speer -Robyn Speer -Robyn Speer +Robyn Speer +Robyn Speer +Robyn Speer +Robyn Speer + diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 6fe1099d..fbabfe31 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,23 +1,11 @@ -# .readthedocs.yaml -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-24.04 - tools: - python: "3.11" - commands: - - asdf plugin add uv - - asdf install uv latest - - asdf global uv latest - - uv venv - - uv sync - - .venv/bin/python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html - -# Build documentation in the docs/ directory with Sphinx sphinx: - configuration: docs/conf.py + configuration: docs/conf.py + +python: + version: 3.8 + install: + - method: pip + path: . + extra_requirements: + - docs diff --git a/CHANGELOG.md b/CHANGELOG.md index 28f6c207..9cdcc0bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,82 +1,3 @@ -## Version 6.3.1 (October 25, 2024) - -- Fixed `license` metadata field in pyproject.toml. -- Removed extraneous files from the `hatchling` sdist output. - -## Version 6.3.0 (October 8, 2024) - -- Switched packaging from poetry to uv. -- Uses modern Python packaging exclusively (no setup.py). -- Added support for mojibake in Windows-1257 (Baltic). -- Detects mojibake for "Ü" in an uppercase word, such as "ZURÜCK". -- Expanded a heuristic that notices improbable punctuation. -- Fixed a false positive involving two concatenated strings, one of which began with the § sign. -- Rewrote `chardata.py` to be more human-readable and debuggable, instead of being full of - keysmash-like character sets. - -## Version 6.2.3 (August 5, 2024) - -- Updated PyPI metadata. - -## Version 6.2.2 (August 5, 2024) - -- Updated Read the Docs config so that docs might build again. - -## Version 6.2.1 (August 5, 2024) - -- Updated setup.py and tox.ini to indicate support for Python 3.8 through 3.13. -- Replaced the text file used in CLI tests with a better one that tests the same issue. -- Lints and auto-formatting using ruff. -- Packaging and test fixes by Michał Górny. - -## Version 6.2.0 (March 15, 2024) - -- Fixed a case where an en-dash and a space near other mojibake would be - interpreted (probably incorrectly) as MacRoman mojibake. -- Added [project.urls] metadata to pyproject.toml. -- README contains license clarifications for entitled jerks. - -## Version 6.1.3 (November 21, 2023) - -- Updated wcwidth. -- Switched to the Apache 2.0 license. -- Dropped support for Python 3.7. - -## Version 6.1.2 (February 17, 2022) - -- Added type information for `guess_bytes`. - -## Version 6.1.1 (February 9, 2022) - -- Updated the heuristic to fix the letter ß in UTF-8/MacRoman mojibake, - which had regressed since version 5.6. - -- Packaging fixes to pyproject.toml. - -## Version 6.1 (February 9, 2022) - -- Updated the heuristic to fix the letter Ñ with more confidence. - -- Fixed type annotations and added py.typed. - -- ftfy is packaged using Poetry now, and wheels are created and uploaded to - PyPI. - -## Version 6.0.3 (May 14, 2021) - -- Allow the keyword argument `fix_entities` as a deprecated alias for - `unescape_html`, raising a warning. - -- `ftfy.formatting` functions now disregard ANSI terminal escapes when - calculating text width. - - -## Version 6.0.2 (May 4, 2021) - -This version is purely a cosmetic change, updating the maintainer's e-mail -address and the project's canonical location on GitHub. - - ## Version 6.0.1 (April 12, 2021) - The `remove_terminal_escapes` step was accidentally not being used. This diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 00000000..07a1a7f1 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,4 @@ +wheelJob( + test: 'pytest', + upstream: [ 'wheelhouse-init' ] +) diff --git a/LICENSE.txt b/LICENSE.txt index 275a4299..cb5ed208 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,13 +1,20 @@ -Copyright 2023 Robyn Speer +Copyright (C) 2013-2018 Robyn Speer (rspeer@luminoso.com) +MIT License -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: - http://www.apache.org/licenses/LICENSE-2.0 +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index a138cfc6..f6bd747e 100644 --- a/README.md +++ b/README.md @@ -4,24 +4,23 @@ [![Docs](https://readthedocs.org/projects/ftfy/badge/?version=latest)](https://ftfy.readthedocs.org/en/latest/) ```python - ->>> from ftfy import fix_encoding >>> print(fix_encoding("(ง'⌣')ง")) (ง'⌣')ง - ``` -The full documentation of ftfy is available at [ftfy.readthedocs.org](https://ftfy.readthedocs.org). The documentation covers a lot more than this README, so here are some links into it: +The full documentation of ftfy is available at [ftfy.readthedocs.org](https://ftfy.readthedocs.org). The documentation covers a lot more than this README, so here are +some links into it: + +- [Fixing problems and getting explanations](https://ftfy.readthedocs.io/en/v6.0.1/explain.html) +- [Configuring ftfy](https://ftfy.readthedocs.io/en/v6.0.1/config.html) +- [Encodings ftfy can handle](https://ftfy.readthedocs.io/en/v6.0.1/encodings.html) +- [“Fixer” functions](https://ftfy.readthedocs.io/en/v6.0.1/fixes.html) +- [Is ftfy an encoding detector?](https://ftfy.readthedocs.io/en/v6.0.1/detect.html) +- [Heuristics for detecting mojibake](https://ftfy.readthedocs.io/en/v6.0.1/heuristics.html) +- [Support for “bad” encodings](https://ftfy.readthedocs.io/en/v6.0.1/bad_encodings.html) +- [Command-line usage](https://ftfy.readthedocs.io/en/v6.0.1/cli.html) +- [Citing ftfy](https://ftfy.readthedocs.io/en/v6.0.1/cite.html) -- [Fixing problems and getting explanations](https://ftfy.readthedocs.io/en/latest/explain.html) -- [Configuring ftfy](https://ftfy.readthedocs.io/en/latest/config.html) -- [Encodings ftfy can handle](https://ftfy.readthedocs.io/en/latest/encodings.html) -- [“Fixer” functions](https://ftfy.readthedocs.io/en/latest/fixes.html) -- [Is ftfy an encoding detector?](https://ftfy.readthedocs.io/en/latest/detect.html) -- [Heuristics for detecting mojibake](https://ftfy.readthedocs.io/en/latest/heuristic.html) -- [Support for “bad” encodings](https://ftfy.readthedocs.io/en/latest/bad_encodings.html) -- [Command-line usage](https://ftfy.readthedocs.io/en/latest/cli.html) -- [Citing ftfy](https://ftfy.readthedocs.io/en/latest/cite.html) ## Testimonials @@ -35,6 +34,8 @@ The full documentation of ftfy is available at [ftfy.readthedocs.org](https://ft — Brennan Young - “I have no idea when I’m gonna need this, but I’m definitely bookmarking it.” — [/u/ocrow](https://reddit.com/u/ocrow) +- “9.2/10” + — [pylint](https://bitbucket.org/logilab/pylint/) ## What it does @@ -78,32 +79,38 @@ The following text could be encoded in Windows-1252 and decoded in UTF-8, and it >>> ftfy.fix_text('IL Y MARQUÉ…') 'IL Y MARQUÉ…' + ## Installing -ftfy is a Python 3 package that can be installed using `pip` or `uv pip`: +ftfy is a Python 3 package that can be installed using `pip`: pip install ftfy -(Or use `pip3 install ftfy` on systems where Python 2 and 3 are both globally installed and `pip` refers to Python 2.) - -If you use `poetry`, you can use ftfy as a dependency in the usual way (such as `poetry add ftfy`). +(Or use `pip3 install ftfy` on systems where Python 2 and 3 are both globally +installed and `pip` refers to Python 2.) -### Local development +You can also clone this Git repository and install it with +`python setup.py install`. -ftfy is developed using [uv](https://github.com/astral-sh/uv). You can build a virtual environment with its local dependencies by running `uv venv`, and test it with `uv run pytest`. ## Who maintains ftfy? -I'm Robyn Speer, also known as Elia Robyn Lake. You can find my projects -[on GitHub](https://github.com/rspeer) and my posts on [my own blog](https://posts.arborelia.net). +I'm Robyn Speer. You can find me [on GitHub](https://github.com/rspeer). +I created ftfy as part of my work at the text understanding company +[Luminoso](https://luminoso.com). + ## Citing ftfy ftfy has been used as a crucial data processing step in major NLP research. -It's important to give credit appropriately to everyone whose work you build on in research. This includes software, not just high-status contributions such as mathematical models. All I ask when you use ftfy for research is that you cite it. +It's important to give credit appropriately to everyone whose work you build on +in research. This includes software, not just high-status contributions such as +mathematical models. All I ask when you use ftfy for research is that you cite +it. -ftfy has a citable record [on Zenodo](https://zenodo.org/record/2591652). A citation of ftfy may look like this: +ftfy has a citable record [on Zenodo](https://zenodo.org/record/2591652). +A citation of ftfy may look like this: Robyn Speer. (2019). ftfy (Version 5.5). Zenodo. http://doi.org/10.5281/zenodo.2591652 @@ -120,15 +127,3 @@ In BibTeX format, the citation is:: url = {https://doi.org/10.5281/zenodo.2591652} } -## Important license clarifications - -If you do not follow ftfy's license, you do not have a license to ftfy. - -This sounds obvious and tautological, but there are people who think open source licenses mean that they can just do what they want, especially in the field of generative AI. It's a permissive license but you still have to follow it. The [Apache license](https://www.apache.org/licenses/LICENSE-2.0) is the only thing that gives you permission to use and copy ftfy; otherwise, all rights are reserved. - -If you use or distribute ftfy, you must follow the terms of the [Apache license](https://www.apache.org/licenses/LICENSE-2.0), including that you must attribute the author of ftfy (Robyn Speer) correctly. - -You _may not_ make a derived work of ftfy that obscures its authorship, such as by putting its code in an AI training dataset, including the code in AI training at runtime, or using a generative AI that copies code from such a dataset. - -At my discretion, I may notify you of a license violation, and give you a chance to either remedy it or delete all copies of ftfy in your possession. - diff --git a/docs/conf.py b/docs/conf.py index 0aef57bd..2f98f740 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # # ftfy documentation build configuration file, created by # sphinx-quickstart on Wed Aug 28 03:18:27 2013. @@ -10,83 +11,84 @@ # All configuration values have a default; values that are commented out # serve to show the default. +import sys, os # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -# sys.path.insert(0, os.path.abspath('.')) +#sys.path.insert(0, os.path.abspath('.')) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' +#needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] # Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] +templates_path = ['_templates'] # The suffix of source filenames. -source_suffix = ".rst" +source_suffix = '.rst' # The encoding of source files. -# source_encoding = 'utf-8-sig' +#source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = "index" +master_doc = 'index' # General information about the project. -project = "ftfy" -copyright = "2024, Robyn Speer" +project = u'ftfy' +copyright = u'2021, Robyn Speer' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = "6.3" +version = '6.0' # The full version, including alpha/beta/rc tags. -release = "6.3.1" +release = '6.0.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -# language = None +#language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -# today = '' +#today = '' # Else, today_fmt is used as the format for a strftime call. -# today_fmt = '%B %d, %Y' +#today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ["_build"] +exclude_patterns = ['_build'] # The reST default role (used for this markup: `text`) to use for all documents. -default_role = "code" +default_role = 'code' # If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True +#add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -# add_module_names = True +#add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -# show_authors = False +#show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = "default" -pygments_dark_style = "monokai" +pygments_style = 'default' +pygments_dark_style = 'monokai' # A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] +#modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False +#keep_warnings = False # -- Options for HTML output --------------------------------------------------- @@ -105,6 +107,7 @@ "font-stack": "Source Sans Pro, sans-serif", "font-stack--monospace": "Inconsolata", "code-font-size": "18px", + # I don't know why furo wants inline code to be so small, but don't let it "font-size--small--2": "100%", }, @@ -114,15 +117,16 @@ "font-stack": "Source Sans Pro, sans-serif", "font-stack--monospace": "Inconsolata", "code-font-size": "18px", + "font-size--small--2": "100%", }, } html_css_files = [ - "css/custom.css", + 'css/custom.css', ] # Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] +#html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". @@ -133,72 +137,74 @@ # The name of an image file (relative to this directory) to place at the top # of the sidebar. -# html_logo = None +#html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -# html_favicon = None +#html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] +html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -html_last_updated_fmt = "%b %d, %Y" +html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -# html_use_smartypants = True +#html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -# html_sidebars = {} +#html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -# html_additional_pages = {} +#html_additional_pages = {} # If false, no module index is generated. -# html_domain_indices = True +#html_domain_indices = True # If false, no index is generated. -# html_use_index = True +#html_use_index = True # If true, the index is split into individual pages for each letter. -# html_split_index = False +#html_split_index = False # If true, links to the reST sources are added to the pages. html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# html_show_sphinx = True +#html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# html_show_copyright = True +#html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -# html_use_opensearch = '' +#html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = None +#html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = "ftfydoc" +htmlhelp_basename = 'ftfydoc' # -- Options for LaTeX output -------------------------------------------------- latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - # 'preamble': '', +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples @@ -207,33 +213,35 @@ # The name of an image file (relative to this directory) to place at the top of # the title page. -# latex_logo = None +#latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -# latex_use_parts = False +#latex_use_parts = False # If true, show page references after internal links. -# latex_show_pagerefs = False +#latex_show_pagerefs = False # If true, show URL addresses after external links. -# latex_show_urls = False +#latex_show_urls = False # Documents to append as an appendix to all manuals. -# latex_appendices = [] +#latex_appendices = [] # If false, no module index is generated. -# latex_domain_indices = True +#latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [("index", "ftfy", "ftfy Documentation", ["Robyn Speer"], 1)] +man_pages = [ + ('index', 'ftfy', 'ftfy Documentation', ['Robyn Speer'], 1) +] # If true, show URL addresses after external links. -# man_show_urls = False +#man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ @@ -244,13 +252,13 @@ texinfo_documents = [] # Documents to append as an appendix to all manuals. -# texinfo_appendices = [] +#texinfo_appendices = [] # If false, no module index is generated. -# texinfo_domain_indices = True +#texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -# texinfo_show_urls = 'footnote' +#texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -# texinfo_no_detailmenu = False +#texinfo_no_detailmenu = False diff --git a/docs/config.rst b/docs/config.rst index 6e3d5217..ce3057ff 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -7,7 +7,7 @@ The main functions of ftfy -- :func:`ftfy.fix_text` and :func:`ftfy.fix_and_expl All the fixes are on by default, but you can pass in a configuration object or keyword options to turn them off. Check that the default fixes are appropriate for your use case. For example: -- You should set `unescape_html` to False if the output is meant to be interpreted as HTML. +- You should set `fix_entities` to False if the output is meant to be interpreted as HTML. - You should set `fix_character_width` to False if you want to preserve the spacing of CJK text. @@ -28,4 +28,4 @@ The top-level functions of ftfy take a `config` argument that is an instance of Keyword arguments ----------------- -The top-level functions also accept keyword arguments in place of a `config` argument. Given these keyword arguments, they will pass them to the :class:`ftfy.TextFixerConfig` constructor, overriding the default values of those configuration options. +The top-level functions also accept keyword arguments in place of a `config` argument. Given these keyword arguments, they will pass them to the :class:`ftfy.TextFixerConfig` constructor, overriding the default values of those configuration options. \ No newline at end of file diff --git a/docs/encodings.rst b/docs/encodings.rst index b0513a78..13a892f4 100644 --- a/docs/encodings.rst +++ b/docs/encodings.rst @@ -6,15 +6,14 @@ ftfy can't fix all possible mix-ups. Its goal is to cover the most common encodi ftfy can understand text that was decoded as any of these single-byte encodings: - Latin-1 (ISO-8859-1) -- Windows-1250 (cp1250 -- used in Microsoft products in Eastern Europe) -- Windows-1251 (cp1251 -- used in Microsoft products in Russia) -- Windows-1252 (cp1252 -- used in Microsoft products in Western Europe and the Americas) -- Windows-1253 (cp1253 -- used in Microsoft products in Greece) -- Windows-1254 (cp1254 -- used in Microsoft products in Türkiye) -- Windows-1257 (cp1257 -- used in Microsoft products in Baltic countries) +- Windows-1252 (cp1252 -- used in Microsoft products) +- Windows-1251 (cp1251 -- the Russian version of cp1252) +- Windows-1250 (cp1250 -- the Eastern European version of cp1252) +- Windows-1253 (cp1253 -- the Greek version of cp1252) +- Windows-1254 (cp1254 -- the Turkish version of cp1252) - ISO-8859-2 (which is not quite the same as Windows-1250) - MacRoman (used on Mac OS 9 and earlier) -- cp437 (it's the "text mode" in your video card firmware) +- cp437 (used in MS-DOS and some versions of the Windows command prompt) when it was actually intended to be decoded as one of these variable-length encodings: @@ -27,8 +26,6 @@ However, ftfy cannot understand other mixups between single-byte encodings, beca We also can't handle the legacy encodings used for Chinese, Japanese, and Korean, such as ``shift-jis`` and ``gb18030``. See `issue #34`_ for why this is so hard. -I tried adding support for cp850, the cp437-workalike that supported European languages, but I couldn't find any real examples that it fixed, and it introduced some false positives. - -.. _`issue #34`: https://github.com/rspeer/python-ftfy/issues/34 +.. _`issue #34`: https://github.com/LuminosoInsight/python-ftfy/issues/34 Remember that the input to ftfy is Unicode, so it handles actual CJK *text* just fine. It just can't discover that a CJK *encoding* introduced mojibake into the text. diff --git a/docs/index.rst b/docs/index.rst index 1ba7ed00..18082d8c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,12 +1,7 @@ ftfy: fixes text for you ======================== -*Version 6.3* - -“Assume all external input is the result of (a series of) bugs.” -— `RFC 9225`_: Software Defects Considered Harmful - -.. _`RFC 9225`: https://www.rfc-editor.org/rfc/rfc9225.html +*Version 6.0.1* **ftfy** fixes Unicode that's broken in various ways. @@ -16,7 +11,7 @@ This is different from taking in non-Unicode and outputting Unicode, which is no Of course you're better off if your input is decoded properly and has no glitches. But you often don't have any control over your input; it's someone else's mistake, but it's your problem now. ftfy will do everything it can to fix the problem. -ftfy is a heuristic that was designed (not machine-learned) by Robyn Speer. If you use ftfy in research, including pre-processing your language model data, you need to cite it: see :ref:`cite`. +ftfy is a heuristic that was designed (not machine-learned) by Robyn Speer, at Luminoso. If you use ftfy in research, including pre-processing your language model data, you need to cite it: see :ref:`cite`. .. toctree:: :maxdepth: 1 diff --git a/ftfy/__init__.py b/ftfy/__init__.py index fb666989..2c8be4f8 100644 --- a/ftfy/__init__.py +++ b/ftfy/__init__.py @@ -5,29 +5,15 @@ for more information. """ -from __future__ import annotations - import unicodedata -import warnings -from typing import ( - TYPE_CHECKING, - Any, - BinaryIO, - Callable, - Literal, - NamedTuple, - TextIO, - cast, -) - -from ftfy import bad_codecs, chardata, fixes +from typing import List, NamedTuple, Optional, Tuple, Union + +from ftfy import bad_codecs +from ftfy import chardata, fixes from ftfy.badness import is_bad from ftfy.formatting import display_ljust -if TYPE_CHECKING: - from collections.abc import Iterator - -__version__ = "6.3.1" +__version__ = "6.0.1" # Though this function does nothing, it lets linters know that we're using @@ -35,32 +21,6 @@ bad_codecs.ok() -class ExplanationStep(NamedTuple): - """ - A step in an ExplainedText, explaining how to decode text. - - The possible actions are: - - - "encode": take in a string and encode it as bytes, with the given encoding - - "decode": take in bytes and decode them as a string, with the given encoding - - "transcode": convert bytes to bytes with a particular named function - - "apply": convert str to str with a particular named function - - The `parameter` is the name of the encoding or function to use. If it's a - function, it must appear in the FIXERS dictionary. - """ - - action: str - parameter: str - - def __repr__(self) -> str: - """ - Get the string representation of an ExplanationStep. We output the - representation of the equivalent tuple, for simplicity. - """ - return repr(tuple(self)) - - class ExplainedText(NamedTuple): """ The return type from ftfy's functions that provide an "explanation" of which @@ -69,26 +29,8 @@ class ExplainedText(NamedTuple): When the 'explain' option is disabled, these functions return the same type, but the `explanation` will be None. """ - text: str - explanation: list[ExplanationStep] | None - - -# Functions that can be applied using `apply_plan`. -FIXERS: dict[str, Callable] = { # type: ignore[type-arg] - "unescape_html": fixes.unescape_html, - "remove_terminal_escapes": fixes.remove_terminal_escapes, - "restore_byte_a0": fixes.restore_byte_a0, - "replace_lossy_sequences": fixes.replace_lossy_sequences, - "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8, - "fix_c1_controls": fixes.fix_c1_controls, - "fix_latin_ligatures": fixes.fix_latin_ligatures, - "fix_character_width": fixes.fix_character_width, - "uncurl_quotes": fixes.uncurl_quotes, - "fix_line_breaks": fixes.fix_line_breaks, - "fix_surrogates": fixes.fix_surrogates, - "remove_control_chars": fixes.remove_control_chars, -} + explanation: Optional[List[Tuple[str, str]]] class TextFixerConfig(NamedTuple): @@ -211,8 +153,7 @@ class TextFixerConfig(NamedTuple): Functions that accept TextFixerConfig and don't return an explanation will automatically set `explain` to False. """ - - unescape_html: str | bool = "auto" + unescape_html: Union[str, bool] = "auto" remove_terminal_escapes: bool = True fix_encoding: bool = True restore_byte_a0: bool = True @@ -225,26 +166,25 @@ class TextFixerConfig(NamedTuple): fix_line_breaks: bool = True fix_surrogates: bool = True remove_control_chars: bool = True - normalization: Literal["NFC", "NFD", "NFKC", "NFKD"] | None = "NFC" + normalization: Optional[str] = "NFC" max_decode_length: int = 1000000 explain: bool = True -def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> TextFixerConfig: - """ - Handle parameters provided as keyword arguments to ftfy's top-level - functions, converting them into a TextFixerConfig. - """ - if "fix_entities" in kwargs: - warnings.warn( - "`fix_entities` has been renamed to `unescape_html`", - DeprecationWarning, - stacklevel=2, - ) - kwargs = kwargs.copy() - kwargs["unescape_html"] = kwargs["fix_entities"] - del kwargs["fix_entities"] - return config._replace(**kwargs) +FIXERS = { + "unescape_html": fixes.unescape_html, + "remove_terminal_escapes": fixes.remove_terminal_escapes, + "restore_byte_a0": fixes.restore_byte_a0, + "replace_lossy_sequences": fixes.replace_lossy_sequences, + "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8, + "fix_c1_controls": fixes.fix_c1_controls, + "fix_latin_ligatures": fixes.fix_latin_ligatures, + "fix_character_width": fixes.fix_character_width, + "uncurl_quotes": fixes.uncurl_quotes, + "fix_line_breaks": fixes.fix_line_breaks, + "fix_surrogates": fixes.fix_surrogates, + "remove_control_chars": fixes.remove_control_chars, +} BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode. @@ -268,10 +208,7 @@ def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> Text def _try_fix( - fixer_name: str, - text: str, - config: TextFixerConfig, - steps: list[ExplanationStep] | None, + fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list] ) -> str: """ A helper function used across several 'fixer' steps, deciding whether to @@ -281,13 +218,13 @@ def _try_fix( fixer = FIXERS[fixer_name] fixed = fixer(text) if steps is not None and fixed != text: - steps.append(ExplanationStep("apply", fixer_name)) - return cast(str, fixed) + steps.append(("apply", fixer_name)) + return fixed return text -def fix_text(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: +def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str: r""" Given Unicode text as input, fix inconsistencies and glitches in it, such as mojibake (text that was decoded in the wrong encoding). @@ -338,8 +275,9 @@ def fix_text(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> """ if config is None: - config = TextFixerConfig(explain=False) - config = _config_from_kwargs(config, kwargs) + config = TextFixerConfig() + config = config._replace(**kwargs) + config = config._replace(explain=False) if isinstance(text, bytes): raise UnicodeError(BYTES_ERROR_TEXT) @@ -362,7 +300,7 @@ def fix_text(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> def fix_and_explain( - text: str, config: TextFixerConfig | None = None, **kwargs: Any + text: str, config: Optional[TextFixerConfig] = None, **kwargs ) -> ExplainedText: """ Fix text as a single segment, returning the fixed text and an explanation @@ -375,13 +313,13 @@ def fix_and_explain( config = TextFixerConfig() if isinstance(text, bytes): raise UnicodeError(BYTES_ERROR_TEXT) - config = _config_from_kwargs(config, kwargs) + config = config._replace(**kwargs) if config.unescape_html == "auto" and "<" in text: config = config._replace(unescape_html=False) if config.explain: - steps: list[ExplanationStep] | None = [] + steps: Optional[List[Tuple[str, str]]] = [] else: # If explanations aren't desired, `steps` will be None steps = None @@ -396,8 +334,7 @@ def fix_and_explain( text = fix_encoding(text) else: text, encoding_steps = fix_encoding_and_explain(text, config) - if encoding_steps is not None: - steps.extend(encoding_steps) + steps.extend(encoding_steps) for fixer in [ "fix_c1_controls", @@ -414,7 +351,7 @@ def fix_and_explain( if config.normalization is not None: fixed = unicodedata.normalize(config.normalization, text) if steps is not None and fixed != text: - steps.append(ExplanationStep("normalize", config.normalization)) + steps.append(("normalize", config.normalization)) text = fixed if text == origtext: @@ -422,7 +359,7 @@ def fix_and_explain( def fix_encoding_and_explain( - text: str, config: TextFixerConfig | None = None, **kwargs: Any + text: str, config: Optional[TextFixerConfig] = None, **kwargs ) -> ExplainedText: """ Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed @@ -448,24 +385,25 @@ def fix_encoding_and_explain( config = TextFixerConfig() if isinstance(text, bytes): raise UnicodeError(BYTES_ERROR_TEXT) - config = _config_from_kwargs(config, kwargs) + config = config._replace(**kwargs) if not config.fix_encoding: # A weird trivial case: we're asked to fix the encoding, but skip # fixing the encoding return ExplainedText(text, []) - plan_so_far: list[ExplanationStep] = [] + plan_so_far: List[Tuple[str, str]] = [] while True: prevtext = text text, plan = _fix_encoding_one_step_and_explain(text, config) - if plan is not None: - plan_so_far.extend(plan) + plan_so_far.extend(plan) if text == prevtext: return ExplainedText(text, plan_so_far) -def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> ExplainedText: +def _fix_encoding_one_step_and_explain( + text: str, config: TextFixerConfig +) -> ExplainedText: """ Perform one step of fixing the encoding of text. """ @@ -492,7 +430,7 @@ def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> Ex if chardata.possible_encoding(text, encoding): possible_1byte_encodings.append(encoding) encoded_bytes = text.encode(encoding) - encode_step = ExplanationStep("encode", encoding) + encode_step = ("encode", encoding) transcode_steps = [] # Now, find out if it's UTF-8 (or close enough). Otherwise, @@ -501,33 +439,26 @@ def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> Ex decoding = "utf-8" # Check encoded_bytes for sequences that would be UTF-8, # except they have b' ' where b'\xa0' would belong. - # - # Don't do this in the macroman encoding, where it would match - # an en dash followed by a space, leading to false positives. - if ( - config.restore_byte_a0 - and encoding != "macroman" - and chardata.ALTERED_UTF8_RE.search(encoded_bytes) + if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search( + encoded_bytes ): replaced_bytes = fixes.restore_byte_a0(encoded_bytes) if replaced_bytes != encoded_bytes: - transcode_steps.append(ExplanationStep("transcode", "restore_byte_a0")) + transcode_steps.append(("transcode", "restore_byte_a0")) encoded_bytes = replaced_bytes # Replace sequences where information has been lost if config.replace_lossy_sequences and encoding.startswith("sloppy"): replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes) if replaced_bytes != encoded_bytes: - transcode_steps.append( - ExplanationStep("transcode", "replace_lossy_sequences") - ) + transcode_steps.append(("transcode", "replace_lossy_sequences")) encoded_bytes = replaced_bytes if 0xED in encoded_bytes or 0xC0 in encoded_bytes: decoding = "utf-8-variants" - decode_step = ExplanationStep("decode", decoding) - steps = [encode_step, *transcode_steps, decode_step] + decode_step = ("decode", decoding) + steps = [encode_step] + transcode_steps + [decode_step] fixed = encoded_bytes.decode(decoding) return ExplainedText(fixed, steps) @@ -536,7 +467,7 @@ def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> Ex # Look for a-hat-euro sequences that remain, and fix them in isolation. if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text): - steps = [ExplanationStep("apply", "decode_inconsistent_utf8")] + steps = [("apply", "decode_inconsistent_utf8")] fixed = fixes.decode_inconsistent_utf8(text) if fixed != text: return ExplainedText(fixed, steps) @@ -556,17 +487,14 @@ def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> Ex try: fixed = text.encode("latin-1").decode("windows-1252") if fixed != text: - steps = [ - ExplanationStep("encode", "latin-1"), - ExplanationStep("decode", "windows-1252"), - ] + steps = [("encode", "latin-1"), ("decode", "windows-1252")] return ExplainedText(fixed, steps) except UnicodeDecodeError: pass # Fix individual characters of Latin-1 with a less satisfying explanation if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text): - steps = [ExplanationStep("transcode", "fix_c1_controls")] + steps = [("transcode", "fix_c1_controls")] fixed = fixes.fix_c1_controls(text) return ExplainedText(fixed, steps) @@ -579,7 +507,7 @@ def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> Ex return ExplainedText(text, []) -def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: +def fix_encoding(text: str, config: TextFixerConfig = None, **kwargs): """ Apply just the encoding-fixing steps of ftfy to this text. Returns the fixed text, discarding the explanation. @@ -591,7 +519,7 @@ def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any """ if config is None: config = TextFixerConfig(explain=False) - config = _config_from_kwargs(config, kwargs) + config = config._replace(**kwargs) fixed, _explan = fix_encoding_and_explain(text, config) return fixed @@ -600,24 +528,19 @@ def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any ftfy = fix_text -def fix_text_segment(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str: +def fix_text_segment(text: str, config: TextFixerConfig = None, **kwargs): """ Fix text as a single segment, with a consistent sequence of steps that are applied to fix the text. Discard the explanation. """ if config is None: config = TextFixerConfig(explain=False) - config = _config_from_kwargs(config, kwargs) + config = config._replace(**kwargs) fixed, _explan = fix_and_explain(text, config) return fixed -def fix_file( - input_file: TextIO | BinaryIO, - encoding: str | None = None, - config: TextFixerConfig | None = None, - **kwargs: Any, -) -> Iterator[str]: +def fix_file(input_file, encoding=None, config=None, **kwargs): """ Fix text that is found in a file. @@ -630,7 +553,7 @@ def fix_file( """ if config is None: config = TextFixerConfig() - config = _config_from_kwargs(config, kwargs) + config = config._replace(**kwargs) for line in input_file: if isinstance(line, bytes): @@ -645,7 +568,7 @@ def fix_file( yield fixed_line -def guess_bytes(bstring: bytes) -> tuple[str, str]: +def guess_bytes(bstring): """ NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy is not designed to be an encoding detector. @@ -671,13 +594,12 @@ def guess_bytes(bstring: bytes) -> tuple[str, str]: single-byte encoding. """ if isinstance(bstring, str): - msg = ( + raise UnicodeError( "This string was already decoded as Unicode. You should pass " "bytes to guess_bytes, not Unicode." ) - raise UnicodeError(msg) - if bstring.startswith((b"\xfe\xff", b"\xff\xfe")): + if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"): return bstring.decode("utf-16"), "utf-16" byteset = set(bstring) @@ -717,7 +639,7 @@ def guess_bytes(bstring: bytes) -> tuple[str, str]: return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252" -def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: +def apply_plan(text: str, plan: List[Tuple[str, str]]): """ Apply a plan for fixing the encoding of text. @@ -744,23 +666,21 @@ def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: obj = text for operation, encoding in plan: if operation == "encode": - obj = obj.encode(encoding) # type: ignore + obj = obj.encode(encoding) elif operation == "decode": - obj = obj.decode(encoding) # type: ignore + obj = obj.decode(encoding) elif operation in ("transcode", "apply"): if encoding in FIXERS: obj = FIXERS[encoding](obj) else: - msg = f"Unknown function to apply: {encoding}" - raise ValueError(msg) + raise ValueError("Unknown function to apply: %s" % encoding) else: - msg = f"Unknown plan step: {operation}" - raise ValueError(msg) + raise ValueError("Unknown plan step: %s" % operation) return obj -def explain_unicode(text: str) -> None: +def explain_unicode(text: str): """ A utility method that's useful for debugging mysterious Unicode. diff --git a/ftfy/bad_codecs/__init__.py b/ftfy/bad_codecs/__init__.py index a449a38e..c5486bd5 100644 --- a/ftfy/bad_codecs/__init__.py +++ b/ftfy/bad_codecs/__init__.py @@ -1,5 +1,5 @@ r""" -The `ftfy.bad_codecs` module gives Python the ability to decode some common, +The `ftfy.bad_codecs` module gives Python the ability to decode some common, flawed encodings. Python does not want you to be sloppy with your text. Its encoders and decoders @@ -29,30 +29,24 @@ >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants')) 😍 """ - -import codecs from encodings import normalize_encoding -from typing import Optional +import codecs +from typing import Dict -_CACHE: dict[str, codecs.CodecInfo] = {} +_CACHE: Dict[str, codecs.CodecInfo] = {} # Define some aliases for 'utf-8-variants'. All hyphens get turned into # underscores, because of `normalize_encoding`. UTF8_VAR_NAMES = ( - "utf_8_variants", - "utf8_variants", - "utf_8_variant", - "utf8_variant", - "utf_8_var", - "utf8_var", - "cesu_8", - "cesu8", - "java_utf_8", - "java_utf8", + 'utf_8_variants', 'utf8_variants', + 'utf_8_variant', 'utf8_variant', + 'utf_8_var', 'utf8_var', + 'cesu_8', 'cesu8', + 'java_utf_8', 'java_utf8' ) -def search_function(encoding: str) -> Optional[codecs.CodecInfo]: +def search_function(encoding): """ Register our "bad codecs" with Python's codecs API. This involves adding a search function that takes in an encoding name, and returns a codec @@ -73,11 +67,9 @@ def search_function(encoding: str) -> Optional[codecs.CodecInfo]: codec = None if norm_encoding in UTF8_VAR_NAMES: from ftfy.bad_codecs.utf8_variants import CODEC_INFO - codec = CODEC_INFO - elif norm_encoding.startswith("sloppy_"): + elif norm_encoding.startswith('sloppy_'): from ftfy.bad_codecs.sloppy import CODECS - codec = CODECS.get(norm_encoding) if codec is not None: @@ -86,7 +78,7 @@ def search_function(encoding: str) -> Optional[codecs.CodecInfo]: return codec -def ok() -> None: +def ok(): """ A feel-good function that gives you something to call after importing this package. diff --git a/ftfy/bad_codecs/sloppy.py b/ftfy/bad_codecs/sloppy.py index 8c65e4fe..0503a55f 100644 --- a/ftfy/bad_codecs/sloppy.py +++ b/ftfy/bad_codecs/sloppy.py @@ -71,16 +71,15 @@ U+0081 \x81 [Cc] U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK """ - -from __future__ import annotations - import codecs from encodings import normalize_encoding +import sys -REPLACEMENT_CHAR = "\ufffd" +REPLACEMENT_CHAR = '\ufffd' +PY26 = sys.version_info[:2] == (2, 6) -def make_sloppy_codec(encoding: str) -> codecs.CodecInfo: +def make_sloppy_codec(encoding): """ Take a codec name, and return a 'sloppy' version of that codec that can encode and decode the unassigned bytes in that encoding. @@ -94,11 +93,14 @@ def make_sloppy_codec(encoding: str) -> codecs.CodecInfo: all_bytes = bytes(range(256)) # Get a list of what they would decode to in Latin-1. - sloppy_chars = list(all_bytes.decode("latin-1")) + sloppy_chars = list(all_bytes.decode('latin-1')) # Get a list of what they decode to in the given encoding. Use the # replacement character for unassigned bytes. - decoded_chars = all_bytes.decode(encoding, errors="replace") + if PY26: + decoded_chars = all_bytes.decode(encoding, 'replace') + else: + decoded_chars = all_bytes.decode(encoding, errors='replace') # Update the sloppy_chars list. Each byte that was successfully decoded # gets its decoded value in the list. The unassigned bytes are left as @@ -109,30 +111,30 @@ def make_sloppy_codec(encoding: str) -> codecs.CodecInfo: # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute" # control code, to encode the Unicode replacement character U+FFFD. - sloppy_chars[0x1A] = REPLACEMENT_CHAR + sloppy_chars[0x1a] = REPLACEMENT_CHAR # Create the data structures that tell the charmap methods how to encode # and decode in this sloppy encoding. - decoding_table = "".join(sloppy_chars) + decoding_table = ''.join(sloppy_chars) encoding_table = codecs.charmap_build(decoding_table) # Now produce all the class boilerplate. Look at the Python source for # `encodings.cp1252` for comparison; this is almost exactly the same, # except I made it follow pep8. class Codec(codecs.Codec): - def encode(self, input: str, errors: str | None = "strict") -> tuple[bytes, int]: + def encode(self, input, errors='strict'): return codecs.charmap_encode(input, errors, encoding_table) - def decode(self, input: bytes, errors: str | None = "strict") -> tuple[str, int]: - return codecs.charmap_decode(input, errors, decoding_table) # type: ignore[arg-type] + def decode(self, input, errors='strict'): + return codecs.charmap_decode(input, errors, decoding_table) class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input: str, final: bool = False) -> bytes: + def encode(self, input, final=False): return codecs.charmap_encode(input, self.errors, encoding_table)[0] class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input: bytes, final: bool = False) -> str: # type: ignore[override] - return codecs.charmap_decode(input, self.errors, decoding_table)[0] # type: ignore[arg-type] + def decode(self, input, final=False): + return codecs.charmap_decode(input, self.errors, decoding_table)[0] class StreamWriter(Codec, codecs.StreamWriter): pass @@ -141,9 +143,9 @@ class StreamReader(Codec, codecs.StreamReader): pass return codecs.CodecInfo( - name="sloppy-" + encoding, + name='sloppy-' + encoding, encode=Codec().encode, - decode=Codec().decode, # type: ignore[arg-type] + decode=Codec().decode, incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, @@ -155,12 +157,11 @@ class StreamReader(Codec, codecs.StreamReader): # can be used by the main module of ftfy.bad_codecs. CODECS = {} INCOMPLETE_ENCODINGS = ( - [f"windows-{num}" for num in range(1250, 1259)] - + [f"iso-8859-{num}" for num in (3, 6, 7, 8, 11)] - + [f"cp{num}" for num in range(1250, 1259)] - + ["cp874"] + ['windows-%s' % num for num in range(1250, 1259)] + + ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] + + ['cp%s' % num for num in range(1250, 1259)] + ['cp874'] ) for _encoding in INCOMPLETE_ENCODINGS: - _new_name = normalize_encoding("sloppy-" + _encoding) + _new_name = normalize_encoding('sloppy-' + _encoding) CODECS[_new_name] = make_sloppy_codec(_encoding) diff --git a/ftfy/bad_codecs/utf8_variants.py b/ftfy/bad_codecs/utf8_variants.py index eaac3c14..566d2ee6 100644 --- a/ftfy/bad_codecs/utf8_variants.py +++ b/ftfy/bad_codecs/utf8_variants.py @@ -39,46 +39,42 @@ ftfy instead. """ -import codecs import re -from encodings.utf_8 import ( - IncrementalDecoder as UTF8IncrementalDecoder, -) -from encodings.utf_8 import ( - IncrementalEncoder as UTF8IncrementalEncoder, -) -from typing import Callable, Optional +import codecs +from typing import Tuple +from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder, + IncrementalEncoder as UTF8IncrementalEncoder) -NAME = "utf-8-variants" +NAME = 'utf-8-variants' # This regular expression matches all possible six-byte CESU-8 sequences, # plus truncations of them at the end of the string. (If any of the # subgroups matches $, then all the subgroups after it also have to match $, # as there are no more characters to match.) CESU8_EXPR = ( - b"(" - b"\xed" - b"([\xa0-\xaf]|$)" - b"([\x80-\xbf]|$)" - b"(\xed|$)" - b"([\xb0-\xbf]|$)" - b"([\x80-\xbf]|$)" - b")" + b'(' + b'\xed' + b'([\xa0-\xaf]|$)' + b'([\x80-\xbf]|$)' + b'(\xed|$)' + b'([\xb0-\xbf]|$)' + b'([\x80-\xbf]|$)' + b')' ) CESU8_RE = re.compile(CESU8_EXPR) # This expression matches isolated surrogate characters that aren't # CESU-8, which have to be handled carefully on Python 2. -SURROGATE_EXPR = b"(\xed([\xa0-\xbf]|$)([\x80-\xbf]|$))" +SURROGATE_EXPR = (b'(\xed([\xa0-\xbf]|$)([\x80-\xbf]|$))') # This expression matches the Java encoding of U+0, including if it's # truncated and we need more bytes. -NULL_EXPR = b"(\xc0(\x80|$))" +NULL_EXPR = b'(\xc0(\x80|$))' # This regex matches cases that we need to decode differently from # standard UTF-8. -SPECIAL_BYTES_RE = re.compile(b"|".join([NULL_EXPR, CESU8_EXPR, SURROGATE_EXPR])) +SPECIAL_BYTES_RE = re.compile(b'|'.join([NULL_EXPR, CESU8_EXPR, SURROGATE_EXPR])) class IncrementalDecoder(UTF8IncrementalDecoder): @@ -91,11 +87,7 @@ class IncrementalDecoder(UTF8IncrementalDecoder): the real UTF-8 decoder is way optimized, but to call specialized methods we define here for the cases the real encoder isn't expecting. """ - - @staticmethod - def _buffer_decode( # type: ignore[override] - input: bytes, errors: Optional[str], final: bool - ) -> tuple[str, int]: + def _buffer_decode(self, input, errors, final): """ Decode bytes that may be arriving in a stream, following the Codecs API. @@ -117,8 +109,10 @@ def _buffer_decode( # type: ignore[override] position = 0 while True: # Use _buffer_decode_step to decode a segment of text. - decoded, consumed = IncrementalDecoder._buffer_decode_step( - input[position:], errors, final + decoded, consumed = self._buffer_decode_step( + input[position:], + errors, + final ) if consumed == 0: # Either there's nothing left to decode, or we need to wait @@ -134,10 +128,9 @@ def _buffer_decode( # type: ignore[override] # true. assert position == len(input) - return "".join(decoded_segments), position + return ''.join(decoded_segments), position - @staticmethod - def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> tuple[str, int]: + def _buffer_decode_step(self, input, errors, final): """ There are three possibilities for each decoding step: @@ -162,26 +155,24 @@ def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> tup # Some byte sequence that we intend to handle specially matches # at the beginning of the input. - if input.startswith(b"\xc0"): + if input.startswith(b'\xc0'): if len(input) > 1: # Decode the two-byte sequence 0xc0 0x80. - return "\u0000", 2 - if final: - # We hit the end of the stream. Let the superclass method - # handle it. - return sup(input, errors, True) - # Wait to see another byte. - return "", 0 - # Decode a possible six-byte sequence starting with 0xed. - return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final) + return '\u0000', 2 + else: + if final: + # We hit the end of the stream. Let the superclass method + # handle it. + return sup(input, errors, True) + else: + # Wait to see another byte. + return '', 0 + else: + # Decode a possible six-byte sequence starting with 0xed. + return self._buffer_decode_surrogates(sup, input, errors, final) @staticmethod - def _buffer_decode_surrogates( - sup: Callable[[bytes, Optional[str], bool], tuple[str, int]], - input: bytes, - errors: Optional[str], - final: bool, - ) -> tuple[str, int]: + def _buffer_decode_surrogates(sup, input, errors, final): """ When we have improperly encoded surrogates, we can still see the bits that they were meant to represent. @@ -202,47 +193,56 @@ def _buffer_decode_surrogates( # handle it as normal UTF-8. It might be a Hangul character # or an error. return sup(input, errors, final) - # We found a surrogate, the stream isn't over yet, and we don't - # know enough of the following bytes to decode anything, so - # consume zero bytes and wait. - return "", 0 - if CESU8_RE.match(input): - # Given this is a CESU-8 sequence, do some math to pull out - # the intended 20-bit value, and consume six bytes. - codepoint = ( - ((input[1] & 0x0F) << 16) - + ((input[2] & 0x3F) << 10) - + ((input[4] & 0x0F) << 6) - + (input[5] & 0x3F) - + 0x10000 - ) - return chr(codepoint), 6 - # This looked like a CESU-8 sequence, but it wasn't one. - # 0xed indicates the start of a three-byte sequence, so give - # three bytes to the superclass to decode as usual. - return sup(input[:3], errors, False) + else: + # We found a surrogate, the stream isn't over yet, and we don't + # know enough of the following bytes to decode anything, so + # consume zero bytes and wait. + return '', 0 + else: + if CESU8_RE.match(input): + # Given this is a CESU-8 sequence, do some math to pull out + # the intended 20-bit value, and consume six bytes. + codepoint = ( + ((input[1] & 0x0f) << 16) + + ((input[2] & 0x3f) << 10) + + ((input[4] & 0x0f) << 6) + + (input[5] & 0x3f) + + 0x10000 + ) + return chr(codepoint), 6 + else: + # This looked like a CESU-8 sequence, but it wasn't one. + # 0xed indicates the start of a three-byte sequence, so give + # three bytes to the superclass to decode as usual. + return sup(input[:3], errors, False) # The encoder is identical to UTF-8. IncrementalEncoder = UTF8IncrementalEncoder +# Everything below here is boilerplate that matches the modules in the +# built-in `encodings` package. +def encode(input, errors='strict'): + return IncrementalEncoder(errors).encode(input, final=True), len(input) + + +def decode(input, errors='strict'): + return IncrementalDecoder(errors).decode(input, final=True), len(input) + + class StreamWriter(codecs.StreamWriter): - @staticmethod - def encode(input: str, errors: str = "strict") -> tuple[bytes, int]: - return IncrementalEncoder(errors).encode(input, final=True), len(input) + encode = encode class StreamReader(codecs.StreamReader): - @staticmethod - def decode(input: bytes, errors: str = "strict") -> tuple[str, int]: - return IncrementalDecoder(errors).decode(input, final=True), len(input) + decode = decode CODEC_INFO = codecs.CodecInfo( name=NAME, - encode=StreamWriter.encode, - decode=StreamReader.decode, # type: ignore[arg-type] + encode=encode, + decode=decode, incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, diff --git a/ftfy/badness.py b/ftfy/badness.py index 38ec1f44..ce44be86 100644 --- a/ftfy/badness.py +++ b/ftfy/badness.py @@ -14,9 +14,10 @@ import warnings import re +from ftfy import chardata -# There are only a few hundred characters that occur in known UTF-8 mojibake, and we can +# There are only 403 characters that occur in known UTF-8 mojibake, and we can # characterize them: MOJIBAKE_CATEGORIES = { @@ -42,6 +43,8 @@ "\N{DIAERESIS}" "\N{NOT SIGN}" "\N{MACRON}" + "\N{PILCROW SIGN}" + "\N{SECTION SIGN}" "\N{CEDILLA}" "\N{LATIN SMALL LETTER F WITH HOOK}" "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier @@ -60,11 +63,6 @@ "\N{FEMININE ORDINAL INDICATOR}" "\N{MASCULINE ORDINAL INDICATOR}" ), - # Characters used in legalese - "law": ( - "\N{PILCROW SIGN}" - "\N{SECTION SIGN}" - ), "currency": ( "\N{CENT SIGN}" "\N{POUND SIGN}" @@ -135,9 +133,6 @@ "ò-ö" "ø-ü" "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" - "\N{LATIN CAPITAL LETTER O WITH MACRON}" - "\N{LATIN CAPITAL LETTER U WITH MACRON}" - "\N{LATIN CAPITAL LETTER U WITH OGONEK}" "\N{DEGREE SIGN}" ), "upper_accented": ( @@ -149,7 +144,6 @@ "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" "\N{LATIN CAPITAL LETTER A WITH BREVE}" - "\N{LATIN CAPITAL LETTER A WITH MACRON}" "\N{LATIN CAPITAL LETTER A WITH OGONEK}" "\N{LATIN CAPITAL LETTER C WITH ACUTE}" "\N{LATIN CAPITAL LETTER C WITH CARON}" @@ -157,20 +151,13 @@ "\N{LATIN CAPITAL LETTER D WITH STROKE}" "\N{LATIN CAPITAL LETTER E WITH OGONEK}" "\N{LATIN CAPITAL LETTER E WITH CARON}" - "\N{LATIN CAPITAL LETTER E WITH MACRON}" - "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}" "\N{LATIN CAPITAL LETTER G WITH BREVE}" - "\N{LATIN CAPITAL LETTER G WITH CEDILLA}" "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" - "\N{LATIN CAPITAL LETTER I WITH MACRON}" - "\N{LATIN CAPITAL LETTER K WITH CEDILLA}" "\N{LATIN CAPITAL LETTER L WITH ACUTE}" "\N{LATIN CAPITAL LETTER L WITH CARON}" "\N{LATIN CAPITAL LETTER L WITH STROKE}" - "\N{LATIN CAPITAL LETTER L WITH CEDILLA}" "\N{LATIN CAPITAL LETTER N WITH ACUTE}" "\N{LATIN CAPITAL LETTER N WITH CARON}" - "\N{LATIN CAPITAL LETTER N WITH CEDILLA}" "\N{LATIN CAPITAL LIGATURE OE}" "\N{LATIN CAPITAL LETTER R WITH CARON}" "\N{LATIN CAPITAL LETTER S WITH ACUTE}" @@ -193,31 +180,22 @@ # skip o's and u's that could be used in kaomoji "\N{LATIN SMALL LETTER A WITH BREVE}" "\N{LATIN SMALL LETTER A WITH OGONEK}" - "\N{LATIN SMALL LETTER A WITH MACRON}" "\N{LATIN SMALL LETTER C WITH ACUTE}" "\N{LATIN SMALL LETTER C WITH CARON}" "\N{LATIN SMALL LETTER D WITH CARON}" "\N{LATIN SMALL LETTER D WITH STROKE}" "\N{LATIN SMALL LETTER E WITH OGONEK}" "\N{LATIN SMALL LETTER E WITH CARON}" - "\N{LATIN SMALL LETTER E WITH MACRON}" - "\N{LATIN SMALL LETTER E WITH DOT ABOVE}" "\N{LATIN SMALL LETTER G WITH BREVE}" - "\N{LATIN SMALL LETTER G WITH CEDILLA}" - "\N{LATIN SMALL LETTER I WITH OGONEK}" - "\N{LATIN SMALL LETTER I WITH MACRON}" - "\N{LATIN SMALL LETTER K WITH CEDILLA}" "\N{LATIN SMALL LETTER L WITH ACUTE}" "\N{LATIN SMALL LETTER L WITH CARON}" "\N{LATIN SMALL LETTER L WITH STROKE}" - "\N{LATIN SMALL LETTER L WITH CEDILLA}" "\N{LATIN SMALL LIGATURE OE}" "\N{LATIN SMALL LETTER R WITH ACUTE}" "\N{LATIN SMALL LETTER S WITH ACUTE}" "\N{LATIN SMALL LETTER S WITH CEDILLA}" "\N{LATIN SMALL LETTER S WITH CARON}" "\N{LATIN SMALL LETTER T WITH CARON}" - "\N{LATIN SMALL LETTER U WITH DIAERESIS}" "\N{LATIN SMALL LETTER Z WITH ACUTE}" "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" "\N{LATIN SMALL LETTER Z WITH CARON}" @@ -275,25 +253,27 @@ r""" [{c1}] | - [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] [{bad}] + [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}] | [a-zA-Z] [{lower_common}{upper_common}] [{bad}] | - [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] + [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] | [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}] | [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}] | + # leave out [upper_accented][currency] without further info, because it's used in some + # fancy leetspeak-esque writing [{lower_accented}{box}{end_punctuation}] [{currency}] | \s [{upper_accented}] [{currency}] | - [{upper_accented}{box}] [{numeric}{law}] + [{upper_accented}{box}] [{numeric}] | [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}] | - [{lower_accented}{upper_accented}{currency}{numeric}{box}{law}] [{end_punctuation}] [{start_punctuation}] + [{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}] | [{currency}{numeric}{box}] [{start_punctuation}] | @@ -301,23 +281,19 @@ | [{box}] [{kaomoji}] | - [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}{law}] [{box}] + [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}] | [{box}] [{end_punctuation}] | - [{lower_accented}{upper_accented}] [{start_punctuation}{end_punctuation}] \w + [{lower_accented}{upper_accented}] [{end_punctuation}] \\w | # The ligature œ when not followed by an unaccented Latin letter [Œœ][^A-Za-z] | - # Degree signs after capital letters - [{upper_accented}]° - | - # Common Windows-1252 2-character mojibake that isn't covered by the cases above - [ÂÃÎÐ][€œŠš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´] + [ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{end_punctuation}–—´] | × [²³] | @@ -364,7 +340,7 @@ | ^[ÃÂ][ ] | - + # Cases where  precedes a character as an encoding of exactly the same # character, and the character is common enough [a-z.,?!{end_punctuation}]  [ {start_punctuation}{end_punctuation}] @@ -376,18 +352,14 @@ # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ] - | - - # Windows-1257 mojibake of characters in the U+2000 range - †- """.format( +""".format( **MOJIBAKE_CATEGORIES ), re.VERBOSE, ) -def sequence_weirdness(text: str) -> int: +def sequence_weirdness(text): """ This was the name of the heuristic used in ftfy 2.x through 5.x. As an attempt at compatibility with external code that calls the heuristic @@ -400,7 +372,7 @@ def sequence_weirdness(text: str) -> int: return badness(text) -def badness(text: str) -> int: +def badness(text): """ Get the 'badness' of a sequence of text, counting the number of unlikely character sequences. A badness greater than 0 indicates that some of it @@ -409,7 +381,7 @@ def badness(text: str) -> int: return len(BADNESS_RE.findall(text)) -def is_bad(text: str) -> bool: +def is_bad(text): """ Returns true iff the given text looks like it contains mojibake. diff --git a/ftfy/chardata.py b/ftfy/chardata.py index 43d117c6..8be84a52 100644 --- a/ftfy/chardata.py +++ b/ftfy/chardata.py @@ -3,13 +3,12 @@ encodings that use them. """ -from __future__ import annotations - import html import itertools import re import unicodedata + # These are the encodings we will try to fix in ftfy, in the # order that they should be tried. CHARMAP_ENCODINGS = [ @@ -19,7 +18,6 @@ "sloppy-windows-1250", "sloppy-windows-1253", "sloppy-windows-1254", - "sloppy-windows-1257", "iso-8859-2", "macroman", "cp437", @@ -29,7 +27,7 @@ DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]") -def _build_regexes() -> dict[str, re.Pattern[str]]: +def _build_regexes(): """ ENCODING_REGEXES contain reasonably fast ways to detect if we could represent a given string in a given encoding. The simplest one is @@ -43,7 +41,7 @@ def _build_regexes() -> dict[str, re.Pattern[str]]: # Make a sequence of characters that bytes \x80 to \xFF decode to # in each encoding, as well as byte \x1A, which is used to represent # the replacement character � in the sloppy-* encodings. - byte_range = bytes([*range(0x80, 0x100), 0x1A]) + byte_range = bytes(list(range(0x80, 0x100)) + [0x1A]) charlist = byte_range.decode(encoding) # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B @@ -51,7 +49,7 @@ def _build_regexes() -> dict[str, re.Pattern[str]]: # support, so we can just include them as ranges. This also lets us # not worry about escaping regex special characters, because all of # them are in the \x1B to \x7F range. - regex = f"^[\x00-\x19\x1b-\x7f{charlist}]*$" + regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist) encoding_regexes[encoding] = re.compile(regex) return encoding_regexes @@ -59,12 +57,12 @@ def _build_regexes() -> dict[str, re.Pattern[str]]: ENCODING_REGEXES = _build_regexes() -def _build_html_entities() -> dict[str, str]: +def _build_html_entities(): entities = {} # Create a dictionary based on the built-in HTML5 entity dictionary. # Add a limited set of HTML entities that we'll also decode if they've # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ". - for name, char in html.entities.html5.items(): # type: ignore + for name, char in html.entities.html5.items(): if name.endswith(";"): entities["&" + name] = char @@ -83,7 +81,7 @@ def _build_html_entities() -> dict[str, str]: HTML_ENTITIES = _build_html_entities() -def possible_encoding(text: str, encoding: str) -> bool: +def possible_encoding(text, encoding): """ Given text and a single-byte encoding, check whether that text could have been decoded from that single-byte encoding. @@ -94,13 +92,13 @@ def possible_encoding(text: str, encoding: str) -> bool: return bool(ENCODING_REGEXES[encoding].match(text)) -def _build_control_char_mapping() -> dict[int, None]: +def _build_control_char_mapping(): """ Build a translate mapping that strips likely-unintended control characters. See :func:`ftfy.fixes.remove_control_chars` for a description of these codepoint ranges and why they should be removed. """ - control_chars: dict[int, None] = {} + control_chars = {} for i in itertools.chain( range(0x00, 0x09), @@ -230,7 +228,7 @@ def _build_control_char_mapping() -> dict[int, None]: } -def _build_width_map() -> dict[int, str]: +def _build_width_map(): """ Build a translate mapping that replaces halfwidth and fullwidth forms with their standard-width forms. @@ -252,410 +250,34 @@ def _build_width_map() -> dict[int, str]: # Character classes that help us pinpoint embedded mojibake. These can # include common characters, because we'll also check them for 'badness'. -# -# Though they go on for many lines, the members of this dictionary are -# single concatenated strings. -# -# This code is generated using scripts/char_data_table.py. -UTF8_CLUES: dict[str, str] = { +UTF8_CLUES = { # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding "utf8_first_of_2": ( - "\N{LATIN CAPITAL LETTER A WITH BREVE}" # windows-1250:C3 - "\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}" # latin-1:C2 - "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}" # latin-1:C4 - "\N{LATIN CAPITAL LETTER A WITH MACRON}" # windows-1257:C2 - "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}" # latin-1:C5 - "\N{LATIN CAPITAL LETTER A WITH TILDE}" # latin-1:C3 - "\N{LATIN CAPITAL LETTER AE}" # latin-1:C6 - "\N{LATIN CAPITAL LETTER C WITH ACUTE}" # windows-1250:C6 - "\N{LATIN CAPITAL LETTER C WITH CARON}" # windows-1250:C8 - "\N{LATIN CAPITAL LETTER C WITH CEDILLA}" # latin-1:C7 - "\N{LATIN CAPITAL LETTER D WITH CARON}" # windows-1250:CF - "\N{LATIN CAPITAL LETTER D WITH STROKE}" # windows-1250:D0 - "\N{LATIN CAPITAL LETTER E WITH ACUTE}" # latin-1:C9 - "\N{LATIN CAPITAL LETTER E WITH CARON}" # windows-1250:CC - "\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}" # latin-1:CA - "\N{LATIN CAPITAL LETTER E WITH DIAERESIS}" # latin-1:CB - "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}" # windows-1257:CB - "\N{LATIN CAPITAL LETTER E WITH GRAVE}" # latin-1:C8 - "\N{LATIN CAPITAL LETTER E WITH MACRON}" # windows-1257:C7 - "\N{LATIN CAPITAL LETTER E WITH OGONEK}" # windows-1250:CA - "\N{LATIN CAPITAL LETTER ETH}" # latin-1:D0 - "\N{LATIN CAPITAL LETTER G WITH BREVE}" # windows-1254:D0 - "\N{LATIN CAPITAL LETTER G WITH CEDILLA}" # windows-1257:CC - "\N{LATIN CAPITAL LETTER I WITH ACUTE}" # latin-1:CD - "\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}" # latin-1:CE - "\N{LATIN CAPITAL LETTER I WITH DIAERESIS}" # latin-1:CF - "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" # windows-1254:DD - "\N{LATIN CAPITAL LETTER I WITH GRAVE}" # latin-1:CC - "\N{LATIN CAPITAL LETTER I WITH MACRON}" # windows-1257:CE - "\N{LATIN CAPITAL LETTER K WITH CEDILLA}" # windows-1257:CD - "\N{LATIN CAPITAL LETTER L WITH ACUTE}" # windows-1250:C5 - "\N{LATIN CAPITAL LETTER L WITH CEDILLA}" # windows-1257:CF - "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1257:D9 - "\N{LATIN CAPITAL LETTER N WITH ACUTE}" # windows-1250:D1 - "\N{LATIN CAPITAL LETTER N WITH CARON}" # windows-1250:D2 - "\N{LATIN CAPITAL LETTER N WITH CEDILLA}" # windows-1257:D2 - "\N{LATIN CAPITAL LETTER N WITH TILDE}" # latin-1:D1 - "\N{LATIN CAPITAL LETTER O WITH ACUTE}" # latin-1:D3 - "\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}" # latin-1:D4 - "\N{LATIN CAPITAL LETTER O WITH DIAERESIS}" # latin-1:D6 - "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" # windows-1250:D5 - "\N{LATIN CAPITAL LETTER O WITH GRAVE}" # latin-1:D2 - "\N{LATIN CAPITAL LETTER O WITH MACRON}" # windows-1257:D4 - "\N{LATIN CAPITAL LETTER O WITH STROKE}" # latin-1:D8 - "\N{LATIN CAPITAL LETTER O WITH TILDE}" # latin-1:D5 - "\N{LATIN CAPITAL LETTER R WITH CARON}" # windows-1250:D8 - "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1257:DA - "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1257:D0 - "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1254:DE - "\N{LATIN CAPITAL LETTER T WITH CEDILLA}" # windows-1250:DE - "\N{LATIN CAPITAL LETTER THORN}" # latin-1:DE - "\N{LATIN CAPITAL LETTER U WITH ACUTE}" # latin-1:DA - "\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}" # latin-1:DB - "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" # latin-1:DC - "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}" # windows-1250:DB - "\N{LATIN CAPITAL LETTER U WITH GRAVE}" # latin-1:D9 - "\N{LATIN CAPITAL LETTER U WITH MACRON}" # windows-1257:DB - "\N{LATIN CAPITAL LETTER U WITH OGONEK}" # windows-1257:D8 - "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}" # windows-1250:D9 - "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" # latin-1:DD - "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1257:CA - "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1257:DE - "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1257:DD - "\N{LATIN SMALL LETTER SHARP S}" # latin-1:DF - "\N{MULTIPLICATION SIGN}" # latin-1:D7 - "\N{GREEK CAPITAL LETTER BETA}" # windows-1253:C2 - "\N{GREEK CAPITAL LETTER GAMMA}" # windows-1253:C3 - "\N{GREEK CAPITAL LETTER DELTA}" # windows-1253:C4 - "\N{GREEK CAPITAL LETTER EPSILON}" # windows-1253:C5 - "\N{GREEK CAPITAL LETTER ZETA}" # windows-1253:C6 - "\N{GREEK CAPITAL LETTER ETA}" # windows-1253:C7 - "\N{GREEK CAPITAL LETTER THETA}" # windows-1253:C8 - "\N{GREEK CAPITAL LETTER IOTA}" # windows-1253:C9 - "\N{GREEK CAPITAL LETTER KAPPA}" # windows-1253:CA - "\N{GREEK CAPITAL LETTER LAMDA}" # windows-1253:CB - "\N{GREEK CAPITAL LETTER MU}" # windows-1253:CC - "\N{GREEK CAPITAL LETTER NU}" # windows-1253:CD - "\N{GREEK CAPITAL LETTER XI}" # windows-1253:CE - "\N{GREEK CAPITAL LETTER OMICRON}" # windows-1253:CF - "\N{GREEK CAPITAL LETTER PI}" # windows-1253:D0 - "\N{GREEK CAPITAL LETTER RHO}" # windows-1253:D1 - "\N{GREEK CAPITAL LETTER SIGMA}" # windows-1253:D3 - "\N{GREEK CAPITAL LETTER TAU}" # windows-1253:D4 - "\N{GREEK CAPITAL LETTER UPSILON}" # windows-1253:D5 - "\N{GREEK CAPITAL LETTER PHI}" # windows-1253:D6 - "\N{GREEK CAPITAL LETTER CHI}" # windows-1253:D7 - "\N{GREEK CAPITAL LETTER PSI}" # windows-1253:D8 - "\N{GREEK CAPITAL LETTER OMEGA}" # windows-1253:D9 - "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}" # windows-1253:DA - "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}" # windows-1253:DB - "\N{GREEK SMALL LETTER ALPHA WITH TONOS}" # windows-1253:DC - "\N{GREEK SMALL LETTER EPSILON WITH TONOS}" # windows-1253:DD - "\N{GREEK SMALL LETTER ETA WITH TONOS}" # windows-1253:DE - "\N{GREEK SMALL LETTER IOTA WITH TONOS}" # windows-1253:DF - "\N{CYRILLIC CAPITAL LETTER VE}" # windows-1251:C2 - "\N{CYRILLIC CAPITAL LETTER GHE}" # windows-1251:C3 - "\N{CYRILLIC CAPITAL LETTER DE}" # windows-1251:C4 - "\N{CYRILLIC CAPITAL LETTER IE}" # windows-1251:C5 - "\N{CYRILLIC CAPITAL LETTER ZHE}" # windows-1251:C6 - "\N{CYRILLIC CAPITAL LETTER ZE}" # windows-1251:C7 - "\N{CYRILLIC CAPITAL LETTER I}" # windows-1251:C8 - "\N{CYRILLIC CAPITAL LETTER SHORT I}" # windows-1251:C9 - "\N{CYRILLIC CAPITAL LETTER KA}" # windows-1251:CA - "\N{CYRILLIC CAPITAL LETTER EL}" # windows-1251:CB - "\N{CYRILLIC CAPITAL LETTER EM}" # windows-1251:CC - "\N{CYRILLIC CAPITAL LETTER EN}" # windows-1251:CD - "\N{CYRILLIC CAPITAL LETTER O}" # windows-1251:CE - "\N{CYRILLIC CAPITAL LETTER PE}" # windows-1251:CF - "\N{CYRILLIC CAPITAL LETTER ER}" # windows-1251:D0 - "\N{CYRILLIC CAPITAL LETTER ES}" # windows-1251:D1 - "\N{CYRILLIC CAPITAL LETTER TE}" # windows-1251:D2 - "\N{CYRILLIC CAPITAL LETTER U}" # windows-1251:D3 - "\N{CYRILLIC CAPITAL LETTER EF}" # windows-1251:D4 - "\N{CYRILLIC CAPITAL LETTER HA}" # windows-1251:D5 - "\N{CYRILLIC CAPITAL LETTER TSE}" # windows-1251:D6 - "\N{CYRILLIC CAPITAL LETTER CHE}" # windows-1251:D7 - "\N{CYRILLIC CAPITAL LETTER SHA}" # windows-1251:D8 - "\N{CYRILLIC CAPITAL LETTER SHCHA}" # windows-1251:D9 - "\N{CYRILLIC CAPITAL LETTER HARD SIGN}" # windows-1251:DA - "\N{CYRILLIC CAPITAL LETTER YERU}" # windows-1251:DB - "\N{CYRILLIC CAPITAL LETTER SOFT SIGN}" # windows-1251:DC - "\N{CYRILLIC CAPITAL LETTER E}" # windows-1251:DD - "\N{CYRILLIC CAPITAL LETTER YU}" # windows-1251:DE - "\N{CYRILLIC CAPITAL LETTER YA}" # windows-1251:DF + "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ" + "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" ), # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding - "utf8_first_of_3": ( - "\N{LATIN SMALL LETTER A WITH ACUTE}" # latin-1:E1 - "\N{LATIN SMALL LETTER A WITH BREVE}" # windows-1250:E3 - "\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}" # latin-1:E2 - "\N{LATIN SMALL LETTER A WITH DIAERESIS}" # latin-1:E4 - "\N{LATIN SMALL LETTER A WITH GRAVE}" # latin-1:E0 - "\N{LATIN SMALL LETTER A WITH MACRON}" # windows-1257:E2 - "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1257:E0 - "\N{LATIN SMALL LETTER A WITH RING ABOVE}" # latin-1:E5 - "\N{LATIN SMALL LETTER A WITH TILDE}" # latin-1:E3 - "\N{LATIN SMALL LETTER AE}" # latin-1:E6 - "\N{LATIN SMALL LETTER C WITH ACUTE}" # windows-1250:E6 - "\N{LATIN SMALL LETTER C WITH CARON}" # windows-1250:E8 - "\N{LATIN SMALL LETTER C WITH CEDILLA}" # latin-1:E7 - "\N{LATIN SMALL LETTER D WITH CARON}" # windows-1250:EF - "\N{LATIN SMALL LETTER E WITH ACUTE}" # latin-1:E9 - "\N{LATIN SMALL LETTER E WITH CARON}" # windows-1250:EC - "\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}" # latin-1:EA - "\N{LATIN SMALL LETTER E WITH DIAERESIS}" # latin-1:EB - "\N{LATIN SMALL LETTER E WITH DOT ABOVE}" # windows-1257:EB - "\N{LATIN SMALL LETTER E WITH GRAVE}" # latin-1:E8 - "\N{LATIN SMALL LETTER E WITH MACRON}" # windows-1257:E7 - "\N{LATIN SMALL LETTER E WITH OGONEK}" # windows-1250:EA - "\N{LATIN SMALL LETTER E WITH OGONEK}" # windows-1250:EA - "\N{LATIN SMALL LETTER G WITH CEDILLA}" # windows-1257:EC - "\N{LATIN SMALL LETTER I WITH ACUTE}" # latin-1:ED - "\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}" # latin-1:EE - "\N{LATIN SMALL LETTER I WITH DIAERESIS}" # latin-1:EF - "\N{LATIN SMALL LETTER I WITH GRAVE}" # latin-1:EC - "\N{LATIN SMALL LETTER I WITH MACRON}" # windows-1257:EE - "\N{LATIN SMALL LETTER I WITH OGONEK}" # windows-1257:E1 - "\N{LATIN SMALL LETTER K WITH CEDILLA}" # windows-1257:ED - "\N{LATIN SMALL LETTER L WITH ACUTE}" # windows-1250:E5 - "\N{LATIN SMALL LETTER L WITH CEDILLA}" # windows-1257:EF - "\N{LATIN SMALL LETTER R WITH ACUTE}" # windows-1250:E0 - "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1257:EA - "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}" # windows-1253:E0 - "\N{GREEK SMALL LETTER ALPHA}" # windows-1253:E1 - "\N{GREEK SMALL LETTER BETA}" # windows-1253:E2 - "\N{GREEK SMALL LETTER GAMMA}" # windows-1253:E3 - "\N{GREEK SMALL LETTER DELTA}" # windows-1253:E4 - "\N{GREEK SMALL LETTER EPSILON}" # windows-1253:E5 - "\N{GREEK SMALL LETTER ZETA}" # windows-1253:E6 - "\N{GREEK SMALL LETTER ETA}" # windows-1253:E7 - "\N{GREEK SMALL LETTER THETA}" # windows-1253:E8 - "\N{GREEK SMALL LETTER IOTA}" # windows-1253:E9 - "\N{GREEK SMALL LETTER KAPPA}" # windows-1253:EA - "\N{GREEK SMALL LETTER LAMDA}" # windows-1253:EB - "\N{GREEK SMALL LETTER MU}" # windows-1253:EC - "\N{GREEK SMALL LETTER NU}" # windows-1253:ED - "\N{GREEK SMALL LETTER XI}" # windows-1253:EE - "\N{GREEK SMALL LETTER OMICRON}" # windows-1253:EF - "\N{CYRILLIC SMALL LETTER A}" # windows-1251:E0 - "\N{CYRILLIC SMALL LETTER BE}" # windows-1251:E1 - "\N{CYRILLIC SMALL LETTER VE}" # windows-1251:E2 - "\N{CYRILLIC SMALL LETTER GHE}" # windows-1251:E3 - "\N{CYRILLIC SMALL LETTER DE}" # windows-1251:E4 - "\N{CYRILLIC SMALL LETTER IE}" # windows-1251:E5 - "\N{CYRILLIC SMALL LETTER ZHE}" # windows-1251:E6 - "\N{CYRILLIC SMALL LETTER ZE}" # windows-1251:E7 - "\N{CYRILLIC SMALL LETTER I}" # windows-1251:E8 - "\N{CYRILLIC SMALL LETTER SHORT I}" # windows-1251:E9 - "\N{CYRILLIC SMALL LETTER KA}" # windows-1251:EA - "\N{CYRILLIC SMALL LETTER EL}" # windows-1251:EB - "\N{CYRILLIC SMALL LETTER EM}" # windows-1251:EC - "\N{CYRILLIC SMALL LETTER EN}" # windows-1251:ED - "\N{CYRILLIC SMALL LETTER O}" # windows-1251:EE - "\N{CYRILLIC SMALL LETTER PE}" # windows-1251:EF - ), + "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"), # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding. # (Other leading bytes correspond only to unassigned codepoints) - "utf8_first_of_4": ( - "\N{LATIN SMALL LETTER D WITH STROKE}" # windows-1250:F0 - "\N{LATIN SMALL LETTER ETH}" # latin-1:F0 - "\N{LATIN SMALL LETTER G WITH BREVE}" # windows-1254:F0 - "\N{LATIN SMALL LETTER O WITH ACUTE}" # latin-1:F3 - "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1257:F0 - "\N{GREEK SMALL LETTER PI}" # windows-1253:F0 - "\N{GREEK SMALL LETTER SIGMA}" # windows-1253:F3 - "\N{CYRILLIC SMALL LETTER ER}" # windows-1251:F0 - "\N{CYRILLIC SMALL LETTER U}" # windows-1251:F3 - ), + "utf8_first_of_4": ("ðóđğπσру"), # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, # including a space standing in for 0xA0 "utf8_continuation": ( "\x80-\xbf" - "\N{SPACE}" # modification of latin-1:A0, NO-BREAK SPACE - "\N{LATIN CAPITAL LETTER A WITH OGONEK}" # windows-1250:A5 - "\N{LATIN CAPITAL LETTER AE}" # windows-1257:AF - "\N{LATIN CAPITAL LETTER L WITH CARON}" # windows-1250:BC - "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1250:A3 - "\N{LATIN CAPITAL LETTER O WITH STROKE}" # windows-1257:A8 - "\N{LATIN CAPITAL LETTER R WITH CEDILLA}" # windows-1257:AA - "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1250:8C - "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1252:8A - "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1250:AA - "\N{LATIN CAPITAL LETTER T WITH CARON}" # windows-1250:8D - "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" # windows-1252:9F - "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1250:8F - "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1252:8E - "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1250:AF - "\N{LATIN CAPITAL LIGATURE OE}" # windows-1252:8C - "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1250:B9 - "\N{LATIN SMALL LETTER AE}" # windows-1257:BF - "\N{LATIN SMALL LETTER F WITH HOOK}" # windows-1252:83 - "\N{LATIN SMALL LETTER L WITH CARON}" # windows-1250:BE - "\N{LATIN SMALL LETTER L WITH STROKE}" # windows-1250:B3 - "\N{LATIN SMALL LETTER O WITH STROKE}" # windows-1257:B8 - "\N{LATIN SMALL LETTER R WITH CEDILLA}" # windows-1257:BA - "\N{LATIN SMALL LETTER S WITH ACUTE}" # windows-1250:9C - "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1252:9A - "\N{LATIN SMALL LETTER S WITH CEDILLA}" # windows-1250:BA - "\N{LATIN SMALL LETTER T WITH CARON}" # windows-1250:9D - "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1250:9F - "\N{LATIN SMALL LETTER Z WITH CARON}" # windows-1252:9E - "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" # windows-1250:BF - "\N{LATIN SMALL LIGATURE OE}" # windows-1252:9C - "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # windows-1252:88 - "\N{CARON}" # windows-1250:A1 - "\N{BREVE}" # windows-1250:A2 - "\N{OGONEK}" # windows-1250:B2 - "\N{SMALL TILDE}" # windows-1252:98 - "\N{DOUBLE ACUTE ACCENT}" # windows-1250:BD - "\N{GREEK TONOS}" # windows-1253:B4 - "\N{GREEK DIALYTIKA TONOS}" # windows-1253:A1 - "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" # windows-1253:A2 - "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" # windows-1253:B8 - "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" # windows-1253:B9 - "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" # windows-1253:BA - "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" # windows-1253:BC - "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" # windows-1253:BE - "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" # windows-1253:BF - "\N{CYRILLIC CAPITAL LETTER IO}" # windows-1251:A8 - "\N{CYRILLIC CAPITAL LETTER DJE}" # windows-1251:80 - "\N{CYRILLIC CAPITAL LETTER GJE}" # windows-1251:81 - "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}" # windows-1251:AA - "\N{CYRILLIC CAPITAL LETTER DZE}" # windows-1251:BD - "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B2 - "\N{CYRILLIC CAPITAL LETTER YI}" # windows-1251:AF - "\N{CYRILLIC CAPITAL LETTER JE}" # windows-1251:A3 - "\N{CYRILLIC CAPITAL LETTER LJE}" # windows-1251:8A - "\N{CYRILLIC CAPITAL LETTER NJE}" # windows-1251:8C - "\N{CYRILLIC CAPITAL LETTER TSHE}" # windows-1251:8E - "\N{CYRILLIC CAPITAL LETTER KJE}" # windows-1251:8D - "\N{CYRILLIC CAPITAL LETTER SHORT U}" # windows-1251:A1 - "\N{CYRILLIC CAPITAL LETTER DZHE}" # windows-1251:8F - "\N{CYRILLIC SMALL LETTER IO}" # windows-1251:B8 - "\N{CYRILLIC SMALL LETTER DJE}" # windows-1251:90 - "\N{CYRILLIC SMALL LETTER GJE}" # windows-1251:83 - "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}" # windows-1251:BA - "\N{CYRILLIC SMALL LETTER DZE}" # windows-1251:BE - "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B3 - "\N{CYRILLIC SMALL LETTER YI}" # windows-1251:BF - "\N{CYRILLIC SMALL LETTER JE}" # windows-1251:BC - "\N{CYRILLIC SMALL LETTER LJE}" # windows-1251:9A - "\N{CYRILLIC SMALL LETTER NJE}" # windows-1251:9C - "\N{CYRILLIC SMALL LETTER TSHE}" # windows-1251:9E - "\N{CYRILLIC SMALL LETTER KJE}" # windows-1251:9D - "\N{CYRILLIC SMALL LETTER SHORT U}" # windows-1251:A2 - "\N{CYRILLIC SMALL LETTER DZHE}" # windows-1251:9F - "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" # windows-1251:A5 - "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" # windows-1251:B4 - "\N{EN DASH}" # windows-1252:96 - "\N{EM DASH}" # windows-1252:97 - "\N{HORIZONTAL BAR}" # windows-1253:AF - "\N{LEFT SINGLE QUOTATION MARK}" # windows-1252:91 - "\N{RIGHT SINGLE QUOTATION MARK}" # windows-1252:92 - "\N{SINGLE LOW-9 QUOTATION MARK}" # windows-1252:82 - "\N{LEFT DOUBLE QUOTATION MARK}" # windows-1252:93 - "\N{RIGHT DOUBLE QUOTATION MARK}" # windows-1252:94 - "\N{DOUBLE LOW-9 QUOTATION MARK}" # windows-1252:84 - "\N{DAGGER}" # windows-1252:86 - "\N{DOUBLE DAGGER}" # windows-1252:87 - "\N{BULLET}" # windows-1252:95 - "\N{HORIZONTAL ELLIPSIS}" # windows-1252:85 - "\N{PER MILLE SIGN}" # windows-1252:89 - "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" # windows-1252:8B - "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" # windows-1252:9B - "\N{EURO SIGN}" # windows-1252:80 - "\N{NUMERO SIGN}" # windows-1251:B9 - "\N{TRADE MARK SIGN}" # windows-1252:99 + "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" + "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" + "–—―‘’‚“”„†‡•…‰‹›€№™" + " " ), # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, # and don't usually stand for themselves when adjacent to mojibake. - # This excludes spaces, dashes, 'bullet', quotation marks, and ellipses. + # This excludes spaces, dashes, quotation marks, and ellipses. "utf8_continuation_strict": ( "\x80-\xbf" - "\N{LATIN CAPITAL LETTER A WITH OGONEK}" # windows-1250:A5 - "\N{LATIN CAPITAL LETTER AE}" # windows-1257:AF - "\N{LATIN CAPITAL LETTER L WITH CARON}" # windows-1250:BC - "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1250:A3 - "\N{LATIN CAPITAL LETTER O WITH STROKE}" # windows-1257:A8 - "\N{LATIN CAPITAL LETTER R WITH CEDILLA}" # windows-1257:AA - "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1250:8C - "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1252:8A - "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1250:AA - "\N{LATIN CAPITAL LETTER T WITH CARON}" # windows-1250:8D - "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" # windows-1252:9F - "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1250:8F - "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1252:8E - "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1250:AF - "\N{LATIN CAPITAL LIGATURE OE}" # windows-1252:8C - "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1250:B9 - "\N{LATIN SMALL LETTER AE}" # windows-1257:BF - "\N{LATIN SMALL LETTER F WITH HOOK}" # windows-1252:83 - "\N{LATIN SMALL LETTER L WITH CARON}" # windows-1250:BE - "\N{LATIN SMALL LETTER L WITH STROKE}" # windows-1250:B3 - "\N{LATIN SMALL LETTER O WITH STROKE}" # windows-1257:B8 - "\N{LATIN SMALL LETTER R WITH CEDILLA}" # windows-1257:BA - "\N{LATIN SMALL LETTER S WITH ACUTE}" # windows-1250:9C - "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1252:9A - "\N{LATIN SMALL LETTER S WITH CEDILLA}" # windows-1250:BA - "\N{LATIN SMALL LETTER T WITH CARON}" # windows-1250:9D - "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1250:9F - "\N{LATIN SMALL LETTER Z WITH CARON}" # windows-1252:9E - "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" # windows-1250:BF - "\N{LATIN SMALL LIGATURE OE}" # windows-1252:9C - "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # windows-1252:88 - "\N{CARON}" # windows-1250:A1 - "\N{BREVE}" # windows-1250:A2 - "\N{OGONEK}" # windows-1250:B2 - "\N{SMALL TILDE}" # windows-1252:98 - "\N{DOUBLE ACUTE ACCENT}" # windows-1250:BD - "\N{GREEK TONOS}" # windows-1253:B4 - "\N{GREEK DIALYTIKA TONOS}" # windows-1253:A1 - "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" # windows-1253:A2 - "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" # windows-1253:B8 - "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" # windows-1253:B9 - "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" # windows-1253:BA - "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" # windows-1253:BC - "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" # windows-1253:BE - "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" # windows-1253:BF - "\N{CYRILLIC CAPITAL LETTER IO}" # windows-1251:A8 - "\N{CYRILLIC CAPITAL LETTER DJE}" # windows-1251:80 - "\N{CYRILLIC CAPITAL LETTER GJE}" # windows-1251:81 - "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}" # windows-1251:AA - "\N{CYRILLIC CAPITAL LETTER DZE}" # windows-1251:BD - "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B2 - "\N{CYRILLIC CAPITAL LETTER YI}" # windows-1251:AF - "\N{CYRILLIC CAPITAL LETTER JE}" # windows-1251:A3 - "\N{CYRILLIC CAPITAL LETTER LJE}" # windows-1251:8A - "\N{CYRILLIC CAPITAL LETTER NJE}" # windows-1251:8C - "\N{CYRILLIC CAPITAL LETTER TSHE}" # windows-1251:8E - "\N{CYRILLIC CAPITAL LETTER KJE}" # windows-1251:8D - "\N{CYRILLIC CAPITAL LETTER SHORT U}" # windows-1251:A1 - "\N{CYRILLIC CAPITAL LETTER DZHE}" # windows-1251:8F - "\N{CYRILLIC SMALL LETTER IO}" # windows-1251:B8 - "\N{CYRILLIC SMALL LETTER DJE}" # windows-1251:90 - "\N{CYRILLIC SMALL LETTER GJE}" # windows-1251:83 - "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}" # windows-1251:BA - "\N{CYRILLIC SMALL LETTER DZE}" # windows-1251:BE - "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B3 - "\N{CYRILLIC SMALL LETTER YI}" # windows-1251:BF - "\N{CYRILLIC SMALL LETTER JE}" # windows-1251:BC - "\N{CYRILLIC SMALL LETTER LJE}" # windows-1251:9A - "\N{CYRILLIC SMALL LETTER NJE}" # windows-1251:9C - "\N{CYRILLIC SMALL LETTER TSHE}" # windows-1251:9E - "\N{CYRILLIC SMALL LETTER KJE}" # windows-1251:9D - "\N{CYRILLIC SMALL LETTER SHORT U}" # windows-1251:A2 - "\N{CYRILLIC SMALL LETTER DZHE}" # windows-1251:9F - "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" # windows-1251:A5 - "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" # windows-1251:B4 - "\N{DAGGER}" # windows-1252:86 - "\N{DOUBLE DAGGER}" # windows-1252:87 - "\N{PER MILLE SIGN}" # windows-1252:89 - "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" # windows-1252:8B - "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" # windows-1252:9B - "\N{EURO SIGN}" # windows-1252:80 - "\N{NUMERO SIGN}" # windows-1251:B9 - "\N{TRADE MARK SIGN}" # windows-1252:99 + "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" + "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" + "†‡•‰‹›€№™" ), } @@ -686,6 +308,8 @@ def _build_width_map() -> dict[int, str]: | [{utf8_first_of_4}] [{utf8_continuation}]{{3}} )+ - """.format(**UTF8_CLUES), +""".format( + **UTF8_CLUES + ), re.VERBOSE, ) diff --git a/ftfy/cli.py b/ftfy/cli.py index 16f32967..4148d1fc 100644 --- a/ftfy/cli.py +++ b/ftfy/cli.py @@ -1,13 +1,10 @@ """ A command-line utility for fixing text found in a file. """ - import os import sys -from pathlib import Path -from typing import Union -from ftfy import TextFixerConfig, __version__, fix_file +from ftfy import __version__, fix_file, TextFixerConfig ENCODE_ERROR_TEXT_UNIX = """ftfy error: Unfortunately, this output stream does not support Unicode. @@ -42,53 +39,57 @@ """ -def main() -> None: +def main(): """ Run ftfy as a command-line utility. """ import argparse parser = argparse.ArgumentParser( - description=f"ftfy (fixes text for you), version {__version__}" + description="ftfy (fixes text for you), version %s" % __version__ ) parser.add_argument( - "filename", - default="-", - nargs="?", - help="The file whose Unicode is to be fixed. Defaults to -, meaning standard input.", + 'filename', + default='-', + nargs='?', + help='The file whose Unicode is to be fixed. Defaults ' + 'to -, meaning standard input.', ) parser.add_argument( - "-o", - "--output", + '-o', + '--output', type=str, - default="-", - help="The file to output to. Defaults to -, meaning standard output.", + default='-', + help='The file to output to. Defaults to -, meaning ' 'standard output.', ) parser.add_argument( - "-g", - "--guess", - action="store_true", - help="Ask ftfy to guess the encoding of your input. This is risky. Overrides -e.", + '-g', + '--guess', + action='store_true', + help="Ask ftfy to guess the encoding of your input. " + "This is risky. Overrides -e.", ) parser.add_argument( - "-e", - "--encoding", + '-e', + '--encoding', type=str, - default="utf-8", - help="The encoding of the input. Defaults to UTF-8.", + default='utf-8', + help='The encoding of the input. Defaults to UTF-8.', ) parser.add_argument( - "-n", - "--normalization", + '-n', + '--normalization', type=str, - default="NFC", - help='The normalization of Unicode to apply. Defaults to NFC. Can be "none".', + default='NFC', + help='The normalization of Unicode to apply. ' + 'Defaults to NFC. Can be "none".', ) parser.add_argument( - "--preserve-entities", - action="store_true", + '--preserve-entities', + action='store_true', help="Leave HTML entities as they are. The default " - "is to decode them, as long as no HTML tags have appeared in the file.", + "is to decode them, as long as no HTML tags " + "have appeared in the file.", ) args = parser.parse_args() @@ -97,36 +98,45 @@ def main() -> None: if args.guess: encoding = None - if args.filename == "-": + if args.filename == '-': # Get a standard input stream made of bytes, so we can decode it as # whatever encoding is necessary. file = sys.stdin.buffer else: - file = Path(args.filename).open("rb") + file = open(args.filename, 'rb') - if args.output == "-": + if args.output == '-': outfile = sys.stdout else: if os.path.realpath(args.output) == os.path.realpath(args.filename): sys.stderr.write(SAME_FILE_ERROR_TEXT) sys.exit(1) - outfile = Path(args.output).open("w", encoding="utf-8") + outfile = open(args.output, 'w', encoding='utf-8') normalization = args.normalization - if normalization.lower() == "none": + if normalization.lower() == 'none': normalization = None - unescape_html: Union[str, bool] - unescape_html = False if args.preserve_entities else "auto" + if args.preserve_entities: + unescape_html = False + else: + unescape_html = 'auto' - config = TextFixerConfig(unescape_html=unescape_html, normalization=normalization) + config = TextFixerConfig( + unescape_html=unescape_html, + normalization=normalization + ) try: - for line in fix_file(file, encoding=encoding, config=config): + for line in fix_file( + file, + encoding=encoding, + config=config + ): try: outfile.write(line) except UnicodeEncodeError: - if sys.platform == "win32": + if sys.platform == 'win32': sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS) else: sys.stderr.write(ENCODE_ERROR_TEXT_UNIX) @@ -136,5 +146,5 @@ def main() -> None: sys.exit(1) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/ftfy/fixes.py b/ftfy/fixes.py index 41d3c2f8..d93cbebb 100644 --- a/ftfy/fixes.py +++ b/ftfy/fixes.py @@ -14,11 +14,8 @@ import html import re import warnings -from re import Match -from typing import Any import ftfy -from ftfy.badness import is_bad from ftfy.chardata import ( ALTERED_UTF8_RE, C1_CONTROL_RE, @@ -33,44 +30,41 @@ WIDTH_MAP, ) +from ftfy.badness import is_bad + -def fix_encoding_and_explain(text: str) -> Any: +def fix_encoding_and_explain(text): """ Deprecated copy of `ftfy.fix_encoding_and_explain()`. """ warnings.warn( "`fix_encoding_and_explain()` has moved to the main module of ftfy.", DeprecationWarning, - stacklevel=2, ) return ftfy.fix_encoding_and_explain(text) -def fix_encoding(text: str) -> str: +def fix_encoding(text): """ Deprecated copy of `ftfy.fix_encoding()`. """ warnings.warn( - "`fix_encoding()` has moved to the main module of ftfy.", - DeprecationWarning, - stacklevel=2, + "`fix_encoding()` has moved to the main module of ftfy.", DeprecationWarning ) return ftfy.fix_encoding(text) -def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: +def apply_plan(text, plan): """ Deprecated copy of `ftfy.apply_plan()`. """ warnings.warn( - "`apply_plan()` has moved to the main module of ftfy.", - DeprecationWarning, - stacklevel=2, + "`apply_plan()` has moved to the main module of ftfy.", DeprecationWarning ) return ftfy.apply_plan(text, plan) -def _unescape_fixup(match: Match[str]) -> str: +def _unescape_fixup(match): """ Replace one matched HTML entity with the character it represents, if possible. @@ -79,7 +73,7 @@ def _unescape_fixup(match: Match[str]) -> str: if text in HTML_ENTITIES: return HTML_ENTITIES[text] elif text.startswith("&#"): - unescaped: str = html.unescape(text) + unescaped = html.unescape(text) # If html.unescape only decoded part of the string, that's not what # we want. The semicolon should be consumed. @@ -91,7 +85,7 @@ def _unescape_fixup(match: Match[str]) -> str: return text -def unescape_html(text: str) -> str: +def unescape_html(text): """ Decode HTML entities and character references, including some nonstandard ones written in all-caps. @@ -142,7 +136,7 @@ def unescape_html(text: str) -> str: ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])") -def remove_terminal_escapes(text: str) -> str: +def remove_terminal_escapes(text): r""" Strip out "ANSI" terminal escape sequences, such as those that produce colored text on Unix. @@ -155,7 +149,7 @@ def remove_terminal_escapes(text: str) -> str: return ANSI_RE.sub("", text) -def uncurl_quotes(text: str) -> str: +def uncurl_quotes(text): r""" Replace curly quotation marks with straight equivalents. @@ -165,7 +159,7 @@ def uncurl_quotes(text: str) -> str: return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text)) -def fix_latin_ligatures(text: str) -> str: +def fix_latin_ligatures(text): """ Replace single-character ligatures of Latin letters, such as 'fi', with the characters that they contain, as in 'fi'. Latin ligatures are usually not @@ -183,7 +177,7 @@ def fix_latin_ligatures(text: str) -> str: return text.translate(LIGATURES) -def fix_character_width(text: str) -> str: +def fix_character_width(text): """ The ASCII characters, katakana, and Hangul characters have alternate "halfwidth" or "fullwidth" forms that help text line up in a grid. @@ -203,7 +197,7 @@ def fix_character_width(text: str) -> str: return text.translate(WIDTH_MAP) -def fix_line_breaks(text: str) -> str: +def fix_line_breaks(text): r""" Convert all line breaks to Unix style. @@ -259,7 +253,7 @@ def fix_line_breaks(text: str) -> str: SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]") -def convert_surrogate_pair(match: Match[str]) -> str: +def convert_surrogate_pair(match): """ Convert a surrogate pair to the single codepoint it represents. @@ -271,7 +265,7 @@ def convert_surrogate_pair(match: Match[str]) -> str: return chr(codept) -def fix_surrogates(text: str) -> str: +def fix_surrogates(text): """ Replace 16-bit surrogate codepoints with the characters they represent (when properly paired), or with \ufffd otherwise. @@ -294,7 +288,7 @@ def fix_surrogates(text: str) -> str: return text -def remove_control_chars(text: str) -> str: +def remove_control_chars(text): """ Remove various control characters that you probably didn't intend to be in your text. Many of these characters appear in the table of "Characters not @@ -327,7 +321,7 @@ def remove_control_chars(text: str) -> str: return text.translate(CONTROL_CHARS) -def remove_bom(text: str) -> str: +def remove_bom(text): r""" Remove a byte-order mark that was accidentally decoded as if it were part of the text. @@ -352,7 +346,7 @@ def remove_bom(text: str) -> str: ) -def decode_escapes(text: str) -> str: +def decode_escapes(text): r""" Decode backslashed escape sequences, including \\x, \\u, and \\U character references, even in the presence of other Unicode. @@ -383,7 +377,7 @@ def decode_escapes(text: str) -> str: "unicode-escape" to work correctly. """ - def decode_match(match: Match[str]) -> str: + def decode_match(match): "Given a regex match, decode the escape sequence it contains." return codecs.decode(match.group(0), "unicode-escape") @@ -416,7 +410,7 @@ def decode_match(match: Match[str]) -> str: A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )") -def restore_byte_a0(byts: bytes) -> bytes: +def restore_byte_a0(byts): """ Some mojibake has been additionally altered by a process that said "hmm, byte A0, that's basically a space!" and replaced it with an ASCII space. @@ -432,14 +426,14 @@ def restore_byte_a0(byts: bytes) -> bytes: """ byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts) - def replacement(match: Match[bytes]) -> bytes: + def replacement(match): "The function to apply when this regex matches." return match.group(0).replace(b"\x20", b"\xa0") return ALTERED_UTF8_RE.sub(replacement, byts) -def replace_lossy_sequences(byts: bytes) -> bytes: +def replace_lossy_sequences(byts): """ This function identifies sequences where information has been lost in a "sloppy" codec, indicated by byte 1A, and if they would otherwise look @@ -475,10 +469,10 @@ def replace_lossy_sequences(byts: bytes) -> bytes: This is used as a transcoder within `fix_encoding`. """ - return LOSSY_UTF8_RE.sub("\ufffd".encode(), byts) + return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts) -def decode_inconsistent_utf8(text: str) -> str: +def decode_inconsistent_utf8(text): """ Sometimes, text from one encoding ends up embedded within text from a different one. This is common enough that we need to be able to fix it. @@ -486,7 +480,7 @@ def decode_inconsistent_utf8(text: str) -> str: This is used as a transcoder within `fix_encoding`. """ - def fix_embedded_mojibake(match: Match[str]) -> str: + def fix_embedded_mojibake(match): substr = match.group(0) # Require the match to be shorter, so that this doesn't recurse infinitely @@ -498,11 +492,11 @@ def fix_embedded_mojibake(match: Match[str]) -> str: return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text) -def _c1_fixer(match: Match[str]) -> str: +def _c1_fixer(match): return match.group(0).encode("latin-1").decode("sloppy-windows-1252") -def fix_c1_controls(text: str) -> str: +def fix_c1_controls(text): """ If text still contains C1 control characters, treat them as their Windows-1252 equivalents. This matches what Web browsers do. diff --git a/ftfy/formatting.py b/ftfy/formatting.py index 42955588..36a7847d 100644 --- a/ftfy/formatting.py +++ b/ftfy/formatting.py @@ -5,13 +5,10 @@ We used to have our own implementation here, but now we mostly rely on the 'wcwidth' library. """ - from unicodedata import normalize from wcwidth import wcswidth, wcwidth -from ftfy.fixes import remove_terminal_escapes - def character_width(char: str) -> int: r""" @@ -31,7 +28,7 @@ def character_width(char: str) -> int: >>> character_width('\n') -1 """ - return int(wcwidth(char)) + return wcwidth(char) def monospaced_width(text: str) -> int: @@ -50,7 +47,7 @@ def monospaced_width(text: str) -> int: >>> len('ちゃぶ台返し') 6 >>> monospaced_width('owl\N{SOFT HYPHEN}flavored') - 11 + 12 >>> monospaced_width('example\x80') -1 @@ -62,21 +59,13 @@ def monospaced_width(text: str) -> int: 6 >>> monospaced_width('\u110b\u1175\u11b8\u1102\u1175\u1103\u1161') 6 - - The word "blue" with terminal escapes to make it blue still takes up only - 4 characters, when shown as intended. - >>> monospaced_width('\x1b[34mblue\x1b[m') - 4 """ # NFC-normalize the text first, so that we don't need special cases for # Hangul jamo. - # - # Remove terminal escapes before calculating width, because if they are - # displayed as intended, they will have zero width. - return int(wcswidth(remove_terminal_escapes(normalize("NFC", text)))) + return wcswidth(normalize('NFC', text)) -def display_ljust(text: str, width: int, fillchar: str = " ") -> str: +def display_ljust(text, width, fillchar=' '): """ Return `text` left-justified in a Unicode string whose display width, in a monospaced terminal, should be at least `width` character cells. @@ -99,8 +88,7 @@ def display_ljust(text: str, width: int, fillchar: str = " ") -> str: correct if you're viewing this code or documentation in a Web browser. """ if character_width(fillchar) != 1: - msg = "The padding character must have display width 1" - raise ValueError(msg) + raise ValueError("The padding character must have display width 1") text_width = monospaced_width(text) if text_width == -1: @@ -111,7 +99,7 @@ def display_ljust(text: str, width: int, fillchar: str = " ") -> str: return text + fillchar * padding -def display_rjust(text: str, width: int, fillchar: str = " ") -> str: +def display_rjust(text, width, fillchar=' '): """ Return `text` right-justified in a Unicode string whose display width, in a monospaced terminal, should be at least `width` character cells. @@ -130,8 +118,7 @@ def display_rjust(text: str, width: int, fillchar: str = " ") -> str: ▒▒▒▒▒▒▒▒ちゃぶ台返し """ if character_width(fillchar) != 1: - msg = "The padding character must have display width 1" - raise ValueError(msg) + raise ValueError("The padding character must have display width 1") text_width = monospaced_width(text) if text_width == -1: @@ -141,7 +128,7 @@ def display_rjust(text: str, width: int, fillchar: str = " ") -> str: return fillchar * padding + text -def display_center(text: str, width: int, fillchar: str = " ") -> str: +def display_center(text, width, fillchar=' '): """ Return `text` centered in a Unicode string whose display width, in a monospaced terminal, should be at least `width` character cells. The rest @@ -156,8 +143,7 @@ def display_center(text: str, width: int, fillchar: str = " ") -> str: ▒▒▒▒ちゃぶ台返し▒▒▒▒ """ if character_width(fillchar) != 1: - msg = "The padding character must have display width 1" - raise ValueError(msg) + raise ValueError("The padding character must have display width 1") text_width = monospaced_width(text) if text_width == -1: diff --git a/ftfy/py.typed b/ftfy/py.typed deleted file mode 100644 index e69de29b..00000000 diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 278ee780..00000000 --- a/mypy.ini +++ /dev/null @@ -1,21 +0,0 @@ -[mypy] -files = ftfy -check_untyped_defs = True -disallow_any_generics = True -disallow_incomplete_defs = False -disallow_subclassing_any = True -disallow_untyped_calls = False -disallow_untyped_decorators = False -disallow_untyped_defs = False -no_implicit_optional = True -no_implicit_reexport = False -strict_equality = True -warn_redundant_casts = True -warn_return_any = True -warn_unused_configs = True -warn_unused_ignores = True -python_version = 3.9 - -[mypy-wcwidth] -ignore_missing_imports = True - diff --git a/notebook/excel-export.png b/notebook/excel-export.png deleted file mode 100755 index 16063543..00000000 Binary files a/notebook/excel-export.png and /dev/null differ diff --git a/notebook/ftfy talk.ipynb b/notebook/ftfy talk.ipynb deleted file mode 100644 index 5071a5c9..00000000 --- a/notebook/ftfy talk.ipynb +++ /dev/null @@ -1,1314 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "handled-recruitment", - "metadata": { - "slideshow": { - "slide_type": "skip" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'6.0.3'" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import ftfy\n", - "ftfy.__version__" - ] - }, - { - "cell_type": "markdown", - "id": "eleven-caution", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Mojibake!\n", - "\n", - "## What the h—ck happened to this text?\n", - "\n", - "Robyn Speer" - ] - }, - { - "cell_type": "markdown", - "id": "confused-advertiser", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Mojibake is when text ends up with the wrong Unicode characters due to an encoding mistake.\n", - "\n", - "- It's Japanese for \"ghost characters\"\n", - "- ftfy is my Python library that fixes them. (get it with `pip install ftfy`)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "disabled-ridge", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'merci de télécharger le plug-in'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ftfy.fix_text(\"merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in\")" - ] - }, - { - "cell_type": "markdown", - "id": "focused-facial", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## The mascot of ftfy" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "introductory-supervisor", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "\"(ง'̀⌣'́)ง\"" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ftfy.fix_text(\"(Ã\\xa0¸‡'̀⌣'ÃŒÂ\\x81)Ã\\xa0¸‡\")" - ] - }, - { - "cell_type": "markdown", - "id": "following-apple", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "source": [ - "This little guy, and in fact every example in this talk, comes from mojibake I found in the wild -- usually on Twitter or in the OSCAR Web Corpus.\n", - "\n", - "> **Side note**: there are a lot of tangents I would like to go off on, but this is a 10-minute talk and there's no time for tangents. So I'll be sprinkling in these side notes that I'll be skipping over as I present the talk.\n", - ">\n", - "> If you're interested in them, I suggest pausing the video, or reading the Jupyter Notebook version of this talk that I'll be linking later." - ] - }, - { - "cell_type": "markdown", - "id": "concrete-consent", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "It used to be that 1 byte = 1 character, so there were at most 256 possible characters of text that could be shown on a computer. Here's some quick code to see them:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "horizontal-playlist", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "import blessings\n", - "term = blessings.Terminal() # enable colorful text\n", - "\n", - "def displayable_codepoint(codepoint, encoding):\n", - " char = bytes([codepoint]).decode(encoding, 'replace')\n", - " if char == '�':\n", - " return '▓▓'\n", - " elif not char.isprintable():\n", - " return '░░'\n", - " else:\n", - " return char\n", - "\n", - "def show_char_table(encoding):\n", - " print(f\"encoding: {encoding}\\n 0 1 2 3 4 5 6 7 8 9 a b c d e f\\n\")\n", - " for row in range(16):\n", - " print(f\"{row*16:>02x}\", end=\" \")\n", - " if row == 0:\n", - " print(ftfy.formatting.display_center(term.green(\" control characters \"), 32, \"░\"))\n", - " elif row == 8 and encoding == 'latin-1':\n", - " print(ftfy.formatting.display_center(term.green(\" here be dragons \"), 32, \"░\"))\n", - " else:\n", - " for col in range(16):\n", - " char = displayable_codepoint(row * 16 + col, encoding)\n", - " print(f\"{char:<2}\", end=\"\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "id": "worthy-broad", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# ASCII\n", - "\n", - "- In the '60s, we agreed that 128 of these bytes should have well-defined meanings as characters of text. That's ASCII\n", - "- It worked pretty well for monolingual Americans" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "interim-attribute", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "encoding: ascii\n", - " 0 1 2 3 4 5 6 7 8 9 a b c d e f\n", - "\n", - "00 ░░░░░░\u001b[32m control characters \u001b[m░░░░░░\n", - "10 ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░\n", - "20 ! \" # $ % & ' ( ) * + , - . / \n", - "30 0 1 2 3 4 5 6 7 8 9 : ; < = > ? \n", - "40 @ A B C D E F G H I J K L M N O \n", - "50 P Q R S T U V W X Y Z [ \\ ] ^ _ \n", - "60 ` a b c d e f g h i j k l m n o \n", - "70 p q r s t u v w x y z { | } ~ ░░\n", - "80 ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n", - "90 ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n", - "a0 ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n", - "b0 ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n", - "c0 ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n", - "d0 ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n", - "e0 ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n", - "f0 ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n" - ] - } - ], - "source": [ - "show_char_table(\"ascii\")" - ] - }, - { - "cell_type": "markdown", - "id": "starting-velvet", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "- Basically everyone decided the other 128 bytes should be characters too\n", - "- So people started talking about \"extended ASCII\" which means \"whatever my computer does with the other bytes\"\n", - "- A different computer, in a different country or with a different OS, would do something different\n", - "- This is how mojibake started\n", - "\n", - "Let's take a look at the different ways that ASCII got \"extended\", which are now called _codepages_." - ] - }, - { - "cell_type": "markdown", - "id": "flying-oriental", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "Here's Latin-1. This encoding was used on a lot of UNIX-like systems before they switched to UTF-8, but probably the biggest reason you see it is that it's the first 256 characters of Unicode.\n", - "\n", - "If you don't know about decoding text and you just replace each byte with the Unicode character with the same number, Latin-1 is the encoding you get by accident." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "european-ethics", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "encoding: latin-1\n", - " 0 1 2 3 4 5 6 7 8 9 a b c d e f\n", - "\n", - "00 ░░░░░░\u001b[32m control characters \u001b[m░░░░░░\n", - "10 ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░\n", - "20 ! \" # $ % & ' ( ) * + , - . / \n", - "30 0 1 2 3 4 5 6 7 8 9 : ; < = > ? \n", - "40 @ A B C D E F G H I J K L M N O \n", - "50 P Q R S T U V W X Y Z [ \\ ] ^ _ \n", - "60 ` a b c d e f g h i j k l m n o \n", - "70 p q r s t u v w x y z { | } ~ ░░\n", - "80 ░░░░░░░\u001b[32m here be dragons \u001b[m░░░░░░░░\n", - "90 ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░\n", - "a0 ░░¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ ░░® ¯ \n", - "b0 ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿ \n", - "c0 À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï \n", - "d0 Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß \n", - "e0 à á â ã ä å æ ç è é ê ë ì í î ï \n", - "f0 ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ \n" - ] - } - ], - "source": [ - "\n", - "show_char_table('latin-1')" - ] - }, - { - "cell_type": "markdown", - "id": "editorial-clinton", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "source": [ - "## Dragons? 🐉\n", - "\n", - "I labeled rows 80 and 90 as \"here be dragons\" because they're full of control characters, but nobody ever agreed on what they do.\n", - "\n", - "Text that includes these codepoints can be shown as nearly anything, including little boxes, characters from the Windows-1252 character set, or just messing up your whole terminal window.\n", - "\n", - "> Side note: Control character 85 was an attempt to resolve the war between Unix line breaks (`0a`), Windows line breaks (`0d 0a`), and Mac Classic line breaks (`0d`), by introducing an _entirely new_ line break that wasn't even in ASCII.\n", - ">\n", - "> This is hilarious in retrospect. Clearly nobody would ever repeat that terrible idea, except the Unicode Consortium, who did it twice more.\n", - ">\n", - "> Anyway, ftfy has a function that turns all of these into byte `0a`." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "baking-notebook", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "encoding: windows-1252\n", - " 0 1 2 3 4 5 6 7 8 9 a b c d e f\n", - "\n", - "00 ░░░░░░\u001b[32m control characters \u001b[m░░░░░░\n", - "10 ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░\n", - "20 ! \" # $ % & ' ( ) * + , - . / \n", - "30 0 1 2 3 4 5 6 7 8 9 : ; < = > ? \n", - "40 @ A B C D E F G H I J K L M N O \n", - "50 P Q R S T U V W X Y Z [ \\ ] ^ _ \n", - "60 ` a b c d e f g h i j k l m n o \n", - "70 p q r s t u v w x y z { | } ~ ░░\n", - "80 € ▓▓‚ ƒ „ … † ‡ ˆ ‰ Š ‹ Œ ▓▓Ž ▓▓\n", - "90 ▓▓‘ ’ “ ” • – — ˜ ™ š › œ ▓▓ž Ÿ \n", - "a0 ░░¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ ░░® ¯ \n", - "b0 ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿ \n", - "c0 À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï \n", - "d0 Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß \n", - "e0 à á â ã ä å æ ç è é ê ë ì í î ï \n", - "f0 ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ \n" - ] - } - ], - "source": [ - "show_char_table('windows-1252')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "affiliated-gentleman", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "encoding: cp437\n", - " 0 1 2 3 4 5 6 7 8 9 a b c d e f\n", - "\n", - "00 ░░░░░░\u001b[32m control characters \u001b[m░░░░░░\n", - "10 ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░\n", - "20 ! \" # $ % & ' ( ) * + , - . / \n", - "30 0 1 2 3 4 5 6 7 8 9 : ; < = > ? \n", - "40 @ A B C D E F G H I J K L M N O \n", - "50 P Q R S T U V W X Y Z [ \\ ] ^ _ \n", - "60 ` a b c d e f g h i j k l m n o \n", - "70 p q r s t u v w x y z { | } ~ ░░\n", - "80 Ç ü é â ä à å ç ê ë è ï î ì Ä Å \n", - "90 É æ Æ ô ö ò û ù ÿ Ö Ü ¢ £ ¥ ₧ ƒ \n", - "a0 á í ó ú ñ Ñ ª º ¿ ⌐ ¬ ½ ¼ ¡ « » \n", - "b0 ░ ▒ ▓ │ ┤ ╡ ╢ ╖ ╕ ╣ ║ ╗ ╝ ╜ ╛ ┐ \n", - "c0 └ ┴ ┬ ├ ─ ┼ ╞ ╟ ╚ ╔ ╩ ╦ ╠ ═ ╬ ╧ \n", - "d0 ╨ ╤ ╥ ╙ ╘ ╒ ╓ ╫ ╪ ┘ ┌ █ ▄ ▌ ▐ ▀ \n", - "e0 α ß Γ π Σ σ µ τ Φ Θ Ω δ ∞ φ ε ∩ \n", - "f0 ≡ ± ≥ ≤ ⌠ ⌡ ÷ ≈ ° ∙ · √ ⁿ ² ■ ░░\n" - ] - } - ], - "source": [ - "show_char_table('cp437')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "northern-string", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "encoding: macroman\n", - " 0 1 2 3 4 5 6 7 8 9 a b c d e f\n", - "\n", - "00 ░░░░░░\u001b[32m control characters \u001b[m░░░░░░\n", - "10 ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░\n", - "20 ! \" # $ % & ' ( ) * + , - . / \n", - "30 0 1 2 3 4 5 6 7 8 9 : ; < = > ? \n", - "40 @ A B C D E F G H I J K L M N O \n", - "50 P Q R S T U V W X Y Z [ \\ ] ^ _ \n", - "60 ` a b c d e f g h i j k l m n o \n", - "70 p q r s t u v w x y z { | } ~ ░░\n", - "80 Ä Å Ç É Ñ Ö Ü á à â ä ã å ç é è \n", - "90 ê ë í ì î ï ñ ó ò ô ö õ ú ù û ü \n", - "a0 † ° ¢ £ § • ¶ ß ® © ™ ´ ¨ ≠ Æ Ø \n", - "b0 ∞ ± ≤ ≥ ¥ µ ∂ ∑ ∏ π ∫ ª º Ω æ ø \n", - "c0 ¿ ¡ ¬ √ ƒ ≈ ∆ « » … ░░À Ã Õ Œ œ \n", - "d0 – — “ ” ‘ ’ ÷ ◊ ÿ Ÿ ⁄ € ‹ › fi fl \n", - "e0 ‡ · ‚ „ ‰ Â Ê Á Ë È Í Î Ï Ì Ó Ô \n", - "f0 ░░Ò Ú Û Ù ı ˆ ˜ ¯ ˘ ˙ ˚ ¸ ˝ ˛ ˇ \n" - ] - } - ], - "source": [ - "show_char_table('macroman')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "intelligent-equipment", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "encoding: windows-1251\n", - " 0 1 2 3 4 5 6 7 8 9 a b c d e f\n", - "\n", - "00 ░░░░░░\u001b[32m control characters \u001b[m░░░░░░\n", - "10 ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░\n", - "20 ! \" # $ % & ' ( ) * + , - . / \n", - "30 0 1 2 3 4 5 6 7 8 9 : ; < = > ? \n", - "40 @ A B C D E F G H I J K L M N O \n", - "50 P Q R S T U V W X Y Z [ \\ ] ^ _ \n", - "60 ` a b c d e f g h i j k l m n o \n", - "70 p q r s t u v w x y z { | } ~ ░░\n", - "80 Ђ Ѓ ‚ ѓ „ … † ‡ € ‰ Љ ‹ Њ Ќ Ћ Џ \n", - "90 ђ ‘ ’ “ ” • – — ▓▓™ љ › њ ќ ћ џ \n", - "a0 ░░Ў ў Ј ¤ Ґ ¦ § Ё © Є « ¬ ░░® Ї \n", - "b0 ° ± І і ґ µ ¶ · ё № є » ј Ѕ ѕ ї \n", - "c0 А Б В Г Д Е Ж З И Й К Л М Н О П \n", - "d0 Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я \n", - "e0 а б в г д е ж з и й к л м н о п \n", - "f0 р с т у ф х ц ч ш щ ъ ы ь э ю я \n" - ] - } - ], - "source": [ - "show_char_table(\"windows-1251\")" - ] - }, - { - "cell_type": "markdown", - "id": "unique-airplane", - "metadata": { - "slideshow": { - "slide_type": "skip" - } - }, - "source": [ - "## So text wasn't portable between different computers" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "compound-columbia", - "metadata": { - "slideshow": { - "slide_type": "skip" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Plus Áa change, plus cíest la mÍme chose'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "phrase = \"Plus ça change, plus c’est la même chose\"\n", - "\n", - "phrase.encode('windows-1252').decode('macroman')" - ] - }, - { - "cell_type": "markdown", - "id": "identical-welsh", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Vintage '90s mojibake\n", - "\n", - "Maybe you've seen a \".NFO\" file like this, telling you about the nice people who provided you with a DOS game for free:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "scenic-norman", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "crack_nfo = r\"\"\"\n", - " ───── ────────────── ───────────────── ────────────── ─────────────── ────\n", - " ▄█████▄ ▀█████▄ ████▄ ▀█████████ ████▄ ▄█████▄▀████▄ ▀██████▄ ▀████▄ ▄██▄\n", - " ████████ █████▀ ██████ ▀████████ ██████ ▀███████▌██████ ███████ ████▄█████\n", - " ███ ▀███▌█▀ ▄▄▄█▌▀█████ ▌████ ▄▄▄█▀▐████ ███▀▌▀█▌█▌ ███▌ ██▀▐████ ██████████\n", - " ███ ▐██▌▌ ████ ▌████ ████ ████ ████ ██▌ ▌▐▌█▌ ████ ██ ████ ██▌▐█▌▐███\n", - " ███ ▄███▌▄▄ ████ ████ ████ ████ ▄████ ██▌ █▄▄█▌ ████ ██ ▄████ ██ ▐█▌ ███\n", - " ████████ ██ ████ ████ ████ ████ █████ ██▌ ▀▀██▌▐███▀ ██▐█████ ██ █ ███\n", - " ██████▀ ▄▀▀ ████ ████ ████ ████ ▀████ ██▌███▄▐██▄▀▀ ▄███ ▀████ ██ ▄ ███\n", - " ███▀ ▄▄██ ████ ▐████ ████ ████ ████ ██▌▐▐██▌█▀██▄ ████ ████ ██ ███\n", - " ███ █████▄ ▀▀█ ████▀ ▐████ ░███ ▐████ ███▄▐██▌█▌▐██▌▐███ ▐███░ ██▌ r███\n", - " ░██ █████████▄ ▐██▌▄██████ ▒░██ █████ ███████▌█▌▐███ ███ ███░▒ ███ o██░\n", - " ▒░█ ▀███████▀ ▀ ███▐████▀ ▓▒░█ ▐███▌ ▀██████▐█▌▐███ ███ ▐█░▒▓ ██▌ y█░▒\n", - " - ▌─────▐▀─ ▄▄▄█ ── ▀▀ ───── ────── ▀▀▀ ─ ▐▀▀▀▀ ▀▀ ████ ──── ▀█▀ ─ ▀ ────▐ ─\n", - "\n", - " ╓────────────────────────[ RELEASE INFORMATION ]───────────────────────╖\n", - "╓────────────────────────────────────────────────────────────────────────────╖\n", - "║ -/\\- THE EVEN MORE INCREDIBLE MACHINE FOR *DOS* FROM SIERRA/DYNAMIX -/\\- ║\n", - "╙────────────────────────────────────────────────────────────────────────────╜\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "id": "graphic-treasurer", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "source": [ - "> **Side note**: It's funny that most of the things we call \"ASCII art\" aren't ASCII." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "valuable-ethernet", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " ÄÄÄÄÄ ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ ÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ ÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ ÄÄÄÄ\n", - " ÜÛÛÛÛÛÜ ßÛÛÛÛÛÜ ÛÛÛÛÜ ßÛÛÛÛÛÛÛÛÛ ÛÛÛÛÜ ÜÛÛÛÛÛÜßÛÛÛÛÜ ßÛÛÛÛÛÛÜ ßÛÛÛÛÜ ÜÛÛÜ\n", - " ÛÛÛÛÛÛÛÛ ÛÛÛÛÛß ÛÛÛÛÛÛ ßÛÛÛÛÛÛÛÛ ÛÛÛÛÛÛ ßÛÛÛÛÛÛÛÝÛÛÛÛÛÛ ÛÛÛÛÛÛÛ ÛÛÛÛÜÛÛÛÛÛ\n", - " ÛÛÛ ßÛÛÛÝÛß ÜÜÜÛÝßÛÛÛÛÛ ÝÛÛÛÛ ÜÜÜÛßÞÛÛÛÛ ÛÛÛßÝßÛÝÛÝ ÛÛÛÝ ÛÛßÞÛÛÛÛ ÛÛÛÛÛÛÛÛÛÛ\n", - " ÛÛÛ ÞÛÛÝÝ ÛÛÛÛ ÝÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÝ ÝÞÝÛÝ ÛÛÛÛ ÛÛ ÛÛÛÛ ÛÛÝÞÛÝÞÛÛÛ\n", - " ÛÛÛ ÜÛÛÛÝÜÜ ÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÜÛÛÛÛ ÛÛÝ ÛÜÜÛÝ ÛÛÛÛ ÛÛ ÜÛÛÛÛ ÛÛ ÞÛÝ ÛÛÛ\n", - " ÛÛÛÛÛÛÛÛ ÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÛÛÛ ÛÛÝ ßßÛÛÝÞÛÛÛß ÛÛÞÛÛÛÛÛ ÛÛ Û ÛÛÛ\n", - " ÛÛÛÛÛÛß Üßß ÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ßÛÛÛÛ ÛÛÝÛÛÛÜÞÛÛÜßß ÜÛÛÛ ßÛÛÛÛ ÛÛ Ü ÛÛÛ\n", - " ÛÛÛß ÜÜÛÛ ÛÛÛÛ ÞÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÛÛ ÛÛÝÞÞÛÛÝÛßÛÛÜ ÛÛÛÛ ÛÛÛÛ ÛÛ ÛÛÛ\n", - " ÛÛÛ ÛÛÛÛÛÜ ßßÛ ÛÛÛÛß ÞÛÛÛÛ °ÛÛÛ ÞÛÛÛÛ ÛÛÛÜÞÛÛÝÛÝÞÛÛÝÞÛÛÛ ÞÛÛÛ° ÛÛÝ rÛÛÛ\n", - " °ÛÛ ÛÛÛÛÛÛÛÛÛÜ ÞÛÛÝÜÛÛÛÛÛÛ ±°ÛÛ ÛÛÛÛÛ ÛÛÛÛÛÛÛÝÛÝÞÛÛÛ ÛÛÛ ÛÛÛ°± ÛÛÛ oÛÛ°\n", - " ±°Û ßÛÛÛÛÛÛÛß ß ÛÛÛÞÛÛÛÛß ²±°Û ÞÛÛÛÝ ßÛÛÛÛÛÛÞÛÝÞÛÛÛ ÛÛÛ ÞÛ°±² ÛÛÝ yÛ°±\n", - " - ÝÄÄÄÄÄÞßÄ ÜÜÜÛ ÄÄ ßß ÄÄÄÄÄ ÄÄÄÄÄÄ ßßß Ä Þßßßß ßß ÛÛÛÛ ÄÄÄÄ ßÛß Ä ß ÄÄÄÄÞ Ä\n", - "\n", - " ÖÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ[ RELEASE INFORMATION ]ÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ·\n", - "ÖÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ·\n", - "º -/\\- THE EVEN MORE INCREDIBLE MACHINE FOR *DOS* FROM SIERRA/DYNAMIX -/\\- º\n", - "ÓÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄĽ\n", - "\n" - ] - } - ], - "source": [ - "print(crack_nfo.encode('cp437').decode('windows-1252'))" - ] - }, - { - "cell_type": "markdown", - "id": "alpha-ticket", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Here comes UTF-8\n", - "\n", - "Instead of using the other 128 bytes for 128 more characters, what if we used them as a variable-length encoding for the whole rest of Unicode?\n", - "\n", - "There were other variable-length encodings, but UTF-8 is well-designed.\n", - "\n", - "- It leaves ASCII as ASCII\n", - "- It doesn't overlap with ASCII _ever_: ASCII bytes always exclusively stand for the characters you expect\n", - "- It's self-synchronizing, so you can tell where each character starts and ends even with no context\n", - "\n", - "Everyone recognized how good this idea was and switched every system to UTF-8. Encoding problems were solved forever, the end.\n", - "\n", - "I'm kidding. UTF-8 is great but not everyone adopted it, especially not Microsoft who had just done _all that work_ to switch Windows APIs to UTF-16, which nobody likes. So now we have more kinds of mojibake." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "textile-abuse", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "l = 6c\n", - "’ = e2 80 99\n", - "H = 48\n", - "ô = c3 b4\n", - "p = 70\n", - "i = 69\n", - "t = 74\n", - "a = 61\n", - "l = 6c\n" - ] - } - ], - "source": [ - "# Code to look at the encoding of each character in UTF-8\n", - "\n", - "def show_utf8(text):\n", - " for char in text:\n", - " char_bytes = char.encode('utf-8')\n", - " byte_sequence = ' '.join([f\"{byte:>02x}\" for byte in char_bytes])\n", - " print(f\"{char} = {byte_sequence}\")\n", - "\n", - "text = \"l’Hôpital\"\n", - "show_utf8(text)" - ] - }, - { - "cell_type": "markdown", - "id": "worse-calculation", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## What happens when not everyone is on board with UTF-8" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "adjustable-northeast", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "b'l\\xe2\\x80\\x99H\\xc3\\xb4pital'" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "text.encode('utf-8')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "printable-practitioner", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "l’Hôpital\n" - ] - } - ], - "source": [ - "print(text.encode('utf-8').decode('windows-1252'))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "eleven-radius", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "l’Hôpital\n" - ] - } - ], - "source": [ - "print(text.encode('utf-8').decode('windows-1252').encode('utf-8').decode('windows-1252'))" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "sophisticated-volunteer", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "EXAMPLES = [\n", - " \"Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in\",\n", - " \n", - " \"The Mona Lisa doesn’t have eyebrows.\",\n", - " \n", - " \"I just figured out how to tweet emojis! â\\x9a½í\\xa0½í¸\\x80í\\xa0½í¸\\x81í\\xa0½í¸\"\n", - " \"\\x82í\\xa0½í¸\\x86í\\xa0½í¸\\x8eí\\xa0½í¸\\x8eí\\xa0½í¸\\x8eí\\xa0½í¸\\x8e\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "surrounded-makeup", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## We can recognize UTF-8 mojibake by its distinct patterns\n", - "\n", - "The pattern of bytes that makes UTF-8 self-synchronizing is a pattern we can recognize even via a different encoding.\n", - "\n", - "The example `doesn’t` is recognizable as UTF-8 / Windows-1252 mojibake, for example. `t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger` is recognizable as UTF-8 / MacRoman mojibake.\n", - "\n", - "When we see such a pattern, we encode as the appropriate other encoding, then decode as UTF-8." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "sunrise-engineering", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Merci de t‚àö¬©l‚àö¬©charger le plug-in'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "EXAMPLES[0].encode('macroman').decode('utf-8')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "portuguese-marble", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Merci de t√©l√©charger le plug-in'" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "_.encode('macroman').decode('utf-8')" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "angry-floor", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Merci de télécharger le plug-in'" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "_.encode('macroman').decode('utf-8')" - ] - }, - { - "cell_type": "markdown", - "id": "historic-cameroon", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## This is a job that the computer can do for us" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "sunrise-madagascar", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [], - "source": [ - "# Some code to format the output of ftfy\n", - "\n", - "from ftfy import fix_and_explain\n", - "from pprint import pprint\n", - "\n", - "def show_explanation(text):\n", - " print(f\"Original: {text}\")\n", - " fixed, expl = fix_and_explain(text)\n", - " print(f\" Fixed: {fixed}\\n\")\n", - " pprint(expl)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "adjusted-civilization", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original: Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in\n", - " Fixed: Merci de télécharger le plug-in\n", - "\n", - "[('encode', 'macroman'),\n", - " ('decode', 'utf-8'),\n", - " ('encode', 'macroman'),\n", - " ('decode', 'utf-8'),\n", - " ('encode', 'macroman'),\n", - " ('decode', 'utf-8')]\n" - ] - } - ], - "source": [ - "show_explanation(EXAMPLES[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "attended-soviet", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original: The Mona Lisa doesn’t have eyebrows.\n", - " Fixed: The Mona Lisa doesn't have eyebrows.\n", - "\n", - "[('encode', 'sloppy-windows-1252'),\n", - " ('decode', 'utf-8'),\n", - " ('encode', 'sloppy-windows-1252'),\n", - " ('decode', 'utf-8'),\n", - " ('encode', 'sloppy-windows-1252'),\n", - " ('decode', 'utf-8'),\n", - " ('apply', 'uncurl_quotes')]\n" - ] - } - ], - "source": [ - "show_explanation(EXAMPLES[1])" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "yellow-running", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original: I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎\n", - " Fixed: I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎\n", - "\n", - "[('encode', 'latin-1'), ('decode', 'utf-8-variants')]\n" - ] - } - ], - "source": [ - "show_explanation(EXAMPLES[2])" - ] - }, - { - "cell_type": "markdown", - "id": "laden-trick", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "source": [ - "> **Side note**: ftfy adds encodings to Python like `sloppy-windows-1252` and `utf-8-variants` when it's imported. Python is very strict about encoding standards, but to deal with real mojibake, we have to be very loose about them. For example, this tweet requires us to recognize and simulate a broken implementation of UTF-8." - ] - }, - { - "cell_type": "markdown", - "id": "indirect-grounds", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Avoiding false positives\n", - "\n", - "ftfy only changes text that trips its mojibake-detector regex.\n", - "\n", - "Here are some examples that ftfy _could_ consider to be UTF-8 mojibake and try to \"fix\", but thankfully it doesn't:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "specific-buddy", - "metadata": { - "slideshow": { - "slide_type": "-" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Con il corpo e lo spirito ammaccato,ʏ come se nel cuore avessi un vetro conficcato.\n", - "2012Ѱ\n", - "TEM QUE SEGUIR, SDV SӅ\n", - "Join ZZAJɒs Official Fan List\n", - "(-1/2)! = ù\n", - "OK??:( `« ):\n" - ] - } - ], - "source": [ - "NEGATIVE_EXAMPLES = [\n", - " \"Con il corpo e lo spirito ammaccato,\\u00a0è come se nel cuore avessi un vetro conficcato.\",\n", - " \"2012—∞\",\n", - " \"TEM QUE SEGUIR, SDV SÓ…\",\n", - " \"Join ZZAJÉ’s Official Fan List\",\n", - " \"(-1/2)! = √π\",\n", - " \"OK??:( `¬´ ):\"\n", - "]\n", - "\n", - "for example in NEGATIVE_EXAMPLES:\n", - " # ftfy doesn't \"fix\" these because they're not broken, but we can manually try fixes\n", - " try:\n", - " print(example.encode('sloppy-windows-1252').decode('utf-8'))\n", - " except UnicodeError:\n", - " print(example.encode('macroman').decode('utf-8'))\n", - " assert ftfy.fix_encoding(example) == example" - ] - }, - { - "cell_type": "markdown", - "id": "prompt-playing", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## The \"badness\" metric\n", - "\n", - "ftfy doesn't just look for UTF-8-like patterns of characters, it also makes sure they are unlikely to be the intended text.\n", - "\n", - "- Improbable combinations: accented letters next to currency signs, math symbols next to console line art\n", - "- Lots of things involving capital à and Â, where it doesn't look like they're being used for real in a capitalized word\n", - "- Unloved characters like `¶` PILCROW SIGN, `‡` DOUBLE DAGGER, `◊` LOZENGE next to other mojibake-related characters\n", - "\n", - "...and many more cases that it looks for in a big regex.\n", - "\n", - "Strings that match the regex can be re-decoded. Specific character sequences that match the regex can be reinterpreted even if they're inconsistent with the rest of the string.\n", - "\n", - "> **Side note**: We used to try to categorize every Unicode character to find \"badness\". Now we only categorize the 400 or so characters that actually can appear in UTF-8 mojibake, because ftfy wouldn't have a reason to replace the other characters anyway." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "reserved-sheriff", - "metadata": { - "slideshow": { - "slide_type": "skip" - } - }, - "outputs": [], - "source": [ - "text = \"à perturber la réflexion des théologiens jusqu'à nos jours\"\n", - "\n", - "# We want to highlight the matches to this regular expression:\n", - "ftfy.badness.BADNESS_RE.findall(text)\n", - "\n", - "# We'll just highlight it manually:\n", - "term = blessings.Terminal()\n", - "highlighted_text = term.on_yellow(\"à \") + \"perturber la r\" + term.on_yellow(\"é\") + \"flexion des th\" + term.on_yellow(\"é\") + \"ologiens jusqu'à nos jours\"" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "developing-intervention", - "metadata": { - "slideshow": { - "slide_type": "subslide" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[43mà \u001b[mperturber la r\u001b[43mé\u001b[mflexion des th\u001b[43mé\u001b[mologiens jusqu'à nos jours\n", - "à perturber la réflexion des théologiens jusqu'à nos jours\n" - ] - } - ], - "source": [ - "# Highlighted text shows matches for the 'badness' expression.\n", - "# If we've confirmed from them that this is mojibake, and there's a consistent fix, we\n", - "# can fix even text in contexts that were too unclear for the regex, such as the final Ã.\n", - "\n", - "print(highlighted_text)\n", - "print(ftfy.fix_text(highlighted_text))" - ] - }, - { - "cell_type": "markdown", - "id": "suspended-angel", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## ftfy is a hand-tuned heuristic. Why doesn't it use machine learning?\n", - "\n", - "I don't want ftfy to have false positives. It does, but every one of them is a bug I should fix. The actual rate of false positives should be once in several gigabytes of natural text.\n", - "\n", - "- Machine learning techniques aren't designed to have error rates this low\n", - "- Machine learning would have a tendency to make its output look like what a language model \"expects\" even if the text didn't say that" - ] - }, - { - "cell_type": "markdown", - "id": "fourth-pepper", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Why does mojibake keep happening?\n", - "\n", - "The top 3 root causes:\n", - "\n", - "1. Microsoft Excel\n", - "2. Programming language APIs that let you confuse bytes and text\n", - "3. An outdated heuristic called `chardet`" - ] - }, - { - "cell_type": "markdown", - "id": "outstanding-metropolitan", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## chardet\n", - "\n", - "`chardet` is a heuristic that takes in unlabeled bytes and tries to guess what encoding they're in. It was designed as part of Netscape Navigator in 1998, then ported to Python and other languages.\n", - "\n", - "It doesn't know that the correct answer to \"what encoding is this?\" is usually \"UTF-8\", and it thinks emoji are some kind of Turkish.\n", - "\n", - "> **Side note**: I recognize that we could be looking at ftfy the same way one day, particularly if I stop updating ftfy's heuristic. But chardet is fundamentally built on assumptions that aren't true anymore, and its original developer decided long ago that he'd written enough Python for a lifetime and he was going to do something else.\n", - ">\n", - "> It was an okay idea at the time, but we should have been able to replace it in two decades." - ] - }, - { - "attachments": { - "excel-export.png": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnwAAAIrCAMAAABCuLQrAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyJpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuMy1jMDExIDY2LjE0NTY2MSwgMjAxMi8wMi8wNi0xNDo1NjoyNyAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENTNiAoV2luZG93cykiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6NjM4NjVBOEJBRUJCMTFFNTg2OEZFNjg2RjZEMDExQ0EiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6NjM4NjVBOENBRUJCMTFFNTg2OEZFNjg2RjZEMDExQ0EiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo2Mzg2NUE4OUFFQkIxMUU1ODY4RkU2ODZGNkQwMTFDQSIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo2Mzg2NUE4QUFFQkIxMUU1ODY4RkU2ODZGNkQwMTFDQSIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/PpugliMAAAMAUExURTqQ283NzVmO0M3e7tv//9To/OHHbo2MjP//25Zld+Xl5VvRIABmuB15wMH//9PvtM7l/FCAu7XD0bb//+vr61ad5duQOm3G+mhnav/bkNTU1I5mkDSGxSFyRj+Qbdi6Ysji/Ojo6MyrS9v/22YAOvPzxfu4lWlllWa2/8DAwLm5tZKeqe/v7sTg/Ozs7PHwq8i15/DW//Dklq7b8ObVfqGs/zBXgPz8/LZmAKurq/DT3AA6jzKx/auts19n0n+Z/oyOZt7e3pA6Olaf/fHx4pA6APHx8dDFiqCgoJC2//ProfParau1zvPz88PBvPr1sjo6kJa822Y6j6Du/+H//zkAAGo1NmZmOpC2kDoAZnqv3WCq7wA5ZYfO8Orr8zMzMmaXzpDbtmYAAPn5+enKfwAAAPX29srR1m85eABerJDb29nZ2ZfE8bOFSHmBh/b29qyki8yfaIPH6uzTjrT92VQAOb7G5LTDu5Gly9/l5mYAZh45XzpkwgIZ1gOY5m2mhurqp+Pp7QAAZtnRvQ+p9jkAOsbu7HOwtkBqlQESr5Ll/LbbkOrqycNOTgBmg1Kt19bq/PPrtFSXrSxRqQAAOqjq6dbb2QBdjfr2vEk8WNHj8+jt6s/V3uvx7eqnXvv57gABiTJ+3VYAAJ7AosjIyNDa1ODQlJBcADqOj7Dl/ISA0aSlptnf3mKe6PLy8zc9Zx4znMfOypPls5eXleTm6unu7dTn+ezftv39/d/o5X00AAA5Ou3t7fX498rm9rnVxv/++vT39cLv/8DHyVMAZf//tuvdi5Db/6ioqM/l2DQAYapgAMvk/P+2ZgAAYMfi/Mbh/NLo/P7+/unp6bb/tobb8u/Oh5TZ96vw8JA6ZvCrYZBmALGxtPDElzo6ALZmOgDPeauXzGs4Xzuv6f/u//36wfHvz/v7+8/Pz4fb5J/P/0k5eF81NdfZ5tny887w8O3r64LCYK+vr9/g4AB/3dfX16vw6pXn2NFOGtjc8DU1X83t/2mn9PDw8DOZ/////5reOOAAAEQ3SURBVHja7J0LfFT1mfcHG0PIDgImDEkaDcSIQ8REEWUwolwFuUjUIipFJGFCaSQRrKhALVK1tNtaL9G19fbiRkVhqb5Uiy7YtTbEC1M1yKUu2lYqZHdb7ZLyeV93Nz37v1/OZWYyOWeuz6PMnDn/c84kM988z//yO8/jM8DAUmQ++AjAUgtf184JYGBJtJ1dDL6unccaG1ubwMCSZK2NjccQfhi+nY1p4YQnQRzKYjN9u/NaG3di+LqOtQJ8YMn9dou3tx7r8mHHNw/gA0vut9tTPK9xJ4JvQmNvuvx4HQ3hcDhoaemo6kSPoSDbivgtbfTR9jRiAb9R8NU8uxZ0XRjxJx++5sLexgkYvtaxaQOfHUN8N2audAH6gQO+vsJXWp8XuO+r/+qzha9gaiew4f23ux318rrEy7Kesa0EvqaajIAPEWQEWtabYIkLPoxrRHGp+uGaJwXz6NvFvk7C191T00ThK04r+Apm+IxIXV7BDByBSxeENxZRVhB0BVOXri3qWFuEd6PGjqqFGz+s6uxo8HdUXbsg7DPYftGMm5hv67jqjN8W8YsPqerEl7+evhOmGszrb/dwe2HTTvFywJTitIMP9fkQa2s/XFuEKaE0RnzMUYWCHVfl/a4TOaqOBoTlPQgtHzrimzMQZw2IIbyD7adPpAlfBeEqnB6+OLogcXfkhdYM5tW3u6O6eXt6w0cpwxGy9DpMBHZhYT/f7UfEBHwohpJGxCLa31H1iY+dGAqy/aL5E59hho9cHB1Oeo80FkOnLxnf7sCJZZdNyCz4yCPbXVq/Eu3fhFBR4WtY4qdHFMwww0eaHOBDHUG0QeADzwfwCfjUsNvgk/AVzGjDsG0somH3uiLq+TpDPnIc3sH2i2Y6jSJcG+4FsrBLhiE07EKfD+Dj83zlCLtAXR7eJgMOEXbxXuTR/DQctxHw8L+CGWgEsZCPNNB+2Yya8gw5OYN7gfjiaMARQb3Lb9F3gtEuwOfhCkdU3wZdPoDP0+W1QBTnFgLHB/DB2i7AB/CBAXxgAB/AB5YT8EWVVNF1WjR4aIs6OrW5Bpuq0eQE3sq02FAGxFqZBF80VUtH1RLKYAz4rNdwhs8rmRY1mMTJIvgWkuXY+6rcg88zmRY1mL7ONPicJFUdVUvXFiFMlqJXoXDYj49Eh2BdVRGTUqnkTA5jjRU5kAmucAu5Hn7c4pFMK0hOCwbIu8PCXYb1+RwlVXipFtFYP4S/ksu/TEMl+nzoGoiQyEamSuCCq6pOdj10OOs5eiDTQqdFEHn43UGykHlh115ShXVQ9XlUyxIJh9s6peSFq1lUN0mXfsmBTHBFhFT4etgheSbT4u9MhDbQ6ctk+BRJFfZ1U6+tz8Pf8nVF6Hu1wGfqsIkDmeCKvmY9Pc9kWip84PkyDj4HSRV1ZKTrFiEBUUZmpqGywhdhkdPHJFbsejLseiDTUuCDPl+mzfM5SaqIJ0IdfNLbC3/9mk5yOPE0XEol5vl8Qm2FD+yggis24EAjEBSMt3gk0ypS4YPRbgbBl0kWh1eDLh/A55EFYro1EGsBfGAAH8AH8AF8YAAfwAfwZZCkijTyZTT7Lr77EimeLgHkUdkOX3RVS6eYSXYyDyRS/KZhmDbJdfg6osupvJBI8WeYMM4F+JwlVWTJgGhTNL3UwrZPEDMRvO7glUQK5FG50edzllSRRo6LopfyEa9EdQXeSKRwPxNEArkRdh0lVYZU5Wl6KQQGYo4GRy8kUphs6PTlHHy6pIoJoyx6KYob7fR5IZEqwI4TPF8uwBdFUqV5vgjXJyPmfnYN9UseSKToadDny4l5PkdJVUOYiZpMeilyDyQfi3ogkaKnwWg3y+FL3ELB/r59DM8GXT6ALzFy4rLoEimQRwF89l5pRhu4JYAvZWEXDOAD+MAAPjCAr4/wRZVU4QUNnxFAR4Q30vneEFnhjchHuh5seCKtAk1VtsMXtfbadWzil0/JlV5OZ4SL5OMFbE3EA2kVzLXkMnwBzpKy0IWIwJwFfPRRNHqRfQpmmXMAPidJlYAvIr0ZQo3A4qePAhEPpFWwvpYDfT5HSVVkI92pejPUEdPgC4SZe3JfWgXKgtwIuw6F/wLhNpwesnS9wdLsERbNnq/NqyKB0OnLNfjUwn9sFBsIyg4fi7NKn4+t8npQJBA8Xy7A51T4j0rlpQOi0ZAOgunj6CLvigRCny8X5vmiSKrw/J50QGTKz4dn+IIGfVSy47ourYLRbpbDlw7m5OCgywfweW8O0irQVAF8YAAfwAcG8IEBfAAfWIbDR1JehHxKiT8xxUfzfDuPEVQVlGJi/g4M4Ivh+UiJoTylxJ+Ej6xQ+CVTyplmFZQOH8yRAHxxwUdKDKkl/jT4ZIY0HT6zCkqHD2aHAb74+nwBuvoqSvzp8OG0FTg1FY7Bflb9z1YFJWrvQSkWgC9u+AhjSok/K3wGz1lgyLVgswpK1t6DIlQAX9zwhUrq8tQSfyb4pqql/MiW4SBEYf+g/B7AFy98kbozZgTVEn+WPp9MTUW34oAPPB/AFwd8BVR/p5T4M412g4ZMTUW3DAcVlAIf9PkAvjjgI4v3Ib9S4k9M77EcVQar+RcK+9mWYauCUuGD0S7AF1efLyGL7tqgywfweQhf9ARToIUC+LyEDwzgA/jAAD4wgA/gA8sg+Mh9YpbSfqqMgM698Cz0fExhTkolG/kUjfNQxSrGMmVziXmA3AvyrcyFj5QcuNzuW7VuqzstSakU+NAWuY3S1uzFWNrKsRHzAGUvTOtkLnyROidFaHT4LEmpdPicywXai7FMbMU8QNkLE9oZC59IUMBL+5UuCG9ha2QLwhu1akTqTlNSKixmoZmoJHxUf0Wvp57HxVhGh6rDwmcT3Rb6c4h+QAfIt7Klz1cwQ6Yrw4KohiDT0tN0VbzPx/QuYqealIrmtWrgC78y7JLL8uuxi6liLEWHhTepHEvxxfYHgHwri0a7pQvU6mr1eczJ0XRVJs8ndqpJqWReK5IyiNWqpPordj15MRU+kxSmqtMmqloPAPlWNk21BPy28OkCetNOc1IqnolK9suo/orDJy7mOnzg+TIWvtEYGZ/y7cqw2+Czwid2akmpeHYrLb1aRCSExNfj52lirKjwOR4A8q3sGXDQ2uFqRV054LCEXbHTlJQKh92FLF+VSJlG9Ff0evI8ZWrGyhbWbfE+n+MBHSDfyqKwa5mKu87dQKZdL6aj6mNBQejyZRd8oTp3A5l+vejV/vrchQP5VvbAh9PAb3TR8bl9PbAs93xgAB/ABwbwgQF8nsDnhjTKWUUQ9QDQQ+U6fG5Io+wVJ4aURoEeCuCzg88NaZQjfNEPgBniXIfPNWmURe5klkaBHgrgMw843JJGWeROZmkU6KEAPjN87kmjTIv+pgALeiiAzwKfe9KoROADz5fT8LknjXKAD/RQAJ8TfC5Ko3S5k400CvRQAJ8OX/zmvjQKunwAX5zmvjQK9FAAXxwG0iiAL6WeDyxL4OtKBwP4sh2+3mP8u1bh24lsQqoN4Mtu+N4q29WIv2cMW5cC37HGxtaU2ySejsA6XCDTLKFwOErtP0fVgSkPlZbcCjRUSYRv64BR88gX3dh4bKcCX1NvTfG+VNskR4D4bmdVi+GQQciah0q9HxfmVpIIX8usMVMK8fdcXNPb1KjANywdzBv4zHmotJvBYVY5efD97MJN8rvW4PufNLBJosAaUUDh6ZQgF1ap8EmtlZoySuit6I3nNmIrXhRwKTrgPvoesJ6WPPiumTr1Rz/6Ef2qhzWlI3xEukKqCGFUDC6sUuGTWislZZTIVKpmO7CIrVh1IqohQNugJEieTdTg601Xz0fDJF09U4RVHD65S02pQqoEyqKAtmIrXpeNVQ0MQqcvZZ5vV0bApypa1ILPZiXejCCVWMmigNzbRYMPPF/S7K1rrvmRhK84beFTw26Dzwyf3CVSWflYtUBZFNBebGUJu9DnS5Hn25emfb5wOSnel4e3g1xYZRpwmMIukV3h289EUUA7sZUsChgiAw70HjDaTaLniwu+Gx/ccvppp52WCvgSsqgTMFGdG3T50svzbTlNsy0ZD180sRVoqJLq+X4UCz7JHT7j2GmZDx9Yxng+xesZ+P8MgA8sW/p8p5127dunnfbqL6nnA/jAkur5Tje+853W08HzgaXA85326jOP3XZaajxfVEkVLgxDRhBtUft4Ntewy87ilUyLDWVArJVYn+87xu7TU9Tni6pq6ahaQhmMAZ/1Gs7wuS/Tgkmcfo12v3Nswump8nzR4VuIv+7IfVWphi+qTIsaTF8n5vle/SUJuynzfE6Sqo6qpWuLkE9ZKnJVkUMQkxuL8IJGUCeVVg4M0cIeQnRFpVYsyZU83A2ZlprzChbuEhztGqej/1PX53OUVOHcoYjG+iH8lVz+xekzeIlodg1aOZC5JSG64nqsoOg5uinTkueDZCExz/ePyO098VJK+3z2kipai03RTkl9C9kKBTU3KUoYtSmiK3o9lmdIwueOTEv9B52+xEa7KVzhiCqpIolrr63PE9opC3xa2O00NJGVEF3RAYMXMi3tH3i+fq1wpGBtN6qkirohnyG0UzIyk7DL2NDg40mtuOiK574yh113ZFrKP+jzJTDaJYIWYmknqSLeCPXypXaKHMLjJp+AIdfwMQjogVJ0xaRWLMmVNuDot0xLPR9GuxbPd+Gmk4bzr9pRUrUo0yRVXljiMi2Y53OAb9Omk/4pJnypFJNmCHyxcmKBWMve8wF8bsAHBp4PDDwfwAcGng8MPJ9Hkio8QeIzAlgDxbKShsgyakQ+0vVgw2WpFOijcsLzRVW10ElfZZ6t9HI6zVskHy8wzTXHgi8uqRTMmyTD852UUosFX4BXo1SWrhAXeDo34KOPSqObUimYMXYDvguHDTvppCjwpfbHiyGpEvBFZBhEqBFi/PTRMFfqdUUqBWtlbny7A4+X9e5Ma/iiSaoiG+lONQqi7pgGXyDsV52bO1IpUAm4Ah9OCJ72ns8xS1Ug3Ia1zKXrmUcj/skwe7429zNaQacvN+HTslRR2VQgKDt8LM4qfT5NWeVSRivwfLkDn2OWKtwVVNwQ7d7RQTB9HF3kRUYr6PPlBnxRJVUNZFwg3VCAzuVFyNxehB8dNA04+i+VgtFuTsDnuSUklYIuX9bB12y21MPnIJUCfVRK4NvZ2tTU1LrT/nJRG+OAL/08H1j6wNe168SJKc1Tmk8UH7Ne7FhxM2qacuLErq7E4Zs9btwebOPH3/IAFP4D+KQ1NfdsJxuthWWXma91WVlhK9nY3tPclCh8XePmF8zek//oZ599Nv8WgA/gEzavrEZsbz8xVr/U2BPbxXZN2bxEPd+4+fP/5V/yZ9Z+9uij+QAfwMetsUytwtt6QnNvTSdalVc1ZY1xvf/hwzbwzTfyK8Y/Or+WwBdFCRXwa416500VCvDMBWwAe0+npdcnJFImBZUYYICEKsXwTemhPm87dXE1zUrXrqu5RmvsmRIXey0th23g+5eKmctqZ89mns88IuCvySyI0ugMX4xBBrn1XFyTK6i0C8DUSorha+0m3qywu5sFXyXOoijMXF53dyHxkt2tcbHX0nLcDr6KfDTwiAFfwGe4Bh+fNFYUVPoFYFI5tfCNmqKzZ9Qo3m2K2MnomzIq5psfbyF22AxfbQGCb9ye8flzFPiwbuBbWNX0z2zxgTojkdBRJJ4y6MFbeP4puiCmaaiWKrIpfiiZTFYUVB28FiB9P1hOSy18hC/EXs88FFu3ow7f9jIxo9eFB79NeP/2HkqfSqbRQ8305i0tRstk/KjCt2d+7ewzKyrzx48fn58/UcDHlFBseZXsXmsqg8W0UYbIfSHEV6yRa6hU2RRPk8GupiqoeFEiEBKkHr6u7u2EPWYIlAndYrLvWPcExI5oRPRt7+4y02d+8xbCn2GCrxYF3MqKCoTe9Arp+ZjIiQlLzPCpiacMkfVHqKf0xCnslZ6divlRE3zy/aDTl3r4auKDr8YMn2HDHnN+pgHHeDzLXFlZkY/hOy7hoyIne/i0xFMKfFw9ZQ+flp3K3vPJ9wPPl+Kwu4926WomYDPwrLLga2cZHnyQhhraKdxnGu7asmcHH2JvPIGv4gnF8zEllBJ2lT6flnhKCbtcPWULn56dSvb5RBTXwi70+dJhwFETe8BRE+eAwx6+PWikgeCrqKz4ntLnI1GS1eYLKWondouHkniKOkg+4LAPu4psih2qjHb5e4bU94PRbmrhY7Mn8U21sHmZvsOXv2c8hg+5voofV0Rd4XDTGSnzfFHbwVI1yVzYl0nmQiNB+NBIA8OXj5wf9XyOFnDPGwmJlMM1QUKVavgalaVdvNCrLa/pL4vLjiUI3/j8ChRyiefL3wBruwCfBGyf2K6xCgskmvsSFxYYT21Aju/ZAvrj/Q0se62PkqrG5ik1eJw7obfHite8sp5e0lg8pbnRSBg+7W8DviGAT3btdjWXTWmeUnaieIKVnQnFJ0hj89h+iEkBPoDPUUbf1djU1NToQFfUxgThO+UC/oDt4q9dYP0tTpH79pPzzrP7XUegvQenWXbvPw8wyBT4vDTrDUQW+GxNbzUfy15ffP+0EX88eP552iF44+oLgAOAz2Y85CJ82PFdbJz7Nwt8F4PrA/hiwHfKv5NwSnyVgSC6+Gv4ET+dV3QBeXWpAht5ffV5f7v4j19jURg7uF9NuxofdAraJdsuvmEagADwRYcPYYL4QltX09B5LukAoqcRxgV4N/dg/AT0+pRLT7mUu7lTLtWdoWxTW8AAPjv4LmBPF3/tUuLa0OP+c1FP7m9knxxmkFBKX5Mw6wCfbINOH8DnAN/VF1DSHOC7gcF3qU4WfR0ffOD5AD4H+Pb/cRqZDhHwKWE371IRds/TBxj0tRp2r1YGGVrYJc4TDOCzm2RGw4s/TlPhwwMGOuAgDsw4D3fw8rSwS18jSkf8cdp+2jDiPAnffrUNRrsAnyN8Lpmjg4MuX9rC15RKc3V5bYS9h4MVjtTBt3XAqHm5AR8YwNcn+ED0ltV6PoAPDOAD+AC+dIMPZ5s3Qr6OBnZ7Gbs5nFhE1PuzNy0NvcPNG46FAh0zVhl6XiuwLPZ8oSC+p6yjaglNhKHAh2/SLb08MfjYrmiFAs01//Sz4X62XIAPMRDCaQMWkhpr91V1aiU4olps+IwohQLNNf9MZ8OdvLnQ5wss8eOveylComDqUgU+npGAFe5buLGIJp4iOatY+ikSRXn+KpGXauOHDWFeks2pUKBNzT8l0VUn5DDICfh4lioSf4co8OH+GfKJDSJZhqzXh9NSkTwYBCSSoopWqqQlA3nhQNLrcygUaK35pyW6guwtqYHvjdHr1q0b7khM65z/OJgYa61nfjoYP37joApfqKSO5v3B8bdDhQ/7sqBSuE+ksgq3dYpMVT5R7IrlpVILB0YpFGhXdk2r1wadvlTAN3pdZ9P0Qb6n3Pd0Z36jJPhGkw5fpO6MGbQ61dRr8cBDgw+NSGXhPlkZEIHB008hTxbk8FkLB0YpFBgLPvB8KYBv99FBwaamp+bUF7nO3u4zv3Ht+s91z4fxYNXsI7S3JeAbjanzycJ9rBPIc1bR9FMYKKXEn6lwYJRCgXY1/1T4oM+XCs83p/7e3buR6zt45jdaXjw05yfrHkSwrFy3bjBiZweKx28geFatw69R24tF83BVom3rqG2L5fkOriTnN5FTV05iWVJC1L/9togyxDLQ06LgsnAfK+rH6vfR9FNkKpBjQ/NS0Uea6ipaoUC7mn8KfDDaTT58ra1z1n/e1NR4dNCIMz8dvvuzT4O7V607SLzWQdRlm/fxi0Voq7X1zKvw8xujWd/wTcLemzE930HxPzp1dDwrHJ6Vr4ru2qDLlxrP9x844E4ftPhRRAgisfHMb3S24jFIJ0KG8dPUtDLYiLwX8n9vSPrejN3nO9g0ejB6pKfOSSl80bNgQd6qlHi+6bjPh4a0xMPN2fQ55g31AEkcPthK4ds9ejBykfWHmt7YzU98MzZ7+Eq7pw/6Hr4ePnV6auEDSz/P1zoadeRQyN2O/dSZnwYbV714cPRDn8/5D+LyWNi8qqi1CYdl5bxt8Yx2Dza9gbxeJz31DRAWAHymab6ncIwNEj/ViuMtGnCcOWjdQyWK50Pjj3WIRxQ8B/dttIsCNoa5lZ4K8AF8ZvqQtba2shc4vP5qN9mnBmcUNXfvRnt27+4LfbvpyfTyu3cDfABfDGJWPvS5d8tripbKUrHPbJEwlxqQ6ZigU/8wwKVYEVY1MOJnm7wFTzNby17R0207mcpOkxKLt4AGy3X4pg9at+7Fojd2ewef1FLFHHo0yCId9qMQtjvEsOxgdQ7wggidpuYtEaJM8MnJvvjhMyuxxEIezM24Dd9T81C89VJYoGip4hn3xgXf7zolYpiu0vXkCf3jLXiOL9Cy3kRMXPCZlVhSAQaTM+6HXW9VLSYtFZNIUfkU+oqp8Jhs41C7hUmlWHkOrITKE4dsZPVj8A6f4AQDGGTwiRYEHXpL5BjXFrE1D/yeH+LlFb8i0MKM8eZO2kGwUWJ1sp8A1uMyDz5VS0VEKGR5FsdY1AlkkimyrXq+BrKshpdk2Sm6kIpuMPgwYZGNyoEkMndclfc7fHl+cSLZ+uYMRcPF9rP3/uaMIAvl0umxJWHq80CJkInwKVoqKUkhW6Eg6ZixbWvYxeGPaVk0IRXr/jP4iAQwEG5b6JMDg4gfS/t8AZ9+8U+YQAu/M/sBePMnPsOCGBPDkB8SOn0ZCZ+ipbLARyRT8cCnCqn4mIP2+QQUdLRBH0vrV6JTN03t1OBroKJqouEywUeaHOAjPyR4voyET9FSydBIwh0hKuBj2xb41LDboNUlp0GaSanWM1LIPA2bzymYgYvwNmwsMvSLh3wG13Cx/aI55NMGtaoSizhZ6PNlJHxcS0WYC1NxKA6kbZ3s7km6TeMhHXDgeb5yWrePnCKKALLGIKMabQSCdB8BTd43SW7Z9BuGcnFCPxpBLOQjjbZOpRk15Rm2Siz6Q8JoN+Pg89zQwMLV60HtQPfg25dKy8jlNagdGC98LbPGTClM3/x88A1ltedL7+SQ8A0BfAAfGMAHBvCpNqG5rH3AgAHdzV0ewYdXRuvy6KQKmUtjcxa4Y8/a4h/aKutrukLLotdSM1E556sCpVQK4Sssa+/5yy5sU9rLCr2Ar3QBnlO7nKakItzxNYg83pYYfM6N/BWfHYmSrwpmUFIHX3v14UmHhVW3iy+xYIY/FCY3xSq6TuyrIlisSZWbISYhYTvRazprTHfiVnzTOF2v5+sPCEGZRIq3eQWf8LHO+apAKZU6+ErGnBDbZ5x4s0R8bSGxmqXJLAtm3IdFSnRZ63KeG4DsFKfQnaR1El+U5etjGDeZRIq3CVmTkkZKbLKkVQbPWyXEVeR/5YylDeyWcrpkskVkJVBUUjTQB/HD9Zh8nB0BVs1SB9+fmZX8XckkAR+daDXBJ5LfBbhEmMcsvJOuNOCvksuJC6ZOUhYMSEoqfFpISJd4m1A3yTRSclO4NJZAQ4iriARPTTzF14YjPp5rgzpZVRFAV4pFJrUQ6AVSCN9hDt9XJv/dTya1MPgm00GADh8lTYOPf3FEUXdfAxo7oIPEt9mxVvF8rPdXWn8GBbaDelDaoKtaFI7YAi6R4Iu8VUxcpR/GM7DQTFYsy5D8kcXPwARaPirCR38w0OlLFXwtk1TPd5x5vgbWP6LwNbCbcqQbEcpNPlQkO9sO4tfo2w+KgaTS5+ODTiyv4qfwtqjw0aRVCnxcXGUPH4WLwWf1fJx3rJEicj/wfCmDb0eJHG7849aSWYy5a+/ptHo+5Zukyk0KoIAPj0b8MuziVpIijShILqcpqfCpPulHWZtJUqVTRZNWKWG3wRcFPp7JioZd2ecTA2D+M2ON1M8+LTJAKZW60e7AkpIW8f/AN/l3pPS17OAj8yVKvJLwiQyNpHUSH01QYZRfnY8jcZu26aonE3wsaRXBmA847MMukWPRHFbsUGW0y/+YiOqKaqTQiB5Gu6mdZEb2+9+fbp6yIHI4pz4fnTmJKIOStUWUkyCeUQn5+H0P0VY4kuBxlHm+mMeApQ98RqguT/b55M1ilDOi3KS58QR82Of4yW3bfjJexq1Rl9cCnrscoYKK8laglEoZfMuXL2//PbLTf/97tLk8+sF991WwtgvwOdsdyH5PDW/GeLc++yqAD+Bzturq6ttOp4Y2q13/8eAbAvhS9uPBNwTwpRA+OV6J3+gAmwkbHOJ/rKKByoUMFxVYIMHKKPhiJghSbtc1MWM+l79GI5+YRQMt8Dk38lfxKLBgaibX4Qv4YhcNTAC+uBRYMCmdefCpqaJ0tRSOy/dZ1VASD7z7Wzhh1D8voIskRCKlFg1kqislnRV7y34rsJSqhB9WTSY/MyzHuQbfLbcsQ5afn1+R/wvv+nxBNVWUWS3VUXXQsKqhBHxst8xcQae01aKBhrJsK4sHuqHAUqsSNvjJzwxCBNfgOzDulgPjnn30s88+m5/voeez6lakWkqtLyTVUAI+UQ1Qpi1dyzwbLxrISgUq6awMlxRYSlVC/g86fe55vvnzH9hTMfOBzx59oCLp8FG1lIBPU0NJ+K5j6ZfN8MmigaxUoJLOynBJgaVUJRT/wPO55vnmI6uYOfPR+bVewmcVTUm1lNihqaFk2KW7lbCLfY9WNJBeR01nZbikwFKrEvK/EejzuQlfLYKvdvbsStnRiTo1F3vkaD/g0EVTiloqxOr4aWoobcARvo8mrKIprPBwVCsayK6jpLOivrL/CiylKqHoKsBo1z34amtnI/jGjRtXqc19NPjdgc8LE87H3dJZNgos6xtAl889+JbNxz6vouLB8vLyB9VPXLn3Iu3gEwoHd+GzUWBZ3wAkWC7CN3v27HEIvsFHjgTL1U8c319BJ+dCNOyUru+gM118lBjk82t0lozOsiUHvhjMZ9Ab5Dh8OODeXDn47rtHmOGTCeMjS3wke62fT5HRFt4pJ911u74QCAsAvmjwjRu3Z09lZcXgi+8eMViDj81dhIKoC/S7a/Edj7KvzrN50/k1PqHmA/gAvr7Al4/YG19ZafV8iC+GWMHUpVed8du/JwnldfjY/BqfmQskPeyCZTZ8iL3xlZs2IfiOlA8uV0a7QZEwPoCDbolPnSKjLRFRTYXOzAV8VviiCqOUqoDaaNaxQiAv8RJD/AS6pwyBb/z4/MrJm47cffedqwdfKeb52joNma8dsYD/KVNktIXNr9FZMjuB3SS7TrsijIpZFdDhXBvxkyZggfmQzIBv/LL8/MrKyf4jgj03f7xo8AV8sasCOp1rFT/p6imYCc4E+CqwVa5cOdm/evDZhofw2QujZFVAVRZF3Otklp7NbyglAkXRPrP4iQiz5OVhDSwT4Hvq6C+eraj87nTjSvf9ngqfvTBKqQpo6LIoOa1jKhFYlyd1TWYJgLw8rP5nAnwe/3hy8GwrjFKqAqqyKLkMay4RqOuaLOIncXno9AF8k9SZG9JXs8DHqwKqsihb+MhUjq5rcoYPPB/Ap4RdW2GUUhVQk0Vp8GklAjVdkyZ+0sIu9PkAPn3AYSOMklUBVVmUon5isideItCka1LTT4XUy8NoF+CLusLRD+8UK/0UdPkAvhjLa4mnqYqVfgp0TxkB34nqicfLUgQfWG7DV3a4+eyyiWOqqw9XHz58uAzgA0sefNXNnw98+ZHDx+irwwAfWPLgm3jXY2effcfhCVdgMyaysaOsORSHRRP7AnwAn7Mdf/zVn776+MRjhYU3vf5510SJUyRKCij7xCoO8MUtqZKqGHn3XIRtyKKBbJQBOqosgK97Ytmry7dORJ5v8/PBnVslTtGQ6hN88UqqlLRT4u65SButn0DbQEeVbaPd7q2ztra/NaHwy3//VacxS/F8tEABKRgg0u+oUhMliU6nNVEPu5uoD5IqJe0UX4+jNWKoaJX4QdBRZRd8yN7aiuytt7ZeIeCjdFEhipp+R5GaGDKJTpVNoh7GQR8kVSLTgLyBia6p4YQatA10VNkH39bXP7gf2eFCY4f48kmgo0l0lPQ7yoKroSXRsSTqYXcT9UVSRQsDavBR2GYEWRvoqLLQ833lx8HVq0+uHqPAR6pDKYl3hNrEBB9NomOXqCegh93YkiqSdoon05N3z7GuHC4aCDqqbIOvuXnMS6dv+c6V91cf6xnQrg44eIVcmX7HBJ9IomObqAd3zfogqWLDWNs+H2sDHVXWwbfjRFl394D29jfHjNlRLb/SAAKK9KSU9Duq1MQwZBIda6IeNm/SB0kVLwxoyLvnaC5GdCJrAx1VtsHXvWPHjq3V7QPaq7eijb68dxypJPogqeJjaEO5e47M8wXFCAh0VNkG35gdO2YdHjOgfcxEROEOL+Bzsr46KtBRZR18xye+tXXWrFmYvIEDByYTvr5KqkBHlYWjXS9/PPiGAD6ADwzgAwP4AD6w3IAPiwzq8hRZFJ8y8SxJlXYMWA7DhxfGjNLLqSyKcBcKshkTF5NUwcQLwGeFj66RGaxSH0aQrYK5m6TKajDlDPBxoQFfKQ4yKtxNUsXOxA/XY9rxajMstgF8nAEqi0LOikZdl5NU0TPJQfgBZ9AHmQHAJ+t54N5fKc4srsDnVpIqLib0kd0dV+VBpw/gE30+PjjFCioFPreSVKkywoAv4AOBFcBnsBFB6eVcMhVgd7K5nKSKy7GICvZnnxaBwArgo9EWz/NxyZSYxHM5SRU5k8oIC2b4YbQL8EVb4fAwSRXM8wF8MZbXvEtSpR0DBvCBAXwAHxjABwbwAXxg2QxffJKqfiepIqMM0FEBfCp88UmqEkpSBfMrAF9U+OKUVCWUpMpqMLMM8CnwxSmpSiRJFeioAL4Y8MUpqUokSRXoqAC+uDyfEUNSlUiSKtBRAXxx9fn4ONVBUsWabft8rA10VABfX0e78UmqEkpSBTqqzIRvVFl3WVk3sgEDBnhcgSguSVViSapAR5WR8JXNQrb1+PHD1dUDqr2Fz8HcTlIF83wZA1/3rO9//9vfvv3SG//ppPbDKYHP7SRV2jFg6e35MHxzb5879/YBx+kXR6u1sM6/kjVAhEG2TuYWfGA5Cd+JMQPKZr2I4HvhUuT72o8bygSbNeseyQ9OynKQkQPAB9Yf+Cb+pKV7Vv33v//9S15A/w2YGBs+MrQM9v3Hg28I4DPZP/7df3ZPqkeu76FLLrlkfftWHT5eV4hVEeLwKbPFAB9YwvAdf2Zi98QXX0S9vvUPrf/28lkafDQpAL6b0a+F3USm0CZJgRReghBaKSKncqoHGGOmOI5svEaUi6vDE3uRFv5DuwdPQNb9Kyi0PICvG8H3f5Chbt+3v98+yxx26b2LrIoQ+SY3FhkJej4ukOoQNRAMJqdiwikrURI+O9Digi/mMWqlQJNIi9XzagjCzI1X8OHp5fb2K9rbX3KCj1UREnnJEuvz8cUyDT42wSyBTgl8DiItUUwO5qw9gq9nzJjq6uqJb23dOmtWuxN8dDVB5CULJzLa5RoVFT6+tEaAxump6BpGiC1ykEO1LFW4G7CQ1bLkmarI0fQ4soqyBWcyWEDl0Hy+aAGtUrNw44f4pGAgzE9kV7aKtOiFOqqWNuBjYbXOg9HuGDLGaK0eOGuU2RtI+LiwXTBD18n6AR+vYM5EBdybmmoJ0u6hzFLFCl76RCUumqlKy2aFYmSgrZMeyvt8QZbfCp9KTkI0yRNlvSytnpa4EBTU8gi+MTvw1N6o9vbqibOaPf3xDFGST/V8Er6pLD0VK6JGJ7PZoTxRkCh4qVUflOdRYRZ5otUFRdEjmmJIZBcyn2gn0qIX4vBBp899+Aa24Gov1WMOz9oxa1Yy4DP1+QR8aD9NT8XYEjSYs1QZFvjkeQp8ehfSHj55op1IS4cPPJ/78CXvx5MCKbs+H94fERWj6ZYFPlnwUlaTFmmtRIFMGi0bfOYBKy2LqcInT7SrJKiHXejzZTh8bGXYMtpl+1l6qlDYz7ZU+HiWKhp2RQ4rmdZKCaVswKGEXZbfygSffEM7kRa9EIcPRruZDZ+teeFR+j4PGVOkBV2+bISvHxmqHC1U11egY4q0QKGVlfC5bHgCcCOMDQC+VMAHlpHw3XLLMmT5+fkV+b8A+MCSCt+BcbccGPfso5999tn8fIAPLLmeb/78B/ZUzHzgs0cfqPAGPjU7lVyzpzebkb5+xLa/pmsDmMxJb5R76GunPFZi+Q6SWKWZ55uPrGLmzEfn13oDH00xhR/5ugKdwPPxWQ4lP1VU+Jwb+WunPFYCPpg6ST/4ahF8tbNnV4ovyyrFjEfHZA8fSzGFYIv4pVqJT24EfEbEfookAfic8lhJ4QJMGqcZfLW1sxF848aNq1S/SqnzVL7dPiM4iWenwjlaZgTFZSYz4Mg6Gz6ggGaZGiKVT6LGH5NFddJgHVYkhqJAINe3WCVSspCgcgBYmsC3bD72eRUV4/bsqXhWJU1fMEgcPp5+KtRC1+ypZ+UJqfDCPTmAZplSlE94k+ihqCyK8CPSY0j4uPTqmzNs81hF/BxS5QCwdIFv9uzZ4ypbWm48e+6pp849VX612BcRbaZwHg33KRVIuWOKZ7RL7rkkcZBDcy0bKjAY0AE0y5SyBEsODQWlMsWqWlHLZH3iM2wlUqxcoHYAWNrAN27cnop1JQ8hW7/pdg0+qs1kzuObLGoKhSbXoMQx1YJ7eMp9mVhaQse3nBN0AMkyZYKPZ4iMDV/DEr9h5/l4uUDtALB0gQ+xt6dyU8uwYZuH+U5S4ZvKtJnMeXziU0R1LKVeKBgbPp5+SoeP3IVOPRHPQfWzazTxCe4KYj2UkEXZh11BashnJ5EyWLlA7QCwNIEvH7E3vvLCszav33zSScNv1/p8Svo76jlM8MW+pWiSSDFlCE0Uy8hBdQBMW0XShPsN3fMtJIMLIYsSWakM5ToSvoIZqLtokUjxcoH6AWBpAh9ib3zlzzat37x5+LR/ul0Z7QYNoc0kziPkM5QKpMwxxRd2nU1xRaFgfz8MkEhlHHzjx+dXXjhs8+bN06b9z5Vinq+NJqNFTok7D+w5ZAVS7pj6CZ9UM7kQEUEilWnwjV+Wn1858P//Zwu22+N+4/hmXeJe2y2Y0QZOKefgq8BWeeGD4xeElywJt6QKPrBchO+po794tqJyx+2/rJ579tkD5wJ8YMmDz+MfD74hgC+F8Ik0BQ6eM2TKFcQbTPPBVpWUdTgBgimAzwSffXzmu8nccmz4bFRSMJEC8PUTPlq7JSZ8Niop6zVhChngs4EPr4tF6vKoIkrWuKKLeHiCT2aZ2iLh4ymoNJUUvdQQckM5SVMl00/B4lmGwUduIMr38AYimlqyY+2Ha4tkuouISHpfn0c1fSzdBXkS8FG9lMkNkkspmiqZfgpkAxkGn7yB6IF87zwfjZm8Th/POsXCacDP8/PIZFMbi5ScGaYYHKHiUi1NlZZpFCxT4Js/v3ZPxc3VOFXk4e6kwHedTPpHhsKINB0+GXajwaelqaL/wPNlHHzz51d897vkxUTP4FPDboNMd0ru38DKQfuwK+DTEkmxsKulqaKCG+jzZR58tRU333wFMg5ffLnc+zLPV46wC9Dif0El6xSbmqO3T26RyaYs8CkqqW+SSw0xp6kicRhGu5kXdmdXVlePvOn1z7vekvMYphuIYpr9jIprKxzxODXo8mUafMtqZ8+ejTzf5ZufD+7cqoDUx4xjHsMXT1orEExlIHzjxlXOrP7Vv/+q05ilgMRuIMITu/wWIqoZVubV6ISdyBvPb2z0Aj6wbIQP30BUifp8Y8ZcYYaP3SukZaNVb25kaeJF3nhrnwvgA/iiwUduIKquLizsKdThm9rJbtfQ8nCrmbXFhJ3IG+8D+AC+vt5AxEa7O/Q+Xyz4zPt5ySKAD+Drww1EM6nn26GOdoP8XiEadnEfMKKtKMgJO5kMJeCznWpxlFRFcOcxQGea6aiB9CYj8lHtSPZTfgWaq3SDbzy+gcjk+ZQbiNp4EniMwn1V+qTuAhl2cXZ3a66zWKoWVqhAUlt6OZ1HLpKPF4iBd3/lVzAXk37w5edXXnHF2LG7dm13vSBMLPjI0q5PxwchgkcuAR99VBr7Lb+CWej0go/eQDR58s07bt6xY4BX8DlJqgR8ERkREUkEHz99lMz0W34F629pBl8juYHou5d59OPFkFRFNtKdakBEPTMNvkDYL8Jp/+RXoDxIM/g8/vFiSqrCbQvxdPV6NmVN87qYPV+bO/Ir6PQBfNdpdVRJmoxA0FDHBKY+H82k0X/5FXi+3ITPQVJlUFWV9Ei0e0cHwfRxdJFr8ivo8+UcfNEkVWgH7qdJj0Sm/Hw4SpJ8kmGxvuyG/ApGu7kGX/INklRlCnzZWIEIklRlCHxQgQgsdZ7P8wpEYACfk+fzugIRGMAXDb44KhCpZrda67CCC/ABfFHhi6sCUeLwMYkMB1oB26R8kheKhOv+FdRPOQAfr0D0YHl5+YMqSc43EPURPlr3gAMtDzQrn8SF8PQxzIrkBHykAlFFxeAjR4LlKknyBqIQXR4tXS8LnfGpXyYWIfoS6+1DAj4uNpXyPcOqfJLw4UNgPjgn4MMB9+bKwXffPcIMH7uBCHEQWYKlJvJeIdYiPVWgrdMOFxN8rKoRMbU8n1Lkb0vV0gZMOKyE5QJ85AaiyorBF989YrAGn7yBqD7vd9fWnzFVqbjCWqhYROhLfDHCLgWaqeo15ZMs8hegSn3QAOQCfPQGokqr55M3EBVMXYo81N/X51ngo2IRoS8J2IZdoucTQAvPp8HHivzRC/FD4dvNevjIDUSbNiH4jpQPLldGu+IGIiOAg26JT4mgrCWiOizDcvuQKqmy9PnM8OEifwp84PlyAj58A9HkTUfuvvvO1YOtFYjwM1Ej3aPVg6QtXCxC9CXW24fM8BGgLXfz6EX+eNiFPl8mwDervXBXvysQVU72HxHsufnjSfgY0PpoV4xvF3IN1RYY7WYOfC0Ivpp+30C0cuVk/+rBZxsewOdoim+zzhJCly8XPB+tQPTd6caV7vu9GMtrUvlkhQ/UT5kCX02mi0njq6UFlo5hdxcomcHA8wF84PkAPjDwfJPYHY8hH59q4bOB/Uo6bivGYiMV0GJljufzvAIRXgIupUtzgbo8vGBSernRv6Tj9mIsajBHk0Gez/MKRBgVJsqid28rICWYdNxejMUMZqczx/PRCkTPPvDoo6dWeNTnCyzxG3pCSSPxpOOGbRoqoshiObBgXS5zPJ+4gejHN3sEHxUK0LCLCQmq8PU16bhhl4aKKrKozwNFQkZ5PnYD0ZHh4htlfTGeCoUZVY/2Hb5QCYKOKavwZRYEjcSTjhsOaajwNYigEDp9GeX5amdXzpw5e/aPj1wp+lZLaCKVtk4XPF+k7owZQXURgyyrJZ503B4+HMGJoBA8X+Z4PlqBqKJiz54j3ztyP4dvIckWel+VC/BhTyR1fKM7DSXCJpZ03JqGiiqy6FAE+nxp5Pl6U2mTmEQg5FeUVX6+kUjScT7apfDxNFQL2ckosMNoN7meryyd4fPAzM5N1SVAly/Jni/X4DOnoVLhAy0WeD5v4TMZKLLA86UMPjDwfAAfeD6ADww8H4NPu6XSukoSiSqtEoIsseaiFLOyuVfTrisYrUvoVCOQjF1AnZXpno+KqCwA8B2RNj7tbG9ckCXWXMSVhDorAficxFnaoTBrk/Gej4qonOAjq2LRpFVckCXWXLQCHnENgqPA51wjkLwDTNtkuOcjC2RkYQMRtPFDsgwbDn+PrXRQ7EgOF5GoaqNSxU8IspauLUKHLVXg4ytv5Dx87SJaQMuStk27NHn80FmcRU65ntYphJW6jO/zEREVXcol6V7k2q7BFyuItIrmfVElVNIZoQcSf4co8LEL8wRXPlZR0LCkbSMKLaHOUsoO2tYIpKfgF6BRyIbRbumCoK1QRfd8TBalSKhop48IskittVCwQ4VPXNggDayioDVtm6bO0soO2pZpI8IsEouh05cN83wBvxN8os/HE1WpEiras2OCrIKp15I7QbRuGb0wyzkp6gqa0rZp76u9u70yMEBECkHQpWa+56MiKgU+LeyS6EpiJk9U5dPgUwRZEZqRV8DHLszOY2GdjENMadt0dZZadtCuRiA9hYRd6PNluudjIqqQDKZUPcXkUWSeL0g0hORZk1BpgqyC3xZRhtjEIL0wO49qU/G51rRtmjqLPTqKs8gp36J1CmG0myMrHImKA/onKojq26DLlwaer5YULT1wz+LaqsVZBp9jjUDDAHVWWng+wVwWwgeW7p7v1sWmDRAWgCXV86FHvHFgQfg3h8jLySgW9/a+Gw6vQFReuyC8KEB2kAM+qvsC4APP51afbxGFD/P3UTne+fSK3o8YhotrG+ru/QiRh3bQAwA+8Hxue75bFx/ACwQr2E4CWjjctpi/QP/EARK+KKKp/uSq8kBRpZ8Ikqq06fOhR8zWO4ckkfT10amYSmWHWc8XRTTVj1xVXiiq9FaYakmzPl9tQ7kGH4qvbA6G/LuVHaCE3aiiqX7kqvJCUWUaQMMkcxp4PiXs4vEEDbu3EtaOvhb++p8WM/B62YgkvEKBL6poqh+5qtxVVBn8J1AraMLyWuo9Xz+VzNFEU/3IVeW2oor+BFoFTRAWpN7z9V9G7yia6keuKsNdRZWssCl1VdDpy3TPF1U01Y9cVXSw6pqiSocPEl5lh+eLKprqR64qtxVV9CfQKmhCny/TPV9U0VQ/clW5q6jidTS1Cpow2s14z2eZwXDTPFRUQZcvGzxf+sIXVVEFkirwfJ7CB5bNng8smy29Pd9wbCcN27TpQrAstB1vDUhn+O5H9vrjc88Gy0LbNLH6xKg0h+/+ryxfcypY9tmwl04UFp9Ib/heeeX1r4Bln71/UuGo4rFpDh/GDywLbfjLY8emP3xgWWnD8VecxvCdDJa95hV8H4XpTWr9h++VIbeOoA/UZt4zwvxLKK0nn/xcGNtmm9/1lZUr7t921vPm3fc/txk4yCb48J0YB67vyw3lzp4PkXU/fXA0vdV8LH89s/75lf9w1nObP1AOwRtvX7N4KJCQPfBxGXy/4fvg5KHYrZGHDxxMbyWvzK1445WVm1+5f1V4mrKLnnn/qhUfgCXJhpbIR4/gY/cJ4TvCj85YRGDkd4vT28SPvhYOL6K39P4DPYCU+zbfND4c/Yjc8w25tQWFU/zy5LfR2dM+mLkAP+KnLWgfetr4A/L7URTx6//32uYPVv0zvikE7X77mhGvDLnq4U9/cPIHQ55GbyvasEsEKpJlJSWvswev4MNwLWKe7aMVCLhF4m5xuZN7PnbA0+JGto/Kn+TwDR26F7FEHoY8veKDVRt/gLbefm0z3v/0tKGoA4iefrqybQT2ZavQbnbCUPp6yK//769/QF6jPb/+wdC9e3/6ygf0kL2yDbeAJcfeX1hSMrKkpARvezfaPbBgEbsjfO2h2qu+kDeDk50HFpSLe3rpASQAv7tIv2l8+NChIuxiTujTzHd+sHfoUPL43LSZLz6Pd88k55Hfj5xAX+9FYZa+1hHDu2Qb8olARbIMk7fwJ6/v9RS+3sAKekd4b6A8UN4rbgZnO3sDxDUS5MgBeOvoa4v0m8aH7x06EqGxd+TMdzoRJ+9L+PbuJY8Ivvrn3xf7kA09B7s10jp071AEGH2Ndv+aHcEO4W1DR1KfCJYUG/n6T0pK/ptsegTf6NUIpHJ6R3jvgf/69OFecbc424mR4/Ad+K8/If/3NCFUv2l8OP4Zn6t7fu9zK/Yyz/ddEnbf24vDLkYMP+Gw+/Rm/uvROE1ei9CKtjHEvJ1cS7Qh5wlQJM8+KDnnPS/h29DARhVf/xP2ZytIGKbhlO4U84Dv4p3vrsAYLiRjEHLckyp8eHhR9zx2UudQbs7BA4Zp56DI2jbivfdWhfGAA79CYRYZcWsjyWs0qBi5su7550jDyJUUT9w+8jm1bdUKICKJ9t57bCM9VjjeXeQw6ULgG/n+yL0jR+59/3209f5I9oT2vMf+gFDTOSPRr8R+J3wE2odbR7733kh09kjagBwc2cI7RiptxCWOBCiSbmkB34H6L5zgG+mevfeVlSvs9r//3IqRYKmwNIDv6Gtti53Slg5/z017/337/QhMsBRYugsL3gfLXgP4wAA+B/jOActeS3f4wLLZ0hs+sKw3gA8M4AMD+AA+MIAPDOADAwP4wAA+MDCADwzgAwMD+MAAPjCw9IRv1Y366zMHnR/7pOmX3Emez/Xvt7T4/WTn2Bf8/r9+kTkf+qoSbPGkFPnDneJ33a88AXx9tjktA8/XwIsTvs37KWmdlpb9veci6C5aP6K396LVmfSxx/Wbo19q80n7o8GX4SwmE75V1z8p/9qPoo//aHzw3XgEfw93XrLfBj70b/oLIzLuY48TvnOPnHsE4HPDpg88Z84LnzOX98tBJSU3njnoNhJ95qAgdBPaueOse1eirY+HPaZ9wP92ySEUWf8NfdB/8PvvxDT6D6FAO+IxCt9F6w9lKHwo/N5IfvvvDTp/uvU3x7/0RZtRd+Ki9f4j6HdlTzga48+B9jvuZC9kI8BnY6UvfP7YSuru0GePP/4HBg17+eOz7j1z0E29c76K9lzf+zHuFX487HP9r/sPI1AA+oJ8tGORp7vkYuQHZdgt3PxFxsHHfT75IG7qrTjrfJvfHP/S01/Af2AjcI+XPTEu+afRyz+UETa9YoCPWwXycTNv5J093udD/+Z89V7kBW7Cr+ZYu+HYt23+4g8jMGoX4RHG2Ev+Df2dH2EDjkO9mev5PkYe//w5LZ9jFm1+814cc8+9EzPIPgUeZy9iIy3ykrxQGgE+q/1wEB7inXUv6+wxB0g+97MEfAhRFIDN8E1/YfXme0mAvXr7C/vpZ3yufwT7sC/LwD4f8Xy/+Oq90weez7sidr+5H9uhQgpWoeCLfg4MPvqiEOCLZqQ/M33lTej/3o/PUj0fCbs4+p5PhyUfD3vZ1Km+yH+E/O3/9b8vWs/gQ26Bf9gX+TNztIuC7Bz821+Jw67Nb34Rnj7a/sIIS9ilnwP/bNiHAmE36liXTfWhYPMd9NmvKrmRj3jxgINuobazvrD0+Xp3vXAIP//lBf9fT9rP4+8h8ZeOOtsZNc/HPB8aY+DJp4/JgMPmN/8DGeniuOunAw4/G1PQz4GOPO5kL0QjwAfWtxlQ3Ovtt2Vc5xfgSwdbpY9yE7Q//PULgA+sL4Yn+M7qt+MjK4zP9wJ8YGAAHxjABwYG8IEBfGBgAB8YwAcG8AF8YAAfGMAH8IEBfGAAHxgYwAcG8IGBAXxgAB8YGMAHBvCBgQF8YOlqAB8YwAcG8AF8YAAfGMAHBgbwgQF8YGAAHxjABwYG8IEBfGBgAB8YwAcGBvCBAXxg/bS/eWL6e0z0xAC+zIcvCRed6MV7AHxZAN9YD8wMnxfvAfABfJkK32Xz5jXN234ZAADwJRu+7dvv+nLNmjVf3gD4AXxJhm/eDRi8YzcgAG8A+lII3y4PzAyfF+/RD/i237Xmy51dyIyuL4G+hOzdMLZyfWdt1WJ34XsyHP7NIc/gqzn6Wjhc94Vj+4aqxe7Dt/2GNV8e27mzayf613XXmhsApUTMhrS+w1cT1Za98/CuZdff8JeaPpkZPufLL1j0l5pl1ztefsOtq3c5NPUDvrl/3r179xvEmpq+XAMgpSl8T9bdu+uGXTU1nsC364evLUJPzmgjz7e6xm34LrtrTe8TL7/8GLGXX355zV3yg0OPB+5ZbP4gaxuUEMMa+vxBZx18t+IP4MCCcHgRf0K7cCRbZDl2CTUrfMVRbUNDOX5CIX4FIqW4GMG4bEH4Nw9HP8sMn9NxyK/SDXz94g1Vk9F3XFz8Q/wLFJO3QZ6vxuHchOGb9+c1haNeveOObZds6+npueOOx/+swWfzV6yBBvApn0Ntw6Je9NfKn6oWf7TC9uDwkiXhJb19hW8fImERfkY+6MkVxfveXYQ2ip8sdwW+fcvq7+Wb6LIbnl5R/ORvHv7ha+XF+/bRt8GPrsO35vGebcjWr1/fvO2RR7Y9vkb+IdPHXm1L3+YvtH05C9+Bdw6hwcci9oTjxoJy26Pt/F5M+LB7WrBoHxp2tK3esPbhDVe9jTwSdlNueb5RZINev2o1Zo66Q/o2yPMV73MZvu1ztz2z7ZG36y+sr3/okUee2bb8VM3z4Q0URbbQp98ckl6OxRZTe23Vwt98yzbY5CJ8vb0B+09iSW8i8O37eWAF4uGHU1ffEShfWX4XZmPfPnfgI5GcQoiuj0jbh/4x+MjbeOL5Tl3+5rZH1lF75plH3lwj+nXoTwDDh6NIgGz1flTO2xbJ2KK11z5d3vvkihzt85HP5J1D/IlEg0B5/AOOfVHto9Uo8JY/Wff2sntWj1r2X58+PGrD0+X7YpkZPscDnwwvGrVv2fX0+szzofdDLfRtMI8Olrjne/zUN18qK3scWVnZM91vLn9c93y3Lj5Q/0XvBhxCsPMVbfzP29ROPWF5boZd7PzbFvMn3OcL02ARJ3yjotkNG55Gn+8o1PH7+p9WF/7wtRVoHw6IK6KeNcoMn/OR+GJ1r9PrI9JGkX/oPRfRt0EsOp2Z+Gj353Nfum3Mm8RuG/NS9ak/V/p8JOweqL+XjHvfOaQN7GTHRm2nX0Ig58KuKysc0TEatW8f2ygUD7EtfvjIRfty6f7D1zt27ptlL92GrPq2MbfdtnzuWMXz3aqG1YZy9Y+cxxZTO+sQBsqBJdfhS8j6Bl9i1o/ltVFzl9+x7cRL2J55/NSfjzWH3V4UPLbcSgYUStjlscXUzl73IdiA5TB8Y3t75i7f9cQTL7/8xGOPz+3pHQsUpAq+Qg/MDJ8X79EP+MaO/fkk5PyeeHX5qZN+jl4CBQBfsuDDuO3qWT4XAdizC9gD+JIKHwYOub9d6D9gD+BLMnwSOWAvpfDl5q2TQhANBIC5YnD3GhjABwbwAXxgybAxYwA+MIAPDMIuwAeWJLcHng8M4AODsAvwgSXN7YHnAwP4wCDsAnxg6QMfGJi35gwfGFhSDOADA/jAchC+YoAPLFXw7QL4wFIFX6+ErwbgA0sqfE3dPTUUvtaxw8DAkmmNZT1jWwl8jb2Fzd3tY8DAkmTt3c2FvY0Yvp2N84p7msu6B4CBJcW6y5p7iuc17kTwdR1r3V48qqdnChhYUqynp7B4e+uxLgQfcn2t83rH1hSDgSXFasb2zmtt3Glg+Lp2HmtsbG0CA0uStTY2HtvZZfyvAAMApxtkZsLd6OQAAAAASUVORK5CYII=" - } - }, - "cell_type": "markdown", - "id": "reduced-needle", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Microsoft Excel and the Three Decades of Backward Compatibility\n", - "\n", - "- Excel gives you lots of text formats you can export in, and _every one of them is wrong in 2021_\n", - "- Even the one that says \"Unicode text\" is not what you want\n", - "- The format of \"CSV\" depends on your OS, language, and region, like pre-Unicode days, and it will almost certainly mojibake your text\n", - "\n", - "I know you probably have to use Excel sometimes, but my recommendation is to make CSVs with LibreOffice or Google Spreadsheets.\n", - "\n", - "![excel-export.png](attachment:excel-export.png)" - ] - }, - { - "cell_type": "markdown", - "id": "legendary-catholic", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## How to avoid mojibake\n", - "\n", - "The bad news is that English speakers can go for a long time without noticing mojibake.\n", - "\n", - "The good news is that emoji are *everywhere*, people expect them to work, and they quickly reveal if you're doing Unicode right.\n", - "\n", - "- Use lots of emoji! 👍💚🌠\n", - "- Use non-ASCII text, such as emoji, in your code, your UIs, your test cases\n", - "- This is similar to how Web frameworks used to pass around parameters like `&snowman=☃` and `&utf8=✔` to make sure browsers used UTF-8 and detect if they came out wrong\n", - "\n", - "> **Side note**: Services such as Slack and Discord don't use Unicode for their emoji. They use ASCII strings like `:green-heart:` and turn them into images. These won't help you test anything. I recommend getting emoji for your test cases by copy-pasting them from emojipedia.org.\n" - ] - }, - { - "cell_type": "markdown", - "id": "sticky-reminder", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Thanks!\n", - "\n", - "- ftfy documentation: https://ftfy.readthedocs.org\n", - "- My e-mail address: rspeer@arborelia.net\n", - "- I'm @r_speer on Twitter\n", - "- I'll link from my Twitter to the notebook version of this talk\n", - "- BTW, I'm on the job market right now\n", - "\n", - "Fonts I used for code in the presentation:\n", - "\n", - "- Input: Fantasque Sans Mono\n", - "- Output: Fira Code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "included-california", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "celltoolbar": "Slideshow", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notes/mysteries.txt b/notes/mysteries.txt deleted file mode 100644 index 23e4a9ed..00000000 --- a/notes/mysteries.txt +++ /dev/null @@ -1,10 +0,0 @@ -on https://www.nipette.com/article-6358031.html, a comment is signed 'MÃ\x83©Ã\x82¬Ã\x82¡nie'. -This happens to be triple-UTF-8 for 'M鬡nie', but that's probably not the name they meant. - -What exactly did https://www.horoskopy-horoskop.cz/clanek/431-numerologicky-vyznam-jmena-jaromir -mean when they said 'TadeÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂáÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂá' ? - -https://mtlurb.com/tags/arbres/ -'montrã©al' probably isn't in cp850, but what is it? - - diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 130dec28..00000000 --- a/pyproject.toml +++ /dev/null @@ -1,53 +0,0 @@ -[project] -name = "ftfy" -version = "6.3.1" -description = "Fixes mojibake and other problems with Unicode, after the fact" -authors = [{ name = "Robyn Speer", email = "rspeer@arborelia.net" }] -license = { text = "Apache-2.0" } -readme = "README.md" -dependencies = ["wcwidth"] -requires-python = ">=3.9" - -[project.scripts] -ftfy = "ftfy.cli:main" - -[project.urls] -Homepage = "https://ftfy.readthedocs.io/en/latest/" -Documentation = "https://ftfy.readthedocs.io/en/latest/" -Repository = "https://github.com/rspeer/python-ftfy" -Issues = "https://github.com/rspeer/python-ftfy/issues/" -Changelog = "https://github.com/rspeer/python-ftfy/blob/main/CHANGELOG.md" -Blog = "https://posts.arborelia.net" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.sdist] -exclude = ["^.github/", "scripts/", ".readthedocs.yaml", "notes/", "notebook/"] - -[tool.uv] -dev-dependencies = [ - "Sphinx >=7, <8", - "furo >= 2024.7.18", - "pytest >= 8.3.2, < 9", - "ruff", -] - -[tool.ruff] -exclude = ["badness.py", "notebook"] -line-length = 100 -target-version = "py39" - -[tool.ruff.lint] -select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH", "FURB"] -ignore = [ - "ANN101", - "ANN401", - "RUF001", # complains about Unicode characters that belong in my docstrings - "RUF002", # complains about Unicode characters that belong in my docstrings - "PIE808", # explicitly starting ranges at 0 sometimes helps with readability -] - -[tool.ruff.lint.per-file-ignores] -"tests/*" = ["ANN"] diff --git a/scripts/char_data_table.py b/scripts/char_data_table.py deleted file mode 100644 index d063d1ac..00000000 --- a/scripts/char_data_table.py +++ /dev/null @@ -1,78 +0,0 @@ -""" -Used to regenerate character tables in ftfy/chardata.py with explanatory comments. -""" - -import unicodedata -from dataclasses import dataclass - -from ftfy.chardata import UTF8_CLUES - - -@dataclass -class CharData: - name: str - codept: int - encodings: list[tuple[str, int]] - - def sort_key(self) -> tuple[int, str, int]: - if self.name.startswith("LATIN "): - return (0, self.name, self.codept) - return (1, "", self.codept) - - -SAFE_ENCODINGS = [ - "latin-1", - "windows-1252", - "windows-1251", - "windows-1250", - "windows-1253", - "windows-1254", - "windows-1257", -] - - -def show_char_table(chars: str, byte_min: int = 0, byte_max: int = 0xFF) -> None: - char_data: list[CharData] = [] - for char in chars: - name = unicodedata.name(char, "") - codept = ord(char) - encodings: list[tuple[str, int]] = [] - for encoding in SAFE_ENCODINGS: - try: - encoded: bytes = char.encode(encoding) - byte: int = encoded[0] - encodings.append((encoding, byte)) - except UnicodeEncodeError: - pass - if encodings: - char_data.append(CharData(name=name, codept=codept, encodings=encodings)) - else: - print(f"No relevant encoding for {codept=}, {name=}") - char_data.sort(key=CharData.sort_key) - for cd in char_data: - encoding_info: list[str] = [] - for encoding, byte in cd.encodings: - if byte_min <= byte <= byte_max: - info_str = f"{encoding}:{byte:X}" - encoding_info.append(info_str) - encoding_explanation = encoding_info[0] if encoding_info else "???" - print(f' "\\N{{{cd.name}}}" # {encoding_explanation}') - - -def run() -> None: - print("# utf8_first_of_2") - show_char_table(UTF8_CLUES["utf8_first_of_2"], 0xC2, 0xDF) - print("# utf8_first_of_3") - show_char_table(UTF8_CLUES["utf8_first_of_3"], 0xE0, 0xEF) - print("# utf8_first_of_4") - show_char_table(UTF8_CLUES["utf8_first_of_4"], 0xF0, 0xF3) - print("# utf8_continuation") - print(r' "\x80-\xbf"') - show_char_table(UTF8_CLUES["utf8_continuation"][3:], 0x80, 0xBF) - print("# utf8_continuation_strict") - print(r' "\x80-\xbf"') - show_char_table(UTF8_CLUES["utf8_continuation_strict"][3:], 0x80, 0xBF) - - -if __name__ == "__main__": - run() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..1bd30526 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[aliases] +test=pytest + +[flake8] +max-line-length=100 diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..af9af78b --- /dev/null +++ b/setup.py @@ -0,0 +1,60 @@ +import sys + +# Before we get to the rest of setup, with dependencies on setuptools and the +# Python 3 standard library, let's make sure we're not on Python 2 and provide +# a helpful message if we are. + +PY2_MESSAGE = "Python 2 is no longer supported. Please upgrade." + + +if sys.version_info[0] < 3: + print(PY2_MESSAGE) + readable_version = sys.version.split(' ')[0] + print("The version of Python you're running is: %s" % readable_version) + print("Python is running from: %r" % sys.executable) + sys.exit(1) + + +from setuptools import setup + +DESCRIPTION = open('README.md', encoding='utf-8').read() + +setup( + name="ftfy", + version='6.0.1', + maintainer='Robyn Speer', + maintainer_email='rspeer@luminoso.com', + license="MIT", + url='http://github.com/LuminosoInsight/python-ftfy', + platforms=["any"], + description="Fixes some problems with Unicode text after the fact", + long_description=DESCRIPTION, + long_description_content_type='text/markdown', + packages=['ftfy', 'ftfy.bad_codecs'], + install_requires=['wcwidth'], + tests_require=['pytest'], + python_requires='>=3.6', + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Filters", + "Development Status :: 5 - Production/Stable" + ], + entry_points={ + 'console_scripts': [ + 'ftfy = ftfy.cli:main' + ] + }, + extras_require={ + "docs": ["furo", "sphinx"] + }, + project_urls={ + 'Documentation': 'http://ftfy.readthedocs.io', + } +) diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/face.txt b/tests/face.txt deleted file mode 100644 index f77116b4..00000000 --- a/tests/face.txt +++ /dev/null @@ -1 +0,0 @@ -â”’(⌣˛⌣)┎ diff --git a/tests/halibote.txt b/tests/halibote.txt new file mode 100644 index 00000000..0070a07a --- /dev/null +++ b/tests/halibote.txt @@ -0,0 +1 @@ +【更新】《哈利波特》石堧卜才新婚娶初戀今痠逝 diff --git a/tests/test-cases/README.md b/tests/test-cases/README.md deleted file mode 100644 index 673bd5f2..00000000 --- a/tests/test-cases/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# ftfy test cases - -This directory contains JSON files with test cases for ftfy. Many of them are real mojibake found in the wild, such as by listening to the Twitter firehose (when that existed), searching through the OSCAR web crawl, or in issue reports from users. - -Cases labeled "synthetic" were not found in the wild, but were instead constructed to test a particular edge case. - -Cases labeled "negative" are not mojibake but look lke they could be. We're testing that ftfy does not alter the text (except for its usual processing such as un-curling quotes). - -`known-failures.json` contains cases that we would do better at with an improved heuristic. Most of these are false negatives, where ftfy does not figure out how to fix the text. ftfy aims to have no false positives, but there is one synthetic false positive in `known-failures.json`. - -## Structure of a test case - -A test case contains the following fields: - -- `label`: A description of the test case, shown when pytest runs in verbose mode. -- `comment`: Further details on the test case because JSON doesn't have comments. -- `original`: The text to run through ftfy. -- `fixed-encoding` (optional): the expected result of `ftfy.fix_encoding(original)`. If unspecified, uses the value from `fixed`. -- `fixed`: the expected result of `ftfy.fix_text(original)`. -- `expect`: "pass" for test cases that should pass, or "fail" for known failures. \ No newline at end of file diff --git a/tests/test-cases/in-the-wild.json b/tests/test-cases/in-the-wild.json deleted file mode 100644 index b40c838c..00000000 --- a/tests/test-cases/in-the-wild.json +++ /dev/null @@ -1,451 +0,0 @@ -[ - { - "label": "Low-codepoint emoji", - "comment": "From the ancient era before widespread emoji support on Twitter", - "original": "He's Justinâ\u009d¤", - "fixed": "He's Justin❤", - "expect": "pass" - }, - { - "label": "UTF-8 / MacRoman mix-up about smurfs", - "original": "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.", - "fixed": "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.", - "expect": "pass" - }, - { - "label": "Checkmark that almost looks okay as mojibake", - "original": "✔ No problems", - "fixed": "✔ No problems", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1251 Russian mixup about futbol", - "original": "РґРѕСЂРѕРіРµ Р\u0098Р·-РїРѕРґ #футбол", - "fixed": "дороге Из-под #футбол", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in German", - "original": "\u0084Handwerk bringt dich überall hin\u0093: Von der YOU bis nach Monaco", - "fixed-encoding": "„Handwerk bringt dich überall hin“: Von der YOU bis nach Monaco", - "fixed": "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup of the replacement character", - "original": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", - "fixed": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", - "expect": "pass" - }, - { - "label": "CESU-8 / Windows-1252 emoji", - "original": "Hi guys í ½í¸\u008d", - "fixed": "Hi guys 😍", - "expect": "pass" - }, - { - "label": "CESU-8 / Latin-1 emoji", - "original": "hihi RT username: â\u0098ºí ½í¸\u0098", - "fixed": "hihi RT username: ☺😘", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in Turkish", - "original": "Beta Haber: Hırsızı Büyü Korkuttu", - "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in İstanbul (issue #192)", - "original": "İstanbul", - "fixed": "İstanbul", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in German (issue #188)", - "original": "RUF MICH ZURÜCK", - "fixed": "RUF MICH ZURÜCK", - "expect": "pass" - }, - { - "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", - "original": "RÄ«ga", - "fixed": "Rīga", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1251 mixed up twice in Russian", - "original": "приятности. РІСњВ¤", - "fixed": "приятности. ❤", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up twice in Malay", - "original": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â\u009d.", - "fixed-encoding": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romance”.", - "fixed": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up twice in naming Iggy Pop", - "original": "Iggy Pop (né Jim Osterberg)", - "fixed": "Iggy Pop (né Jim Osterberg)", - "expect": "pass" - }, - { - "label": "Left quote is UTF-8, right quote is Latin-1, both encoded in Windows-1252", - "original": "Direzione Pd, ok â\u0080\u009csenza modifiche\u0094 all'Italicum.", - "fixed-encoding": "Direzione Pd, ok “senza modifiche” all'Italicum.", - "fixed": "Direzione Pd, ok \"senza modifiche\" all'Italicum.", - "expect": "pass" - }, - { - "label": "UTF-8 / sloppy Windows-1252 mixed up twice in a triumphant emoticon", - "original": "selamat berpuasa sob (Ã\u00a0¸‡'̀⌣'ÃŒÂ\u0081)Ã\u00a0¸‡", - "fixed": "selamat berpuasa sob (ง'̀⌣'́)ง", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixed up three times", - "original": "The Mona Lisa doesn’t have eyebrows.", - "fixed-encoding": "The Mona Lisa doesn’t have eyebrows.", - "fixed": "The Mona Lisa doesn't have eyebrows.", - "expect": "pass" - }, - { - "label": "UTF-8 / Codepage 437 mixup in Russian", - "original": "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡", - "fixed": "#правильноепитание", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in French", - "original": "Hôtel de Police", - "fixed": "Hôtel de Police", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1250 mixup in French", - "original": "Liège Avenue de l'HĂ´pital", - "fixed": "Liège Avenue de l'Hôpital", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Vietnamese", - "original": "Tại sao giá hạt sầu riêng lại lên giá?", - "fixed": "Tại sao giá hạt sầu riêng lại lên giá?", - "expect": "pass" - }, - { - "label": "Science! Mid-word Greek letter gets fixed correctly", - "original": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", - "fixed": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", - "expect": "pass" - }, - { - "label": "For goodness' sake. We can come close to fixing this, but fail in the last step", - "original": "ItÃ?¢â?¬â?¢s classic. ItÃ?¢â?¬â?¢s epic. ItÃ?¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?¢â?¬â?¢ sake!", - "fixed": "It�¢��s classic. It�¢��s epic. It�¢��s ELIZABETH BENNET for goodness�¢�� sake!", - "expect": "pass" - }, - { - "label": "lossy UTF-8 / Windows-1250 mixup in Spanish", - "original": "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂ­a", - "fixed": "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía", - "expect": "pass" - }, - { - "label": "UTF-8 / sloppy Windows-1250 mixup in English", - "original": "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", - "expect": "pass" - }, - { - "label": "The same text as above, but as a UTF-8 / ISO-8859-2 mixup", - "original": "It was namedÂ\u00a0â\u0080\u009escars´ stonesâ\u0080\u009c after the rock-climbers who got hurt while climbing on it.", - "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", - "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", - "expect": "pass" - }, - { - "label": "UTF-8 / ISO-8859-2 mixup in Czech", - "comment": "This says 'I've had enough of the third millennium', which is great because it involves software decisions made in the second", - "original": "MĂĄm dost tĹ\u0099etĂ\u00adho tisĂ\u00adciletĂ\u00ad", - "fixed": "Mám dost třetího tisíciletí", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", - "comment": "A difficult test case that can depend on the order that steps are applied", - "original": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - "fixed-encoding": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - "fixed": "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", - "expect": "pass" - }, - { - "label": "UTF-8 / sloppy Windows-1250 mixup in Romanian", - "original": "vedere Ă®nceĹŁoĹźatÄ\u0083", - "fixed": "vedere înceţoşată", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1250 mixup in Slovak", - "original": "NapĂ\u00adšte nám !", - "fixed": "Napíšte nám !", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Spanish", - "original": "DOS AÑOS", - "fixed": "DOS AÑOS", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 followed by UTF-8 / Windows-1251", - "original": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", - "fixed": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", - "expect": "pass" - }, - { - "label": "fancy Unicode crossing-out, but mojibaked", - "original": "hotel $49 $̶6̶3̶ updated 2018", - "fixed": "hotel $49 $̶6̶3̶ updated 2018", - "expect": "pass" - }, - { - "label": "A face with UTF-8 / sloppy Windows-1252 mixed up twice", - "original": "ââ€\u009d’(⌣˛⌣)ââ€\u009dŽ", - "fixed": "┒(⌣˛⌣)┎", - "expect": "pass" - }, - { - "label": "We can mostly decode the face above when we lose the character U+009D", - "original": "ââ€�’(⌣˛⌣)ââ€�Ž", - "fixed": "�(⌣˛⌣)�", - "expect": "pass" - }, - { - "label": "Lossy decoding can have plain ASCII question marks, as well", - "original": "The ICR has been upgraded to “bb+â€? from “bbâ€?", - "fixed-encoding": "The ICR has been upgraded to “bb+� from “bb�", - "fixed": "The ICR has been upgraded to \"bb+� from \"bb�", - "expect": "pass" - }, - { - "label": "CESU-8 / Latin-1 mixup over several emoji", - "comment": "You tried", - "original": "I just figured out how to tweet emojis! â\u009a½í\u00a0½í¸\u0080í\u00a0½í¸\u0081í\u00a0½í¸\u0082í\u00a0½í¸\u0086í\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008e", - "fixed": "I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎", - "expect": "pass" - }, - { - "label": "An absolutely hopeless garble", - "comment": "If we try too hard to decode this, we'll recursively apply `decode_inconsistent_utf8` until the characters turn into random Han and katakana characters.", - "original": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", - "fixed-encoding": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", - "fixed": "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â", - "expect": "pass" - }, - { - "label": "Inconsistent UTF-8 / Latin-1 mojibake", - "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085", - "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", - "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", - "expect": "pass" - }, - { - "label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set", - "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…", - "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", - "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", - "expect": "pass" - }, - { - "label": "Inconsistent mojibake in Portuguese", - "original": "Campeonatos > III Divisão - Série F > Jornadas Classificação", - "fixed": "Campeonatos > III Divisão - Série F > Jornadas Classificação", - "expect": "pass" - }, - { - "label": "Handle Afrikaans 'n character", - "original": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", - "fixed-encoding": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", - "fixed": "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.", - "expect": "pass" - }, - { - "label": "Handle Croatian single-codepoint digraphs", - "original": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "fixed-encoding": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "fixed": "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", - "expect": "pass" - }, - { - "label": "A with an acute accent, in isolation", - "original": "Nicolás", - "fixed": "Nicolás", - "expect": "pass" - }, - { - "label": "sharp S, in isolation, via MacRoman encoding", - "comment": "regression reported in issue #186", - "original": "wei√ü", - "fixed": "weiß", - "expect": "pass" - }, - { - "label": "French example containing non-breaking spaces", - "original": "ART TRIP Ã\u00a0 l'office de tourisme", - "fixed": "ART TRIP à l'office de tourisme", - "expect": "pass" - }, - { - "label": "English example in UTF-8 / Windows-1251 with a ligature", - "original": "This is signiп¬Ѓcantly lower than the respective share", - "fixed-encoding": "This is significantly lower than the respective share", - "fixed": "This is significantly lower than the respective share", - "expect": "pass" - }, - { - "label": "'à' remains its own word, even if spaces after it get coalesced into one", - "original": "à perturber la réflexion des théologiens jusqu'à nos jours", - "fixed": "à perturber la réflexion des théologiens jusqu'à nos jours", - "expect": "pass" - }, - { - "label": "Fix 'à' in inconsistent mojibake", - "original": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", - "fixed-encoding": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", - "fixed": "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation", - "expect": "pass" - }, - { - "label": "The Portuguese word 'às' does not become 'à s' due to the French fix", - "original": "com especial atenção à s crianças", - "fixed": "com especial atenção às crianças", - "expect": "pass" - }, - { - "label": "This is why we require a space after the 's' in 'às'", - "original": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", - "fixed": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", - "expect": "pass" - }, - { - "label": "We can fix 'à' in windows-1251 sometimes as well", - "original": "La rГ©gion de Dnepropetrovsk se trouve Г l’ouest de l’Ukraine", - "fixed-encoding": "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine", - "fixed": "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine", - "expect": "pass" - }, - { - "label": "'à quele' is the Portuguese word 'àquele', not 'à quele'", - "original": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante à quele observado nas lesões por imunocomplexo em excesso de anticorpos", - "fixed": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos", - "expect": "pass" - }, - { - "label": "A complex, lossy pile-up of mojibake in Portuguese", - "original": "â € ðŸ“� Regulamento: â € âš ï¸� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. âš ï¸� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até à s 19h do mesmo dia em uma nova publicação em nosso instagram. â € Boa sorte!!! 😀ðŸ�°", - "fixed": "⠀ �\u00a0Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!!\u00a0😀�", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1252 mixup in Gaelic involving non-breaking spaces", - "original": "CÃ\u00a0nan nan GÃ\u00a0idheal", - "fixed": "Cànan nan Gàidheal", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1251 mixup in tweet spam", - "original": "Blog Traffic Tip 2 – Broadcast Email Your Blog", - "fixed": "Blog Traffic Tip 2 – Broadcast Email Your Blog", - "expect": "pass" - }, - { - "label": "UTF-8 / Windows-1251 mixup", - "original": "S&P Confirms Ukrsotsbank’s “B-“ Rating", - "fixed-encoding": "S&P Confirms Ukrsotsbank’s “B-“ Rating", - "fixed": "S&P Confirms Ukrsotsbank's \"B-\" Rating", - "expect": "pass" - }, - { - "label": "Dutch example with ë", - "comment": "from issue reported by MicroJackson", - "original": "ongeëvenaard", - "fixed-encoding": "ongeëvenaard", - "fixed": "ongeëvenaard", - "expect": "pass" - }, - { - "label": "HTML entity on top of UTF-8 / Latin-1", - "original": "10μs", - "fixed-encoding": "10μs", - "fixed": "10μs", - "expect": "pass" - }, - { - "label": "Three layers of UTF-8 / MacRoman mixup in French", - "comment": "You're welcome", - "original": "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8", - "fixed": "Merci de télécharger le plug-in Flash Player 8", - "expect": "pass" - }, - { - "label": "UTF-8 / MacRoman mixup in French", - "original": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ķ", - "fixed": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…", - "expect": "pass" - }, - { - "label": "Italian UTF-8 / MacRoman example with ò", - "original": "Le Vigne di Zam√≤", - "fixed": "Le Vigne di Zamò", - "expect": "pass" - }, - { - "label": "Punctuation pile-up should actually be musical notes", - "original": "Engkau masih yg terindah, indah di dalam hatiku♫~", - "fixed": "Engkau masih yg terindah, indah di dalam hatiku♫~", - "expect": "pass" - }, - { - "label": "Latvian UTF-8 / Windows-1257 mojibake", - "original": "Å veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", - "fixed": "Šveices baņķieri gaida konkrētus investīciju projektus", - "expect": "pass" - }, - { - "label": "Latvian UTF-8 / MacRoman mojibake", - "original": "SaeimƒÅ ievƒìlƒìtƒÅs partijas \"Progresƒ´vie\" lƒ´dzvadƒ´tƒÅja Anto≈Üina ≈Öena≈°eva atbild uz ≈æurnƒÅlistu jautƒÅjumiem pƒìc partijas tik≈°anƒÅs ar Valsts prezidentu Rƒ´gas pilƒ´,", - "fixed": "Saeimā ievēlētās partijas \"Progresīvie\" līdzvadītāja Antoņina Ņenaševa atbild uz žurnālistu jautājumiem pēc partijas tikšanās ar Valsts prezidentu Rīgas pilī,", - "expect": "pass" - }, - { - "label": "Lithuanian UTF-8 / Windows-1257 mojibake", - "original": "Å iaip ÄÆdomu, kaip ÄÆsivaizduoji. Visų pirma tam reikia laiko.", - "fixed": "Šiaip įdomu, kaip įsivaizduoji. Visų pirma tam reikia laiko.", - "expect": "pass" - }, - { - "label": "Lithuanian UTF-8 / Windows-1250 mojibake", - "original": "Lietuva pagrÄŻstai gali paklausti: Ĺ˝inoma, kad ne.", - "fixed": "Lietuva pagrįstai gali paklausti: Žinoma, kad ne.", - "expect": "pass" - }, - { - "label": "Hebrew UTF-8 / Windows-1252 mojibake", - "comment": "reported by SuperIRabbit as issue #158", - "original": "בהודעה", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Wide comma in UTF-8 / Windows-1252", - "original": "Ningbo,China", - "fixed-encoding": "Ningbo,China", - "fixed": "Ningbo,China", - "expect": "pass" - } -] \ No newline at end of file diff --git a/tests/test-cases/known-failures.json b/tests/test-cases/known-failures.json deleted file mode 100644 index 2663d9f7..00000000 --- a/tests/test-cases/known-failures.json +++ /dev/null @@ -1,70 +0,0 @@ -[ - { - "label": "Misleading mix-up in Spanish", - "comment": "The original text has mojibake, but the sequence 'á \u0093' can decode as U+1813 MONGOLIAN DIGIT THREE, when the whole string should really just decode as a Latin-1/Windows-1252 mixup", - "original": "tiene demora y está \u0093próximo a resolverse\u0094", - "fixed": "tiene demora y está \"próximo a resolverse\"", - "expect": "fail" - }, - { - "label": "Two levels of inconsistent mojibake", - "comment": "The en-dash was mojibaked in UTF-8 / Windows-1252 as three characters, two of which were mojibaked again as Windows-1252 / Latin-1, and the third of which was mojibaked as UTF-8 / Latin-1. Unfortunately, if we fix this, we leave ourselves room to greedily 'decode' random Han characters in complex Latin-alphabet mojibake", - "original": "Arsenal v Wolfsburg: pre-season friendly â\u0080â\u0080\u009c live!", - "fixed": "Arsenal v Wolfsburg: pre-season friendly – live!", - "expect": "fail" - }, - { - "label": "A-with-grave in Vietnamese", - "comment": "Currently adds extra spaces that shouldn't be there", - "original": "Xem clip hĂ i, phim hĂ i má»›i hay nhất", - "fixed": "Xem clip hài, phim hài mới hay nhất", - "expect": "fail" - }, - { - "label": "Latin-1 / MacRoman mixup in Spanish", - "comment": "Requires something like encoding detection", - "original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.", - "fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.", - "expect": "fail" - }, - { - "label": "subtle UTF-8 / codepage 437 mixup in Spanish", - "original": "┬┐que diferencia hay?", - "fixed": "¿que diferencia hay?", - "expect": "fail" - }, - { - "label": "Latin-1 / MacRoman mixup in Spanish, 2 characters", - "comment": "Requires something like encoding detection", - "original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente", - "fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente", - "expect": "fail" - }, - { - "label": "An example with 'à' in windows-1251 where we need our heuristic to be bolder", - "original": "faites attention Г bien vous renseigner avant sur le mГ©dicament", - "fixed": "faites attention à bien vous renseigner avant sur le médicament", - "expect": "fail" - }, - { - "label": "Italian UTF-8 / MacRoman mojibake that looks like math", - "comment": "False negative: 'pi√π' is a bit too reasonable to fix", - "original": "Sarai ricontattato dal nostro Esperto al pi√π presto.", - "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", - "expect": "fail" - }, - { - "label": "Synthetic: Incomplete UTF-8 / Windows-1252 mixup in Arabic", - "comment": "I find text like this in OSCAR a fair amount, but couldn't isolate a good example that tested digits. The intended text means 'more than 100 countries'.", - "original": "أكثر من Ù Ù Ù¡ بلد", - "fixed": "أكثر من ٠٠١ بلد", - "expect": "fail" - }, - { - "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", - "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. Searching for similar real text yields a lot of examples that actually come out fine.", - "original": "MISUTÂ\u00a0AJIKKO", - "fixed": "MISUTÂ\u00a0AJIKKO", - "expect": "fail" - } -] \ No newline at end of file diff --git a/tests/test-cases/language-names.json b/tests/test-cases/language-names.json deleted file mode 100644 index cdb82418..00000000 --- a/tests/test-cases/language-names.json +++ /dev/null @@ -1,127 +0,0 @@ -[ - { - "label": "Messy language names: Czech", - "comment": "This and several following examples came from the same language selector", - "original": "ÄŒeÅ¡tina", - "fixed": "Čeština", - "expect": "pass" - }, - { - "label": "Messy language names: Gaelic", - "comment": "note that if U+A0 is replaced by a space, it comes out slightly incorrectly as 'Gà idhlig'", - "original": "GÃ\u00a0idhlig", - "fixed": "Gàidhlig", - "expect": "pass" - }, - { - "label": "Messy language names: Lithuanian", - "original": "Lietuvių", - "fixed": "Lietuvių", - "expect": "pass" - }, - { - "label": "Messy language names: Slovak", - "original": "SlovenÄ�ina", - "fixed": "Sloven�ina", - "expect": "pass" - }, - { - "label": "Messy language names: Vietnamese", - "original": "Tiếng Việt", - "fixed": "Tiếng Việt", - "expect": "pass" - }, - { - "label": "Messy language names: Greek", - "original": "Ελληνικά", - "fixed": "Ελληνικά", - "expect": "pass" - }, - { - "label": "Messy language names: Bulgarian", - "original": "българÑ�ки език", - "fixed": "българ�ки език", - "expect": "pass" - }, - { - "label": "Messy language names: Russian", - "original": "РуÑ�Ñ�кий", - "fixed": "Ру��кий", - "expect": "pass" - }, - { - "label": "Messy language names: Serbian [Cyrillic]", - "original": "CрпÑ�ки [ћирилицом]", - "fixed": "Cрп�ки [ћирилицом]", - "expect": "pass" - }, - { - "label": "Messy language names: Hebrew", - "original": "עברית", - "fixed": "עברית", - "expect": "pass" - }, - { - "label": "Messy language names: Russian", - "original": "РуÑ�Ñ�кий", - "fixed": "Ру��кий", - "expect": "pass" - }, - { - "label": "Messy language names: Hindi", - "comment": "My terminal has difficulty rendering the mostly-fixed text", - "original": "हिनà¥�दी", - "fixed": "\u0939\u093f\u0928\ufffd\u0926\u0940", - "expect": "pass" - }, - { - "label": "Messy language names: Tamil", - "comment": "My terminal has difficulty rendering the mostly-fixed text", - "original": "தமிழà¯�", - "fixed": "\u0ba4\u0bae\u0bbf\u0bb4\ufffd", - "expect": "pass" - }, - { - "label": "Messy language names: Thai", - "original": "ภาษาไทย", - "fixed": "ภาษาไทย", - "expect": "pass" - }, - { - "label": "Messy language names: Simplified Chinese", - "original": "简体ä¸\u00adæ–‡", - "fixed": "简体中文", - "expect": "pass" - }, - { - "label": "Messy language names: Traditional Chinese", - "original": "æ\u00ad£é«”ä¸\u00adæ–‡", - "fixed": "正體中文", - "expect": "pass" - }, - { - "label": "Messy language names: Japanese", - "original": "日本語", - "fixed": "日本語", - "expect": "pass" - }, - { - "label": "Messy language names: Korean", - "original": "한êµ\u00adì–´", - "fixed": "한국어", - "expect": "pass" - }, - { - "label": "Messy language name in cp437: Czech", - "comment": "A synthetic example, I suppose, but goes with the other language name tests", - "original": "─îe┼ítina", - "fixed": "Čeština", - "expect": "pass" - }, - { - "label": "Messy language name in cp437: Vietnamese", - "original": "Tiß║┐ng Viß╗çt", - "fixed": "Tiếng Việt", - "expect": "pass" - } -] \ No newline at end of file diff --git a/tests/test-cases/negative.json b/tests/test-cases/negative.json deleted file mode 100644 index dc1e36b1..00000000 --- a/tests/test-cases/negative.json +++ /dev/null @@ -1,216 +0,0 @@ -[ - { - "label": "Negative: Using diaereses as quotation marks in Greek", - "comment": "Examples in this file might be detected as mojibake-like, but should not be changed", - "original": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", - "fixed": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", - "expect": "pass" - }, - { - "label": "Negative: Don't fix a multiplication symbol in quotes", - "original": "higher values (“+” and “×” curves) in the superficial region", - "fixed-encoding": "higher values (“+” and “×” curves) in the superficial region", - "fixed": "higher values (\"+\" and \"×\" curves) in the superficial region", - "expect": "pass" - }, - { - "label": "Sort of negative: this inconsistent mojibake could be Latin-1 or MacRoman, and it was meant to be Latin-1, but it's safest to not decode it as either", - "comment": "issue #202", - "original": "Bremer/Mccoy – DrÃ¥ber", - "fixed": "Bremer/Mccoy – DrÃ¥ber", - "expect": "pass" - }, - { - "label": "Negative: 'è' preceded by a non-breaking space is not a small capital Y", - "original": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", - "fixed": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", - "expect": "pass" - }, - { - "label": "Negative: multiplication sign and ellipsis", - "comment": "Should not turn into a dot below", - "original": "4288×…", - "fixed": "4288×…", - "expect": "pass" - }, - { - "label": "Negative: accents are sometimes used as quotes", - "comment": "Under a previous heuristic, this tested the CESU-8 decoder, which would try to decode it and fail when it hit the end of the string", - "original": "``toda produzida pronta pra assa aí´´", - "fixed": "``toda produzida pronta pra assa aí´´", - "expect": "pass" - }, - { - "label": "Negative: 'Õ' followed by an ellipsis", - "comment": "Should not turn into the Armenian letter Յ", - "original": "HUHLL Õ…", - "fixed": "HUHLL Õ…", - "expect": "pass" - }, - { - "label": "Negative: 'Ê' followed by an ellipsis", - "comment": "Should not turn into a squat reversed esh", - "original": "RETWEET SE VOCÊ…", - "fixed": "RETWEET SE VOCÊ…", - "expect": "pass" - }, - { - "label": "Negative: 'É' followed by an ellipsis", - "comment": "Should not turn into 'MARQUɅ'", - "original": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", - "fixed": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", - "expect": "pass" - }, - { - "label": "Negative: 'Ó' followed by an ellipsis", - "comment": "Should not turn into 'SӅ'", - "original": "TEM QUE SEGUIR, SDV SÓ…", - "fixed": "TEM QUE SEGUIR, SDV SÓ…", - "expect": "pass" - }, - { - "label": "Negative: 'É' followed by a curly apostrophe", - "comment": "Should not turn into 'ZZAJɒs'", - "original": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", - "fixed-encoding": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", - "fixed": "Join ZZAJÉ's Official Fan List and receive news, events, and more!", - "expect": "pass" - }, - { - "label": "Negative: 'é' preceded by curly apostrophe", - "comment": "Should not turn into 'LՎpisode'", - "original": "L’épisode 8 est trop fou ouahh", - "fixed-encoding": "L’épisode 8 est trop fou ouahh", - "fixed": "L'épisode 8 est trop fou ouahh", - "expect": "pass" - }, - { - "label": "Negative: three raised eyebrows or something?", - "comment": "Should not turn into private use character U+F659", - "original": "Ôôô VIDA MINHA", - "fixed": "Ôôô VIDA MINHA", - "expect": "pass" - }, - { - "label": "Negative: copyright sign preceded by non-breaking space", - "comment": "Should not turn into 'ʩ'", - "original": "[x]\u00a0©", - "fixed": "[x]\u00a0©", - "expect": "pass" - }, - { - "label": "Negative: en dash and infinity sign", - "comment": "Should not turn into '2012Ѱ'", - "original": "2012—∞", - "fixed": "2012—∞", - "expect": "pass" - }, - { - "label": "Negative: This Е is a Ukrainian letter, but nothing else is wrong", - "original": "SENSЕ - Oleg Tsedryk", - "fixed": "SENSЕ - Oleg Tsedryk", - "expect": "pass" - }, - { - "label": "Negative: angry face", - "comment": "The face should not turn into '`«'", - "original": "OK??:( `¬´ ):", - "fixed": "OK??:( `¬´ ):", - "expect": "pass" - }, - { - "label": "Negative, synthetic: face with glasses and a raised eyebrow", - "original": "( o¬ô )", - "fixed": "( o¬ô )", - "expect": "pass" - }, - { - "label": "Negative: triangle and degree sign", - "comment": "I'm not really sure what it *is* supposed to be, but it's not 'ơ'", - "original": "∆°", - "fixed": "∆°", - "expect": "pass" - }, - { - "label": "Negative: Portuguese with inverted question mark", - "comment": "Former false positive - it should not turn into 'QUEM ɿ'", - "original": "ESSE CARA AI QUEM É¿", - "fixed": "ESSE CARA AI QUEM É¿", - "expect": "pass" - }, - { - "label": "Negative: Portuguese with acute accents as quotation marks", - "comment": "Former false positive - the end should not turn into a superscript H", - "original": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", - "fixed": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", - "expect": "pass" - }, - { - "label": "Negative: Finnish Ä followed by a non-breaking space", - "comment": "Former false positive - should not become a G with a dot", - "original": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", - "fixed": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", - "expect": "pass" - }, - { - "label": "Negative: multiplying by currency", - "comment": "Former false positive - should not become the Hebrew letter 'final pe'", - "original": "Offering 5×£35 pin ups", - "fixed": "Offering 5×£35 pin ups", - "expect": "pass" - }, - { - "label": "Negative: registered chocolate brand name", - "comment": "Former false positive - should not become the IPA letter 'lezh'", - "original": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", - "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", - "expect": "pass" - }, - { - "label": "Negative: it looks like Windows-1257 mojibake but someone writes their name this way", - "comment": "Should not become a cedilla", - "original": "Connect with Āø on Facebook", - "fixed": "Connect with Āø on Facebook", - "expect": "pass" - }, - { - "label": "Mostly negative: we only need to fix C1 control characters", - "comment": "We should not decode 'é\u0085 ' as '酠'", - "original": "C'est vrai que nous n'en avons pas encore beaucoup parlé\u0085 Tu sais, ça fait de nombreuses années", - "fixed": "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années", - "expect": "pass" - }, - { - "label": "Negative: We don't fix à in all contexts", - "original": "C O N C L U S à O", - "fixed": "C O N C L U S à O", - "expect": "pass" - }, - { - "label": "Negative: Two concatenated strings", - "comment": "Should not turn into 'fratarak᧠141'", - "original": "Oborzos, per. Vahbarz, frataraká§ 141", - "fixed": "Oborzos, per. Vahbarz, frataraká§ 141", - "expect": "pass" - }, - { - "label": "Negative: Indonesian leetspeak", - "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", - "fixed": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", - "expect": "pass" - }, - { - "label": "Negative: math in Unicode", - "comment": "This isn't mojibake, it's an actual equation", - "original": "(-1/2)! = √π", - "fixed": "(-1/2)! = √π", - "expect": "pass" - }, - { - "label": "Negative: Leet line-art", - "comment": "The heuristic before v6 loved to 'fix' this and decode it as 'ôaſaſaſaſa'", - "original": "├┤a┼┐a┼┐a┼┐a┼┐a", - "fixed": "├┤a┼┐a┼┐a┼┐a┼┐a", - "expect": "pass" - } -] \ No newline at end of file diff --git a/tests/test-cases/synthetic.json b/tests/test-cases/synthetic.json deleted file mode 100644 index a9393111..00000000 --- a/tests/test-cases/synthetic.json +++ /dev/null @@ -1,208 +0,0 @@ -[ - { - "label": "Synthetic: we can recognize à in some cases when it's the only mojibake", - "comment": "Examples in this file were made up to test something, instead of found in the wild", - "original": "voilà le travail", - "fixed": "voilà le travail", - "expect": "pass" - }, - { - "label": "Synthetic: we can recognize à at the end of a word when it absorbs a following space", - "original": "voilà le travail", - "fixed": "voilà le travail", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", - "original": "בהודעה", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / MacRoman mojibake", - "original": "◊ë◊î◊ï◊ì◊¢◊î", - "fixed": "בהודעה", - "expect": "pass" - }, - { - "label": "Synthetic: Hebrew UTF-8 / Latin-1 mojibake", - "comment": "This example uses low-numbered codepoints to spell 'ABBA' in Hebrew, so that it falls into the range where Latin-1 is different from Windows-1252. As a bonus, this example looks right even if your RTL text rendering isn't working.", - "original": "×\u0090×\u0091×\u0091×\u0090", - "fixed": "אבבא", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / Windows-1252 mojibake", - "original": "رسالة", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / Windows-1250 mojibake", - "original": "رسالة", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Synthetic: Arabic UTF-8 / MacRoman mojibake", - "original": "ÿ±ÿ≥ÿߟÑÿ©", - "fixed": "رسالة", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Brontë's name does not end with a Korean syllable", - "comment": "The original example of why ftfy needs heuristics", - "original": "I'm not such a fan of Charlotte Brontë…”", - "fixed-encoding": "I'm not such a fan of Charlotte Brontë…”", - "fixed": "I'm not such a fan of Charlotte Brontë…\"", - "expect": "pass" - }, - { - "label": "Synthetic, negative: hypothetical Swedish product name", - "comment": "This used to be a constructed example of a false positive, until you added another symbol", - "original": "AHÅ™, the new sofa from IKEA", - "fixed": "AHÅ™, the new sofa from IKEA", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Ukrainian capital letters", - "comment": "We need to fix Windows-1251 conservatively, or else this decodes as '²ʲ'", - "original": "ВІКІ is Ukrainian for WIKI", - "fixed": "ВІКІ is Ukrainian for WIKI", - "expect": "pass" - }, - { - "label": "Synthetic, negative: don't leak our internal use of byte 0x1A", - "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", - "original": "These control characters \u001a are apparently intentional \u0081", - "fixed-encoding": "These control characters \u001a are apparently intentional \u0081", - "fixed": "These control characters are apparently intentional \u0081", - "expect": "pass" - }, - { - "label": "Synthetic, negative: U+1A on its own", - "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", - "original": "Here's a control character: \u001a", - "fixed-encoding": "Here's a control character: \u001a", - "fixed": "Here's a control character: ", - "expect": "pass" - }, - { - "label": "Synthetic, negative: A-with-circle as an Angstrom sign", - "comment": "Should not turn into '10 ŗ'", - "original": "a radius of 10 Å—", - "fixed": "a radius of 10 Å—", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Spanish with exclamation points on the wrong sides", - "original": "!YO SÉ¡", - "fixed": "!YO SÉ¡", - "expect": "pass" - }, - { - "label": "Synthetic: fix text with backslashes in it", - "comment": "Tests for a regression on a long-ago bug", - "original": "<40\\% vs \u00e2\u0089\u00a540\\%", - "fixed": "<40\\% vs ≥40\\%", - "expect": "pass" - }, - { - "label": "Synthetic: curly quotes with mismatched encoding glitches in Latin-1", - "original": "\u00e2\u0080\u009cmismatched quotes\u0085\u0094", - "fixed-encoding": "“mismatched quotes…”", - "fixed": "\"mismatched quotes…\"", - "expect": "pass" - }, - { - "label": "Synthetic: curly quotes with mismatched encoding glitches in Windows-1252", - "original": "“mismatched quotes…”", - "fixed-encoding": "“mismatched quotes…”", - "fixed": "\"mismatched quotes…\"", - "expect": "pass" - }, - { - "label": "Synthetic: lossy decoding in sloppy-windows-1252", - "original": "“lossy decodingâ€�", - "fixed-encoding": "“lossy decoding�", - "fixed": "\"lossy decoding�", - "expect": "pass" - }, - { - "label": "Synthetic: French word for August in windows-1252", - "original": "août", - "fixed-encoding": "août", - "fixed": "août", - "expect": "pass" - }, - { - "label": "Synthetic: French word for hotel in all-caps windows-1252", - "original": "HÔTEL", - "fixed-encoding": "HÔTEL", - "fixed": "HÔTEL", - "expect": "pass" - }, - { - "label": "Synthetic: Scottish Gaelic word for 'subject' in all-caps windows-1252", - "original": "CÙIS", - "fixed-encoding": "CÙIS", - "fixed": "CÙIS", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Romanian word before a non-breaking space", - "comment": "The word literally means 'not even once', which might be a good recommendation about fixing Romanian mojibake", - "original": "NICIODATĂ\u00a0", - "fixed": "NICIODATĂ\u00a0", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Be careful around curly apostrophes", - "comment": "It shouldn't end up saying 'a lot of Òs'", - "original": "There are a lot of Ã’s in mojibake text", - "fixed-encoding": "There are a lot of Ã’s in mojibake text", - "fixed": "There are a lot of Ã's in mojibake text", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Romanian word before a trademark sign", - "comment": "We would change 'DATÙ' to 'DATÙ' if it passed the badness heuristic", - "original": "NICIODATĂ™", - "fixed": "NICIODATĂ™", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Lithuanian word before a trademark sign", - "comment": "Similar to the above example. Shouldn't turn into U+0619 ARABIC SMALL DAMMA", - "original": "TRANSFORMATORIŲ™", - "fixed": "TRANSFORMATORIŲ™", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Norwegian capitalized nonsense", - "comment": "We're shouting that the island of Håøya is gullible. It should not turn into 'HŨYA ER BLŨYD'.", - "original": "HÅØYA ER BLÅØYD", - "fixed": "HÅØYA ER BLÅØYD", - "expect": "pass" - }, - { - "label": "Synthetic, negative: raised eyebrow kaomoji", - "original": "Ō¬o", - "fixed": "Ō¬o", - "expect": "pass" - }, - { - "label": "Synthetic, negative: Camel-cased Serbian that looks like a UTF-8 / Windows-1251 mixup", - "comment": "I made this text up, but it seems like it means 'HelloDevil'. Could be a username or something.", - "original": "ПоздравЂаво", - "fixed": "ПоздравЂаво", - "expect": "pass" - }, - { - "label": "Synthetic: mojibake with trademark sign at the end of a word", - "comment": "I recall the correct version of this text from a sign in the movie Amélie. Now we can help her twin Amélie, who makes mojibaked signs.", - "original": "OÙ ET QUAND?", - "fixed": "OÙ ET QUAND?", - "expect": "pass" - } -] \ No newline at end of file diff --git a/tests/test_bytes.py b/tests/test_bytes.py index 7abde1f6..f141846d 100644 --- a/tests/test_bytes.py +++ b/tests/test_bytes.py @@ -1,16 +1,15 @@ -import pytest - from ftfy import guess_bytes from ftfy.bad_codecs.utf8_variants import IncrementalDecoder +import pytest -TEST_ENCODINGS = ["utf-16", "utf-8", "sloppy-windows-1252"] + +TEST_ENCODINGS = [ + 'utf-16', 'utf-8', 'sloppy-windows-1252' +] TEST_STRINGS = [ - "Renée\nFleming", - "Noël\nCoward", - "Señor\nCardgage", - "€ • £ • ¥", - "¿Qué?", + 'Renée\nFleming', 'Noël\nCoward', 'Señor\nCardgage', + '€ • £ • ¥', '¿Qué?' ] @@ -21,22 +20,22 @@ def test_guess_bytes(string): assert result_str == string assert result_encoding == encoding - if "\n" in string: - old_mac_bytes = string.replace("\n", "\r").encode("macroman") + if '\n' in string: + old_mac_bytes = string.replace('\n', '\r').encode('macroman') result_str, result_encoding = guess_bytes(old_mac_bytes) - assert result_str == string.replace("\n", "\r") + assert result_str == string.replace('\n', '\r') def test_guess_bytes_null(): - bowdlerized_null = b"null\xc0\x80separated" + bowdlerized_null = b'null\xc0\x80separated' result_str, result_encoding = guess_bytes(bowdlerized_null) - assert result_str == "null\x00separated" - assert result_encoding == "utf-8-variants" + assert result_str == 'null\x00separated' + assert result_encoding == 'utf-8-variants' def test_incomplete_sequences(): - test_bytes = b"surrogates: \xed\xa0\x80\xed\xb0\x80 / null: \xc0\x80" - test_string = "surrogates: \U00010000 / null: \x00" + test_bytes = b'surrogates: \xed\xa0\x80\xed\xb0\x80 / null: \xc0\x80' + test_string = 'surrogates: \U00010000 / null: \x00' # Test that we can feed this string to decode() in multiple pieces, and no # matter where the break between those pieces is, we get the same result. @@ -48,3 +47,4 @@ def test_incomplete_sequences(): got = decoder.decode(left, final=False) got += decoder.decode(right) assert got == test_string + diff --git a/tests/test_cases.json b/tests/test_cases.json new file mode 100644 index 00000000..2c93483f --- /dev/null +++ b/tests/test_cases.json @@ -0,0 +1,931 @@ +[ + { + "label": "Messy language names: Czech", + "comment": "This and several following examples came from the same language selector", + "original": "ÄŒeÅ¡tina", + "fixed": "Čeština", + "expect": "pass" + }, + { + "label": "Messy language names: Gaelic", + "comment": "note that if U+A0 is replaced by a space, it comes out slightly incorrectly as 'Gà idhlig'", + "original": "GÃ\u00a0idhlig", + "fixed": "Gàidhlig", + "expect": "pass" + }, + { + "label": "Messy language names: Lithuanian", + "original": "Lietuvių", + "fixed": "Lietuvių", + "expect": "pass" + }, + { + "label": "Messy language names: Slovak", + "original": "SlovenÄ�ina", + "fixed": "Sloven�ina", + "expect": "pass" + }, + { + "label": "Messy language names: Vietnamese", + "original": "Tiếng Việt", + "fixed": "Tiếng Việt", + "expect": "pass" + }, + { + "label": "Messy language names: Greek", + "original": "Ελληνικά", + "fixed": "Ελληνικά", + "expect": "pass" + }, + { + "label": "Messy language names: Bulgarian", + "original": "българÑ�ки език", + "fixed": "българ�ки език", + "expect": "pass" + }, + { + "label": "Messy language names: Russian", + "original": "РуÑ�Ñ�кий", + "fixed": "Ру��кий", + "expect": "pass" + }, + { + "label": "Messy language names: Serbian [Cyrillic]", + "original": "CрпÑ�ки [ћирилицом]", + "fixed": "Cрп�ки [ћирилицом]", + "expect": "pass" + }, + { + "label": "Messy language names: Hebrew", + "original": "עברית", + "fixed": "עברית", + "expect": "pass" + }, + { + "label": "Messy language names: Russian", + "original": "РуÑ�Ñ�кий", + "fixed": "Ру��кий", + "expect": "pass" + }, + { + "label": "Messy language names: Hindi", + "comment": "My terminal has difficulty rendering the mostly-fixed text", + "original": "हिनà¥�दी", + "fixed": "\u0939\u093f\u0928\ufffd\u0926\u0940", + "expect": "pass" + }, + { + "label": "Messy language names: Tamil", + "comment": "My terminal has difficulty rendering the mostly-fixed text", + "original": "தமிழà¯�", + "fixed": "\u0ba4\u0bae\u0bbf\u0bb4\ufffd", + "expect": "pass" + }, + { + "label": "Messy language names: Thai", + "original": "ภาษาไทย", + "fixed": "ภาษาไทย", + "expect": "pass" + }, + { + "label": "Messy language names: Simplified Chinese", + "original": "简体ä¸\u00adæ–‡", + "fixed": "简体中文", + "expect": "pass" + }, + { + "label": "Messy language names: Traditional Chinese", + "original": "æ\u00ad£é«”ä¸\u00adæ–‡", + "fixed": "正體中文", + "expect": "pass" + }, + { + "label": "Messy language names: Japanese", + "original": "日本語", + "fixed": "日本語", + "expect": "pass" + }, + { + "label": "Messy language names: Korean", + "original": "한êµ\u00adì–´", + "fixed": "한국어", + "expect": "pass" + }, + { + "label": "Low-codepoint emoji", + "original": "He's Justinâ\u009d¤", + "fixed": "He's Justin❤", + "expect": "pass" + }, + { + "label": "UTF-8 / MacRoman mix-up about smurfs", + "original": "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.", + "fixed": "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.", + "expect": "pass" + }, + { + "label": "Checkmark that almost looks okay as mojibake", + "original": "✔ No problems", + "fixed": "✔ No problems", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 Russian mixup about futbol", + "original": "РґРѕСЂРѕРіРµ Р\u0098Р·-РїРѕРґ #футбол", + "fixed": "дороге Из-под #футбол", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in German", + "original": "\u0084Handwerk bringt dich überall hin\u0093: Von der YOU bis nach Monaco", + "fixed-encoding": "„Handwerk bringt dich überall hin“: Von der YOU bis nach Monaco", + "fixed": "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup of the replacement character", + "original": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", + "fixed": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", + "expect": "pass" + }, + { + "label": "CESU-8 / Windows-1252 emoji", + "original": "Hi guys í ½í¸\u008d", + "fixed": "Hi guys 😍", + "expect": "pass" + }, + { + "label": "CESU-8 / Latin-1 emoji", + "original": "hihi RT username: â\u0098ºí ½í¸\u0098", + "fixed": "hihi RT username: ☺😘", + "expect": "pass" + }, + { + "label": "Latin-1 / Windows-1252 mixup in Turkish", + "original": "Beta Haber: Hırsızı Büyü Korkuttu", + "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 mixed up twice in Russian", + "original": "приятности. РІСњВ¤", + "fixed": "приятности. ❤", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up twice in Malay", + "original": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â\u009d.", + "fixed-encoding": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romance”.", + "fixed": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up twice in naming Iggy Pop", + "original": "Iggy Pop (né Jim Osterberg)", + "fixed": "Iggy Pop (né Jim Osterberg)", + "expect": "pass" + }, + { + "label": "Left quote is UTF-8, right quote is Latin-1, both encoded in Windows-1252", + "original": "Direzione Pd, ok â\u0080\u009csenza modifiche\u0094 all'Italicum.", + "fixed-encoding": "Direzione Pd, ok “senza modifiche” all'Italicum.", + "fixed": "Direzione Pd, ok \"senza modifiche\" all'Italicum.", + "expect": "pass" + }, + { + "label": "UTF-8 / sloppy Windows-1252 mixed up twice in a triumphant emoticon", + "original": "selamat berpuasa sob (Ã\u00a0¸‡'̀⌣'ÃŒÂ\u0081)Ã\u00a0¸‡", + "fixed": "selamat berpuasa sob (ง'̀⌣'́)ง", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixed up three times", + "original": "The Mona Lisa doesn’t have eyebrows.", + "fixed-encoding": "The Mona Lisa doesn’t have eyebrows.", + "fixed": "The Mona Lisa doesn't have eyebrows.", + "expect": "pass" + }, + { + "label": "UTF-8 / Codepage 437 mixup in Russian", + "original": "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡", + "fixed": "#правильноепитание", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in French", + "original": "Hôtel de Police", + "fixed": "Hôtel de Police", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1250 mixup in French", + "original": "Liège Avenue de l'HĂ´pital", + "fixed": "Liège Avenue de l'Hôpital", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in Vietnamese", + "original": "Tại sao giá hạt sầu riêng lại lên giá?", + "fixed": "Tại sao giá hạt sầu riêng lại lên giá?", + "expect": "pass" + }, + { + "label": "Negative: using diaereses as quotation marks in Greek", + "original": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", + "fixed": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", + "expect": "pass" + }, + { + "label": "Science! Mid-word Greek letter gets fixed correctly", + "original": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", + "fixed": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", + "expect": "pass" + }, + { + "label": "Negative: More science! Don't fix a multiplication symbol in quotes", + "original": "higher values (“+” and “×” curves) in the superficial region", + "fixed-encoding": "higher values (“+” and “×” curves) in the superficial region", + "fixed": "higher values (\"+\" and \"×\" curves) in the superficial region", + "expect": "pass" + }, + { + "label": "For goodness' sake. We can come close to fixing this, but fail in the last step", + "original": "ItÃ?¢â?¬â?¢s classic. ItÃ?¢â?¬â?¢s epic. ItÃ?¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?¢â?¬â?¢ sake!", + "fixed": "It�¢��s classic. It�¢��s epic. It�¢��s ELIZABETH BENNET for goodness�¢�� sake!", + "expect": "pass" + }, + { + "label": "lossy UTF-8 / Windows-1250 mixup in Spanish", + "original": "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂ­a", + "fixed": "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía", + "expect": "pass" + }, + { + "label": "UTF-8 / sloppy Windows-1250 mixup in English", + "original": "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", + "expect": "pass" + }, + { + "label": "The same text as above, but as a UTF-8 / ISO-8859-2 mixup", + "original": "It was namedÂ\u00a0â\u0080\u009escars´ stonesâ\u0080\u009c after the rock-climbers who got hurt while climbing on it.", + "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", + "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", + "comment": "A difficult test case that can depend on the order that steps are applied", + "original": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "fixed-encoding": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "fixed": "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", + "expect": "pass" + }, + { + "label": "Synthetic: Incomplete UTF-8 / Windows-1252 mixup in Arabic", + "comment": "I find text like this in OSCAR a fair amount, but couldn't isolate a good example that tested digits. The intended text means 'more than 100 countries'.", + "original": "أكثر من Ù Ù Ù¡ بلد", + "fixed": "أكثر من ٠٠١ بلد", + "expect": "fail" + }, + { + "label": "UTF-8 / sloppy Windows-1250 mixup in Romanian", + "original": "vedere Ă®nceĹŁoĹźatÄ\u0083", + "fixed": "vedere înceţoşată", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1250 mixup in Slovak", + "original": "NapĂ\u00adšte nám !", + "fixed": "Napíšte nám !", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 followed by UTF-8 / Windows-1251", + "original": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", + "fixed": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", + "expect": "pass" + }, + { + "label": "fancy Unicode crossing-out, but mojibaked", + "original": "hotel $49 $̶6̶3̶ updated 2018", + "fixed": "hotel $49 $̶6̶3̶ updated 2018", + "expect": "pass" + }, + { + "label": "A face with UTF-8 / sloppy Windows-1252 mixed up twice", + "original": "ââ€\u009d’(⌣˛⌣)ââ€\u009dŽ", + "fixed": "┒(⌣˛⌣)┎", + "expect": "pass" + }, + { + "label": "We can mostly decode the face above when we lose the character U+009D", + "original": "ââ€�’(⌣˛⌣)ââ€�Ž", + "fixed": "�(⌣˛⌣)�", + "expect": "pass" + }, + { + "label": "Lossy decoding can have plain ASCII question marks, as well", + "original": "The ICR has been upgraded to “bb+â€? from “bbâ€?", + "fixed-encoding": "The ICR has been upgraded to “bb+� from “bb�", + "fixed": "The ICR has been upgraded to \"bb+� from \"bb�", + "expect": "pass" + }, + { + "label": "CESU-8 / Latin-1 mixup over several emoji", + "comment": "You tried", + "original": "I just figured out how to tweet emojis! â\u009a½í\u00a0½í¸\u0080í\u00a0½í¸\u0081í\u00a0½í¸\u0082í\u00a0½í¸\u0086í\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008e", + "fixed": "I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎", + "expect": "pass" + }, + { + "label": "Two levels of inconsistent mojibake", + "comment": "The en-dash was mojibaked in UTF-8 / Windows-1252 as three characters, two of which were mojibaked again as Windows-1252 / Latin-1, and the third of which was mojibaked as UTF-8 / Latin-1. Unfortunately, if we fix this, we leave ourselves room to greedily 'decode' random Han characters in complex Latin-alphabet mojibake", + "original": "Arsenal v Wolfsburg: pre-season friendly â\u0080â\u0080\u009c live!", + "fixed": "Arsenal v Wolfsburg: pre-season friendly – live!", + "expect": "fail" + }, + { + "label": "An absolutely hopeless garble", + "comment": "If we try too hard to decode this, we'll recursively apply `decode_inconsistent_utf8` until the characters turn into random Han and katakana characters.", + "original": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", + "fixed-encoding": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", + "fixed": "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â", + "expect": "pass" + }, + { + "label": "Inconsistent UTF-8 / Latin-1 mojibake", + "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085", + "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", + "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", + "expect": "pass" + }, + { + "label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set", + "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…", + "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", + "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", + "expect": "pass" + }, + { + "label": "Inconsistent mojibake in Portuguese", + "original": "Campeonatos > III Divisão - Série F > Jornadas Classificação", + "fixed": "Campeonatos > III Divisão - Série F > Jornadas Classificação", + "expect": "pass" + }, + { + "label": "Handle Afrikaans 'n character", + "original": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", + "fixed-encoding": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", + "fixed": "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.", + "expect": "pass" + }, + { + "label": "Handle Croatian single-codepoint digraphs", + "original": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "fixed-encoding": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "fixed": "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", + "expect": "pass" + }, + { + "label": "A with an acute accent, in isolation", + "original": "Nicolás", + "fixed": "Nicolás", + "expect": "pass" + }, + { + "label": "Negative: 'è' preceded by a non-breaking space is not a small capital Y", + "original": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", + "fixed": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", + "expect": "pass" + }, + { + "label": "Negative: multiplication sign and ellipsis", + "comment": "Should not turn into a dot below", + "original": "4288×…", + "fixed": "4288×…", + "expect": "pass" + }, + { + "label": "Negative: accents are sometimes used as quotes", + "comment": "Under a previous heuristic, this tested the CESU-8 decoder, which would try to decode it and fail when it hit the end of the string", + "original": "``toda produzida pronta pra assa aí´´", + "fixed": "``toda produzida pronta pra assa aí´´", + "expect": "pass" + }, + { + "label": "Negative: 'Õ' followed by an ellipsis", + "comment": "Should not turn into the Armenian letter Յ", + "original": "HUHLL Õ…", + "fixed": "HUHLL Õ…", + "expect": "pass" + }, + { + "label": "Negative: 'Ê' followed by an ellipsis", + "comment": "Should not turn into a squat reversed esh", + "original": "RETWEET SE VOCÊ…", + "fixed": "RETWEET SE VOCÊ…", + "expect": "pass" + }, + { + "label": "Negative: 'É' followed by an ellipsis", + "comment": "Should not turn into 'MARQUɅ'", + "original": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", + "fixed": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", + "expect": "pass" + }, + { + "label": "Negative: 'Ó' followed by an ellipsis", + "comment": "Should not turn into 'SӅ'", + "original": "TEM QUE SEGUIR, SDV SÓ…", + "fixed": "TEM QUE SEGUIR, SDV SÓ…", + "expect": "pass" + }, + { + "label": "Negative: 'É' followed by a curly apostrophe", + "comment": "Should not turn into 'ZZAJɒs'", + "original": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", + "fixed-encoding": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", + "fixed": "Join ZZAJÉ's Official Fan List and receive news, events, and more!", + "expect": "pass" + }, + { + "label": "Negative: 'é' preceded by curly apostrophe", + "comment": "Should not turn into 'LՎpisode'", + "original": "L’épisode 8 est trop fou ouahh", + "fixed-encoding": "L’épisode 8 est trop fou ouahh", + "fixed": "L'épisode 8 est trop fou ouahh", + "expect": "pass" + }, + { + "label": "Negative: three raised eyebrows or something?", + "comment": "Should not turn into private use character U+F659", + "original": "Ôôô VIDA MINHA", + "fixed": "Ôôô VIDA MINHA", + "expect": "pass" + }, + { + "label": "Negative: copyright sign preceded by non-breaking space", + "comment": "Should not turn into 'ʩ'", + "original": "[x]\u00a0©", + "fixed": "[x]\u00a0©", + "expect": "pass" + }, + { + "label": "Negative: en dash and infinity sign", + "comment": "Should not turn into '2012Ѱ'", + "original": "2012—∞", + "fixed": "2012—∞", + "expect": "pass" + }, + { + "label": "Negative: This Е is a Ukrainian letter, but nothing else is wrong", + "original": "SENSЕ - Oleg Tsedryk", + "fixed": "SENSЕ - Oleg Tsedryk", + "expect": "pass" + }, + { + "label": "Negative: angry face", + "comment": "The face should not turn into '`«'", + "original": "OK??:( `¬´ ):", + "fixed": "OK??:( `¬´ ):", + "expect": "pass" + }, + { + "label": "Negative, synthetic: face with glasses and a raised eyebrow", + "original": "( o¬ô )", + "fixed": "( o¬ô )", + "expect": "pass" + }, + { + "label": "Negative: triangle and degree sign", + "comment": "I'm not really sure what it *is* supposed to be, but it's not 'ơ'", + "original": "∆°", + "fixed": "∆°", + "expect": "pass" + }, + { + "label": "Negative: Portuguese with inverted question mark", + "comment": "Former false positive - it should not turn into 'QUEM ɿ'", + "original": "ESSE CARA AI QUEM É¿", + "fixed": "ESSE CARA AI QUEM É¿", + "expect": "pass" + }, + { + "label": "Negative: Portuguese with acute accents as quotation marks", + "comment": "Former false positive - the end should not turn into a superscript H", + "original": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", + "fixed": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", + "expect": "pass" + }, + { + "label": "Negative: Finnish Ä followed by a non-breaking space", + "comment": "Former false positive - should not become a G with a dot", + "original": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", + "fixed": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", + "expect": "pass" + }, + { + "label": "Negative: multiplying by currency", + "comment": "Former false positive - should not become the Hebrew letter 'final pe'", + "original": "Offering 5×£35 pin ups", + "fixed": "Offering 5×£35 pin ups", + "expect": "pass" + }, + { + "label": "Negative: registered chocolate brand name", + "comment": "Former false positive - should not become the IPA letter 'lezh'", + "original": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", + "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", + "expect": "pass" + }, + { + "label": "Mostly negative: we only need to fix C1 control characters", + "comment": "We should not decode 'é\u0085 ' as '酠'", + "original": "C'est vrai que nous n'en avons pas encore beaucoup parlé\u0085 Tu sais, ça fait de nombreuses années", + "fixed": "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années", + "expect": "pass" + }, + { + "label": "French example containing non-breaking spaces", + "original": "ART TRIP Ã\u00a0 l'office de tourisme", + "fixed": "ART TRIP à l'office de tourisme", + "expect": "pass" + }, + { + "label": "English example in UTF-8 / Windows-1251 with a ligature", + "original": "This is signiп¬Ѓcantly lower than the respective share", + "fixed-encoding": "This is significantly lower than the respective share", + "fixed": "This is significantly lower than the respective share", + "expect": "pass" + }, + { + "label": "Synthetic: we can recognize à in some cases when it's the only mojibake", + "original": "voilà le travail", + "fixed": "voilà le travail", + "expect": "pass" + }, + { + "label": "Synthetic: we can recognize à at the end of a word when it absorbs a following space", + "original": "voilà le travail", + "fixed": "voilà le travail", + "expect": "pass" + }, + { + "label": "Negative: We don't fix à in all contexts", + "original": "C O N C L U S à O", + "fixed": "C O N C L U S à O", + "expect": "pass" + }, + { + "label": "'à' remains its own word, even if spaces after it get coalesced into one", + "original": "à perturber la réflexion des théologiens jusqu'à nos jours", + "fixed": "à perturber la réflexion des théologiens jusqu'à nos jours", + "expect": "pass" + }, + { + "label": "Fix 'à' in inconsistent mojibake", + "original": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", + "fixed-encoding": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", + "fixed": "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation", + "expect": "pass" + }, + { + "label": "The Portuguese word 'às' does not become 'à s' due to the French fix", + "original": "com especial atenção à s crianças", + "fixed": "com especial atenção às crianças", + "expect": "pass" + }, + { + "label": "This is why we require a space after the 's' in 'às'", + "original": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", + "fixed": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", + "expect": "pass" + }, + { + "label": "We can fix 'à' in windows-1251 sometimes as well", + "original": "La rГ©gion de Dnepropetrovsk se trouve Г l’ouest de l’Ukraine", + "fixed-encoding": "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine", + "fixed": "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine", + "expect": "pass" + }, + { + "label": "'à quele' is the Portuguese word 'àquele', not 'à quele'", + "original": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante à quele observado nas lesões por imunocomplexo em excesso de anticorpos", + "fixed": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos", + "expect": "pass" + }, + { + "label": "A complex, lossy pile-up of mojibake in Portuguese", + "original": "â € ðŸ“� Regulamento: â € âš ï¸� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. âš ï¸� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até à s 19h do mesmo dia em uma nova publicação em nosso instagram. â € Boa sorte!!! 😀ðŸ�°", + "fixed": "⠀ �\u00a0Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!!\u00a0😀�", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1252 mixup in Gaelic involving non-breaking spaces", + "original": "CÃ\u00a0nan nan GÃ\u00a0idheal", + "fixed": "Cànan nan Gàidheal", + "expect": "pass" + }, + { + "label": "Misleading mix-up in Spanish", + "comment": "The original text has mojibake, but the sequence 'á \u0093' can decode as U+1813 MONGOLIAN DIGIT THREE, when the whole string should really just decode as a Latin-1/Windows-1252 mixup", + "original": "tiene demora y está \u0093próximo a resolverse\u0094", + "fixed": "tiene demora y está \"próximo a resolverse\"", + "expect": "fail" + }, + { + "label": "A-with-grave in Vietnamese", + "comment": "Currently adds extra spaces that shouldn't be there", + "original": "Xem clip hĂ i, phim hĂ i má»›i hay nhất", + "fixed": "Xem clip hài, phim hài mới hay nhất", + "expect": "fail" + }, + { + "label": "Punctuation pile-up should actually be musical notes", + "original": "Engkau masih yg terindah, indah di dalam hatiku♫~", + "fixed": "Engkau masih yg terindah, indah di dalam hatiku♫~", + "expect": "pass" + }, + { + "label": "Latin-1 / MacRoman mixup in Spanish", + "comment": "Requires something like encoding detection", + "original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.", + "fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.", + "expect": "fail" + }, + { + "label": "subtle UTF-8 / codepage 437 mixup in Spanish", + "original": "┬┐que diferencia hay?", + "fixed": "¿que diferencia hay?", + "expect": "fail" + }, + { + "label": "Latin-1 / MacRoman mixup in Spanish, 2 characters", + "comment": "Requires something like encoding detection", + "original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente", + "fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente", + "expect": "fail" + }, + { + "label": "An example with 'à' in windows-1251 where we need our heuristic to be bolder", + "original": "faites attention Г bien vous renseigner avant sur le mГ©dicament", + "fixed": "faites attention à bien vous renseigner avant sur le médicament", + "expect": "fail" + }, + { + "label": "UTF-8 / Windows-1251 mixup in tweet spam", + "original": "Blog Traffic Tip 2 – Broadcast Email Your Blog", + "fixed": "Blog Traffic Tip 2 – Broadcast Email Your Blog", + "expect": "pass" + }, + { + "label": "UTF-8 / Windows-1251 mixup", + "original": "S&P Confirms Ukrsotsbank’s “B-“ Rating", + "fixed-encoding": "S&P Confirms Ukrsotsbank’s “B-“ Rating", + "fixed": "S&P Confirms Ukrsotsbank's \"B-\" Rating", + "expect": "pass" + }, + { + "label": "Dutch example with ë", + "comment": "from issue reported by MicroJackson", + "original": "ongeëvenaard", + "fixed-encoding": "ongeëvenaard", + "fixed": "ongeëvenaard", + "expect": "pass" + }, + { + "label": "Negative: Indonesian leetspeak", + "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", + "fixed": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", + "expect": "pass" + }, + { + "label": "Three layers of UTF-8 / MacRoman mixup in French", + "comment": "You're welcome", + "original": "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8", + "fixed": "Merci de télécharger le plug-in Flash Player 8", + "expect": "pass" + }, + { + "label": "UTF-8 / MacRoman mixup in French", + "original": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ķ", + "fixed": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…", + "expect": "pass" + }, + { + "label": "Italian UTF-8 / MacRoman example with ò", + "original": "Le Vigne di Zam√≤", + "fixed": "Le Vigne di Zamò", + "expect": "pass" + }, + { + "label": "Italian UTF-8 / MacRoman mojibake that looks like math", + "comment": "False negative: 'pi√π' is a bit too reasonable to fix", + "original": "Sarai ricontattato dal nostro Esperto al pi√π presto.", + "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", + "expect": "fail" + }, + { + "label": "Hebrew UTF-8 / Windows-1252 mojibake", + "comment": "reported by SuperIRabbit as issue #158", + "original": "בהודעה", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", + "original": "בהודעה", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / MacRoman mojibake", + "original": "◊ë◊î◊ï◊ì◊¢◊î", + "fixed": "בהודעה", + "expect": "pass" + }, + { + "label": "Synthetic: Hebrew UTF-8 / Latin-1 mojibake", + "comment": "This example uses low-numbered codepoints to spell 'ABBA' in Hebrew, so that it falls into the range where Latin-1 is different from Windows-1252. As a bonus, this example looks right even if your RTL text rendering isn't working.", + "original": "×\u0090×\u0091×\u0091×\u0090", + "fixed": "אבבא", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / Windows-1252 mojibake", + "original": "رسالة", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / Windows-1250 mojibake", + "original": "رسالة", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Synthetic: Arabic UTF-8 / MacRoman mojibake", + "original": "ÿ±ÿ≥ÿߟÑÿ©", + "fixed": "رسالة", + "expect": "pass" + }, + { + "label": "Negative: math in Unicode", + "comment": "This isn't mojibake, it's an actual equation", + "original": "(-1/2)! = √π", + "fixed": "(-1/2)! = √π", + "expect": "pass" + }, + { + "label": "Negative: Leet line-art", + "comment": "The heuristic before v6 loved to 'fix' this and decode it as 'ôaſaſaſaſa'", + "original": "├┤a┼┐a┼┐a┼┐a┼┐a", + "fixed": "├┤a┼┐a┼┐a┼┐a┼┐a", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Brontë's name does not end with a Korean syllable", + "comment": "The original example of why ftfy needs heuristics", + "original": "I'm not such a fan of Charlotte Brontë…”", + "fixed-encoding": "I'm not such a fan of Charlotte Brontë…”", + "fixed": "I'm not such a fan of Charlotte Brontë…\"", + "expect": "pass" + }, + { + "label": "Synthetic, negative: hypothetical Swedish product name", + "comment": "This used to be a constructed example of a false positive, until you added another symbol", + "original": "AHÅ™, the new sofa from IKEA", + "fixed": "AHÅ™, the new sofa from IKEA", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Ukrainian capital letters", + "comment": "We need to fix Windows-1251 conservatively, or else this decodes as '²ʲ'", + "original": "ВІКІ is Ukrainian for WIKI", + "fixed": "ВІКІ is Ukrainian for WIKI", + "expect": "pass" + }, + { + "label": "Synthetic, negative: don't leak our internal use of byte 0x1A", + "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", + "original": "These control characters \u001a are apparently intentional \u0081", + "fixed-encoding": "These control characters \u001a are apparently intentional \u0081", + "fixed": "These control characters are apparently intentional \u0081", + "expect": "pass" + }, + { + "label": "Synthetic, negative: U+1A on its own", + "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", + "original": "Here's a control character: \u001a", + "fixed-encoding": "Here's a control character: \u001a", + "fixed": "Here's a control character: ", + "expect": "pass" + }, + { + "label": "Synthetic, negative: A-with-circle as an Angstrom sign", + "comment": "Should not turn into '10 ŗ'", + "original": "a radius of 10 Å—", + "fixed": "a radius of 10 Å—", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Spanish with exclamation points on the wrong sides", + "original": "!YO SÉ¡", + "fixed": "!YO SÉ¡", + "expect": "pass" + }, + { + "label": "Synthetic: fix text with backslashes in it", + "comment": "Tests for a regression on a long-ago bug", + "original": "<40\\% vs \u00e2\u0089\u00a540\\%", + "fixed": "<40\\% vs ≥40\\%", + "expect": "pass" + }, + { + "label": "Synthetic: curly quotes with mismatched encoding glitches in Latin-1", + "original": "\u00e2\u0080\u009cmismatched quotes\u0085\u0094", + "fixed-encoding": "“mismatched quotes…”", + "fixed": "\"mismatched quotes…\"", + "expect": "pass" + }, + { + "label": "Synthetic: curly quotes with mismatched encoding glitches in Windows-1252", + "original": "“mismatched quotes…”", + "fixed-encoding": "“mismatched quotes…”", + "fixed": "\"mismatched quotes…\"", + "expect": "pass" + }, + { + "label": "Synthetic: lossy decoding in sloppy-windows-1252", + "original": "“lossy decodingâ€�", + "fixed-encoding": "“lossy decoding�", + "fixed": "\"lossy decoding�", + "expect": "pass" + }, + { + "label": "Synthetic: French word for August in windows-1252", + "original": "août", + "fixed-encoding": "août", + "fixed": "août", + "expect": "pass" + }, + { + "label": "Synthetic: French word for hotel in all-caps windows-1252", + "original": "HÔTEL", + "fixed-encoding": "HÔTEL", + "fixed": "HÔTEL", + "expect": "pass" + }, + { + "label": "Synthetic: Scottish Gaelic word for 'subject' in all-caps windows-1252", + "original": "CÙIS", + "fixed-encoding": "CÙIS", + "fixed": "CÙIS", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Romanian word before a non-breaking space", + "comment": "The word literally means 'not even once', which might be a good recommendation about fixing Romanian mojibake", + "original": "NICIODATĂ\u00a0", + "fixed": "NICIODATĂ\u00a0", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Be careful around curly apostrophes", + "comment": "It shouldn't end up saying 'a lot of Òs'", + "original": "There are a lot of Ã’s in mojibake text", + "fixed-encoding": "There are a lot of Ã’s in mojibake text", + "fixed": "There are a lot of Ã's in mojibake text", + "expect": "pass" + }, + { + "label": "Synthetic, negative: Romanian word before a trademark sign", + "comment": "We would change 'DATÙ' to 'DATÙ' if it passed the badness heuristic", + "original": "NICIODATĂ™", + "fixed": "NICIODATĂ™", + "expect": "pass" + }, + { + "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", + "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. Searching for similar real text yields a lot of examples that actually come out fine.", + "original": "MISUTÂ\u00a0AJIKKO", + "fixed": "MISUTÂ\u00a0AJIKKO", + "expect": "fail" + }, + { + "label": "Synthetic, negative: Camel-cased Serbian that looks like a UTF-8 / Windows-1251 mixup", + "comment": "I made this text up, but it seems like it means 'HelloDevil'. Could be a username or something.", + "original": "ПоздравЂаво", + "fixed": "ПоздравЂаво", + "expect": "pass" + }, + { + "label": "Synthetic: mojibake with trademark sign at the end of a word", + "comment": "I recall the correct version of this text from a sign in the movie Amélie. Now we can help her twin Amélie, who makes mojibaked signs.", + "original": "OÙ ET QUAND?", + "fixed": "OÙ ET QUAND?", + "expect": "pass" + } +] diff --git a/tests/test_characters.py b/tests/test_characters.py index be28aa2f..91fa872b 100644 --- a/tests/test_characters.py +++ b/tests/test_characters.py @@ -1,20 +1,21 @@ from ftfy import ( - fix_and_explain, - fix_encoding, - fix_text, + fix_encoding, fix_encoding_and_explain, fix_text, fix_and_explain, apply_plan ) +from ftfy.fixes import remove_control_chars, fix_surrogates from ftfy.chardata import possible_encoding -from ftfy.fixes import fix_surrogates, remove_control_chars +from ftfy.badness import badness +import unicodedata +import sys def test_possible_encoding(): for codept in range(256): char = chr(codept) - assert possible_encoding(char, "latin-1") + assert possible_encoding(char, 'latin-1') def test_byte_order_mark(): - assert fix_encoding("") == "\ufeff" + assert fix_encoding('') == '\ufeff' def test_control_chars(): @@ -42,8 +43,8 @@ def test_ohio_flag(): def test_surrogates(): - assert fix_surrogates("\udbff\udfff") == "\U0010ffff" - assert fix_surrogates("\ud800\udc00") == "\U00010000" + assert fix_surrogates('\udbff\udfff') == '\U0010ffff' + assert fix_surrogates('\ud800\udc00') == '\U00010000' def test_color_escapes(): @@ -52,5 +53,5 @@ def test_color_escapes(): assert fixed == "foo" assert plan == [ ("apply", "remove_terminal_escapes"), - ("apply", "remove_control_chars"), + ("apply", "remove_control_chars") ] diff --git a/tests/test_cli.py b/tests/test_cli.py index a862e31d..3b27075b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,67 +1,64 @@ -import os import subprocess -from pathlib import Path - +import os import pytest -# Get the filename of 'face.txt', an example of mojibake -THIS_DIR = Path(__file__).parent -TEST_FILENAME = THIS_DIR / "face.txt" -CORRECT_OUTPUT = os.linesep.join(["┒(⌣˛⌣)┎", ""]) -FAILED_OUTPUT = os.linesep.join( - [ - "ftfy error:", - "This input couldn't be decoded as 'windows-1252'. We got the following error:", - "", - " 'charmap' codec can't decode byte 0x9d in position 4: character maps to ", - "", - "ftfy works best when its input is in a known encoding. You can use `ftfy -g`", - "to guess, if you're desperate. Otherwise, give the encoding name with the", - "`-e` option, such as `ftfy -e latin-1`.", - "", - ] -) + +# Get the filename of 'halibote.txt', which contains some mojibake about +# Harry Potter in Chinese +THIS_DIR = os.path.dirname(__file__) +TEST_FILENAME = os.path.join(THIS_DIR, 'halibote.txt') +CORRECT_OUTPUT = os.linesep.join(['【更新】《哈利波特》石堧卜才新婚娶初戀今痠逝', '']) +FAILED_OUTPUT = os.linesep.join([ + "ftfy error:", + "This input couldn't be decoded as 'windows-1252'. We got the following error:", + "", + " 'charmap' codec can't decode byte 0x90 in position 5: character maps to ", + "", + "ftfy works best when its input is in a known encoding. You can use `ftfy -g`", + "to guess, if you're desperate. Otherwise, give the encoding name with the", + "`-e` option, such as `ftfy -e latin-1`.", + "", +]) def get_command_output(args, stdin=None): - return subprocess.check_output(args, stdin=stdin, stderr=subprocess.STDOUT, timeout=5).decode( - "utf-8" - ) + return subprocess.check_output(args, stdin=stdin, stderr=subprocess.STDOUT, timeout=5).decode('utf-8') def test_basic(): - output = get_command_output(["ftfy", TEST_FILENAME]) + output = get_command_output(['ftfy', TEST_FILENAME]) assert output == CORRECT_OUTPUT def test_guess_bytes(): - output = get_command_output(["ftfy", "-g", TEST_FILENAME]) + output = get_command_output(['ftfy', '-g', TEST_FILENAME]) assert output == CORRECT_OUTPUT def test_alternate_encoding(): # The file isn't really in Windows-1252. But that's a problem ftfy # can fix, if it's allowed to be sloppy when reading the file. - output = get_command_output(["ftfy", "-e", "sloppy-windows-1252", TEST_FILENAME]) + output = get_command_output(['ftfy', '-e', 'sloppy-windows-1252', TEST_FILENAME]) assert output == CORRECT_OUTPUT def test_wrong_encoding(): # It's more of a problem when the file doesn't actually decode. with pytest.raises(subprocess.CalledProcessError) as exception: - get_command_output(["ftfy", "-e", "windows-1252", TEST_FILENAME]) - assert exception.value.output.decode("utf-8") == FAILED_OUTPUT + get_command_output(['ftfy', '-e', 'windows-1252', TEST_FILENAME]) + assert exception.value.output.decode('utf-8') == FAILED_OUTPUT def test_same_file(): with pytest.raises(subprocess.CalledProcessError) as exception: - get_command_output(["ftfy", TEST_FILENAME, "-o", TEST_FILENAME]) - error = exception.value.output.decode("utf-8") + get_command_output(['ftfy', TEST_FILENAME, '-o', TEST_FILENAME]) + error = exception.value.output.decode('utf-8') assert error.startswith("ftfy error:") assert "Can't read and write the same file" in error def test_stdin(): - with TEST_FILENAME.open("rb") as infile: - output = get_command_output(["ftfy"], stdin=infile) + with open(TEST_FILENAME, 'rb') as infile: + output = get_command_output(['ftfy'], stdin=infile) assert output == CORRECT_OUTPUT + diff --git a/tests/test_encodings.py b/tests/test_encodings.py index c3c9c2e4..2af573c7 100644 --- a/tests/test_encodings.py +++ b/tests/test_encodings.py @@ -2,19 +2,20 @@ def test_cesu8(): - cls1 = bad_codecs.search_function("cesu8").__class__ - cls2 = bad_codecs.search_function("cesu-8").__class__ + cls1 = bad_codecs.search_function('cesu8').__class__ + cls2 = bad_codecs.search_function('cesu-8').__class__ assert cls1 == cls2 - test_bytes = b"\xed\xa6\x9d\xed\xbd\xb7 is an unassigned character, and \xc0\x80 is null" - test_text = "\U00077777 is an unassigned character, and \x00 is null" - assert test_bytes.decode("cesu8") == test_text + test_bytes = (b'\xed\xa6\x9d\xed\xbd\xb7 is an unassigned character, ' + b'and \xc0\x80 is null') + test_text = '\U00077777 is an unassigned character, and \x00 is null' + assert test_bytes.decode('cesu8') == test_text def test_russian_crash(): - thebytes = b"\xe8\xed\xe2\xe5\xed\xf2\xe0\xf0\xe8\xe7\xe0\xf6\xe8\xff " + thebytes = b'\xe8\xed\xe2\xe5\xed\xf2\xe0\xf0\xe8\xe7\xe0\xf6\xe8\xff ' # We don't care what the result is, but this shouldn't crash - thebytes.decode("utf-8-variants", "replace") - + thebytes.decode('utf-8-variants', 'replace') + # This shouldn't crash either guess_bytes(thebytes) diff --git a/tests/test_entities.py b/tests/test_entities.py index 62b3d673..f05880ee 100644 --- a/tests/test_entities.py +++ b/tests/test_entities.py @@ -1,45 +1,35 @@ -import pytest - from ftfy import fix_text, fix_text_segment from ftfy.fixes import unescape_html def test_entities(): - example = "&\n\n&" - assert fix_text(example) == "&\n\n&" - assert fix_text_segment(example) == "&\n\n&" - - assert fix_text(example, unescape_html=True) == "&\n\n&" - assert fix_text_segment(example, unescape_html=True) == "&\n\n&" - - assert fix_text(example, unescape_html=False) == "&\n\n&" - assert fix_text_segment(example, unescape_html=False) == "&\n\n&" - - assert fix_text_segment("<>", unescape_html=False) == "<>" - assert fix_text_segment("<>", unescape_html=True) == "<>" - assert fix_text_segment("<>") == "<>" - assert fix_text_segment("jednocześnie") == "jednocześnie" - assert fix_text_segment("JEDNOCZEŚNIE") == "JEDNOCZEŚNIE" - assert fix_text_segment("ellipsis…", normalization="NFKC") == "ellipsis..." - assert fix_text_segment("ellipsis…", normalization="NFKC") == "ellipsis..." - assert fix_text_segment("broken") == "broken\x81" - assert fix_text_segment("&amp;amp;") == "&" - assert unescape_html("euro €") == "euro €" - assert unescape_html("EURO &EURO;") == "EURO €" - assert unescape_html("not an entity x6;") == "not an entity x6;" - assert unescape_html("JEDNOCZE&SACUTE;NIE") == "JEDNOCZEŚNIE" - assert unescape_html("V&SCARON;ICHNI") == "VŠICHNI" - assert unescape_html("￿") == "" - assert unescape_html("�") == "\ufffd" + example = '&\n\n&' + assert fix_text(example) == '&\n\n&' + assert fix_text_segment(example) == '&\n\n&' + + assert fix_text(example, unescape_html=True) == '&\n\n&' + assert fix_text_segment(example, unescape_html=True) == '&\n\n&' + + assert fix_text(example, unescape_html=False) == '&\n\n&' + assert fix_text_segment(example, unescape_html=False) == '&\n\n&' + + assert fix_text_segment('<>', unescape_html=False) == '<>' + assert fix_text_segment('<>', unescape_html=True) == '<>' + assert fix_text_segment('<>') == '<>' + assert fix_text_segment('jednocześnie') == 'jednocześnie' + assert fix_text_segment('JEDNOCZEŚNIE') == 'JEDNOCZEŚNIE' + assert fix_text_segment('ellipsis…', normalization='NFKC') == 'ellipsis...' + assert fix_text_segment('ellipsis…', normalization='NFKC') == 'ellipsis...' + assert fix_text_segment('broken') == 'broken\x81' + assert fix_text_segment('&amp;amp;') == '&' + assert unescape_html('euro €') == 'euro €' + assert unescape_html('EURO &EURO;') == 'EURO €' + assert unescape_html('not an entity x6;') == 'not an entity x6;' + assert unescape_html('JEDNOCZE&SACUTE;NIE') == 'JEDNOCZEŚNIE' + assert unescape_html('V&SCARON;ICHNI') == 'VŠICHNI' + assert unescape_html('￿') == '' + assert unescape_html('�') == '\ufffd' assert ( - fix_text_segment("this is just informal english ¬ html") - == "this is just informal english ¬ html" + fix_text_segment('this is just informal english ¬ html') == + 'this is just informal english ¬ html' ) - - -def test_old_parameter_name(): - example = "&\n\n&" - with pytest.deprecated_call(): - assert fix_text(example, fix_entities=True) == "&\n\n&" - with pytest.deprecated_call(): - assert fix_text(example, fix_entities=False) == "&\n\n&" diff --git a/tests/test_examples_in_json.py b/tests/test_examples_in_json.py index 2be9eb4e..7ccde363 100644 --- a/tests/test_examples_in_json.py +++ b/tests/test_examples_in_json.py @@ -23,41 +23,30 @@ If missing, it will be considered to be the same as "fixed". - "comment": possibly-enlightening commentary on the test case. """ - +from ftfy import fix_text, fix_and_explain, fix_encoding_and_explain, apply_plan import json -from pathlib import Path - +import os import pytest -from ftfy import apply_plan, fix_and_explain, fix_encoding_and_explain, fix_text - -THIS_DIR = Path(__file__).parent -TEST_CASE_DIR = THIS_DIR / "test-cases" - -def load_test_data() -> list[dict]: - test_data = [] - for filepath in TEST_CASE_DIR.glob("*.json"): - test_data.extend(json.load(filepath.open())) - return test_data +THIS_DIR = os.path.dirname(__file__) +TEST_FILENAME = os.path.join(THIS_DIR, 'test_cases.json') +TEST_DATA = json.load(open(TEST_FILENAME, encoding='utf-8')) - -TEST_DATA = load_test_data() - -TESTS_THAT_PASS = [test for test in TEST_DATA if test["expect"] == "pass"] -TESTS_THAT_FAIL = [test for test in TEST_DATA if test["expect"] == "fail"] +TESTS_THAT_PASS = [test for test in TEST_DATA if test['expect'] == 'pass'] +TESTS_THAT_FAIL = [test for test in TEST_DATA if test['expect'] == 'fail'] @pytest.mark.parametrize("test_case", TEST_DATA) def test_well_formed_example(test_case): - assert test_case["expect"] in ("pass", "fail") + assert test_case['expect'] in ('pass', 'fail') @pytest.mark.parametrize("test_case", TESTS_THAT_PASS) def test_json_example(test_case): # Run one example from the data file - orig = test_case["original"] - fixed = test_case["fixed"] + orig = test_case['original'] + fixed = test_case['fixed'] # Make sure that we can fix the text as intended assert fix_text(orig) == fixed @@ -66,35 +55,32 @@ def test_json_example(test_case): # run to reproduce its result fixed_output, plan = fix_and_explain(orig) assert apply_plan(orig, plan) == fixed_output - + # Do the same for fix_encoding_and_explain encoding_fix, plan = fix_encoding_and_explain(orig) assert apply_plan(orig, plan) == encoding_fix # Ask for the encoding fix a different way, by disabling all the other steps # in the config object - assert ( - fix_text( - orig, - unescape_html=False, - remove_terminal_escapes=False, - fix_character_width=False, - fix_latin_ligatures=False, - uncurl_quotes=False, - fix_line_breaks=False, - fix_surrogates=False, - remove_control_chars=False, - normalization=None, - ) - == encoding_fix - ) + assert fix_text( + orig, + unescape_html=False, + remove_terminal_escapes=False, + fix_character_width=False, + fix_latin_ligatures=False, + uncurl_quotes=False, + fix_line_breaks=False, + fix_surrogates=False, + remove_control_chars=False, + normalization=None + ) == encoding_fix # Make sure we can decode the text as intended assert fix_text(orig) == fixed - assert encoding_fix == test_case.get("fixed-encoding", fixed) + assert encoding_fix == test_case.get('fixed-encoding', fixed) # Make sure we can decode as intended even with an extra layer of badness - extra_bad = orig.encode("utf-8").decode("latin-1") + extra_bad = orig.encode('utf-8').decode('latin-1') assert fix_text(extra_bad) == fixed @@ -103,8 +89,9 @@ def test_json_example(test_case): def test_failing_json_example(test_case): # Run an example from the data file that we believe will fail, due to # ftfy's heuristic being insufficient - orig = test_case["original"] - fixed = test_case["fixed"] + orig = test_case['original'] + fixed = test_case['fixed'] encoding_fix, plan = fix_encoding_and_explain(orig) - assert encoding_fix == test_case.get("fixed-encoding", fixed) + assert encoding_fix == test_case.get('fixed-encoding', fixed) + diff --git a/tox.ini b/tox.ini index ec356b7c..8945ab47 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,6 @@ [tox] -envlist = py39, py310, py311, py312, py313 +envlist = py35, py36, py37, py38, py39 [testenv] -deps = - pytest - wcwidth +deps = pytest commands = pytest diff --git a/uv.lock b/uv.lock deleted file mode 100644 index 438359fb..00000000 --- a/uv.lock +++ /dev/null @@ -1,546 +0,0 @@ -version = 1 -requires-python = ">=3.9" - -[[package]] -name = "alabaster" -version = "0.7.16" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c9/3e/13dd8e5ed9094e734ac430b5d0eb4f2bb001708a8b7856cbf8e084e001ba/alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", size = 23776 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/34/d4e1c02d3bee589efb5dfa17f88ea08bdb3e3eac12bc475462aec52ed223/alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92", size = 13511 }, -] - -[[package]] -name = "babel" -version = "2.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/2a/74/f1bc80f23eeba13393b7222b11d95ca3af2c1e28edca18af487137eefed9/babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316", size = 9348104 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ed/20/bc79bc575ba2e2a7f70e8a1155618bb1301eaa5132a8271373a6903f73f8/babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b", size = 9587599 }, -] - -[[package]] -name = "beautifulsoup4" -version = "4.12.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "soupsieve" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/ca/824b1195773ce6166d388573fc106ce56d4a805bd7427b624e063596ec58/beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051", size = 581181 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b1/fe/e8c672695b37eecc5cbf43e1d0638d88d66ba3a44c4d321c796f4e59167f/beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed", size = 147925 }, -] - -[[package]] -name = "certifi" -version = "2024.8.30" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b0/ee/9b19140fe824b367c04c5e1b369942dd754c4c5462d5674002f75c4dedc1/certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9", size = 168507 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/12/90/3c9ff0512038035f59d279fddeb79f5f1eccd8859f06d6163c58798b9487/certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8", size = 167321 }, -] - -[[package]] -name = "charset-normalizer" -version = "3.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f2/4f/e1808dc01273379acc506d18f1504eb2d299bd4131743b9fc54d7be4df1e/charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e", size = 106620 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/69/8b/825cc84cf13a28bfbcba7c416ec22bf85a9584971be15b21dd8300c65b7f/charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6", size = 196363 }, - { url = "https://files.pythonhosted.org/packages/23/81/d7eef6a99e42c77f444fdd7bc894b0ceca6c3a95c51239e74a722039521c/charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b", size = 125639 }, - { url = "https://files.pythonhosted.org/packages/21/67/b4564d81f48042f520c948abac7079356e94b30cb8ffb22e747532cf469d/charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99", size = 120451 }, - { url = "https://files.pythonhosted.org/packages/c2/72/12a7f0943dd71fb5b4e7b55c41327ac0a1663046a868ee4d0d8e9c369b85/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca", size = 140041 }, - { url = "https://files.pythonhosted.org/packages/67/56/fa28c2c3e31217c4c52158537a2cf5d98a6c1e89d31faf476c89391cd16b/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d", size = 150333 }, - { url = "https://files.pythonhosted.org/packages/f9/d2/466a9be1f32d89eb1554cf84073a5ed9262047acee1ab39cbaefc19635d2/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7", size = 142921 }, - { url = "https://files.pythonhosted.org/packages/f8/01/344ec40cf5d85c1da3c1f57566c59e0c9b56bcc5566c08804a95a6cc8257/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3", size = 144785 }, - { url = "https://files.pythonhosted.org/packages/73/8b/2102692cb6d7e9f03b9a33a710e0164cadfce312872e3efc7cfe22ed26b4/charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907", size = 146631 }, - { url = "https://files.pythonhosted.org/packages/d8/96/cc2c1b5d994119ce9f088a9a0c3ebd489d360a2eb058e2c8049f27092847/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b", size = 140867 }, - { url = "https://files.pythonhosted.org/packages/c9/27/cde291783715b8ec30a61c810d0120411844bc4c23b50189b81188b273db/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912", size = 149273 }, - { url = "https://files.pythonhosted.org/packages/3a/a4/8633b0fc1a2d1834d5393dafecce4a1cc56727bfd82b4dc18fc92f0d3cc3/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95", size = 152437 }, - { url = "https://files.pythonhosted.org/packages/64/ea/69af161062166b5975ccbb0961fd2384853190c70786f288684490913bf5/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e", size = 150087 }, - { url = "https://files.pythonhosted.org/packages/3b/fd/e60a9d9fd967f4ad5a92810138192f825d77b4fa2a557990fd575a47695b/charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe", size = 145142 }, - { url = "https://files.pythonhosted.org/packages/6d/02/8cb0988a1e49ac9ce2eed1e07b77ff118f2923e9ebd0ede41ba85f2dcb04/charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc", size = 94701 }, - { url = "https://files.pythonhosted.org/packages/d6/20/f1d4670a8a723c46be695dff449d86d6092916f9e99c53051954ee33a1bc/charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749", size = 102191 }, - { url = "https://files.pythonhosted.org/packages/9c/61/73589dcc7a719582bf56aae309b6103d2762b526bffe189d635a7fcfd998/charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c", size = 193339 }, - { url = "https://files.pythonhosted.org/packages/77/d5/8c982d58144de49f59571f940e329ad6e8615e1e82ef84584c5eeb5e1d72/charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944", size = 124366 }, - { url = "https://files.pythonhosted.org/packages/bf/19/411a64f01ee971bed3231111b69eb56f9331a769072de479eae7de52296d/charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee", size = 118874 }, - { url = "https://files.pythonhosted.org/packages/4c/92/97509850f0d00e9f14a46bc751daabd0ad7765cff29cdfb66c68b6dad57f/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c", size = 138243 }, - { url = "https://files.pythonhosted.org/packages/e2/29/d227805bff72ed6d6cb1ce08eec707f7cfbd9868044893617eb331f16295/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6", size = 148676 }, - { url = "https://files.pythonhosted.org/packages/13/bc/87c2c9f2c144bedfa62f894c3007cd4530ba4b5351acb10dc786428a50f0/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea", size = 141289 }, - { url = "https://files.pythonhosted.org/packages/eb/5b/6f10bad0f6461fa272bfbbdf5d0023b5fb9bc6217c92bf068fa5a99820f5/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc", size = 142585 }, - { url = "https://files.pythonhosted.org/packages/3b/a0/a68980ab8a1f45a36d9745d35049c1af57d27255eff8c907e3add84cf68f/charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5", size = 144408 }, - { url = "https://files.pythonhosted.org/packages/d7/a1/493919799446464ed0299c8eef3c3fad0daf1c3cd48bff9263c731b0d9e2/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594", size = 139076 }, - { url = "https://files.pythonhosted.org/packages/fb/9d/9c13753a5a6e0db4a0a6edb1cef7aee39859177b64e1a1e748a6e3ba62c2/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c", size = 146874 }, - { url = "https://files.pythonhosted.org/packages/75/d2/0ab54463d3410709c09266dfb416d032a08f97fd7d60e94b8c6ef54ae14b/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365", size = 150871 }, - { url = "https://files.pythonhosted.org/packages/8d/c9/27e41d481557be53d51e60750b85aa40eaf52b841946b3cdeff363105737/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129", size = 148546 }, - { url = "https://files.pythonhosted.org/packages/ee/44/4f62042ca8cdc0cabf87c0fc00ae27cd8b53ab68be3605ba6d071f742ad3/charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236", size = 143048 }, - { url = "https://files.pythonhosted.org/packages/01/f8/38842422988b795220eb8038745d27a675ce066e2ada79516c118f291f07/charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99", size = 94389 }, - { url = "https://files.pythonhosted.org/packages/0b/6e/b13bd47fa9023b3699e94abf565b5a2f0b0be6e9ddac9812182596ee62e4/charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27", size = 101752 }, - { url = "https://files.pythonhosted.org/packages/d3/0b/4b7a70987abf9b8196845806198975b6aab4ce016632f817ad758a5aa056/charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6", size = 194445 }, - { url = "https://files.pythonhosted.org/packages/50/89/354cc56cf4dd2449715bc9a0f54f3aef3dc700d2d62d1fa5bbea53b13426/charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf", size = 125275 }, - { url = "https://files.pythonhosted.org/packages/fa/44/b730e2a2580110ced837ac083d8ad222343c96bb6b66e9e4e706e4d0b6df/charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db", size = 119020 }, - { url = "https://files.pythonhosted.org/packages/9d/e4/9263b8240ed9472a2ae7ddc3e516e71ef46617fe40eaa51221ccd4ad9a27/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1", size = 139128 }, - { url = "https://files.pythonhosted.org/packages/6b/e3/9f73e779315a54334240353eaea75854a9a690f3f580e4bd85d977cb2204/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03", size = 149277 }, - { url = "https://files.pythonhosted.org/packages/1a/cf/f1f50c2f295312edb8a548d3fa56a5c923b146cd3f24114d5adb7e7be558/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284", size = 142174 }, - { url = "https://files.pythonhosted.org/packages/16/92/92a76dc2ff3a12e69ba94e7e05168d37d0345fa08c87e1fe24d0c2a42223/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15", size = 143838 }, - { url = "https://files.pythonhosted.org/packages/a4/01/2117ff2b1dfc61695daf2babe4a874bca328489afa85952440b59819e9d7/charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8", size = 146149 }, - { url = "https://files.pythonhosted.org/packages/f6/9b/93a332b8d25b347f6839ca0a61b7f0287b0930216994e8bf67a75d050255/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2", size = 140043 }, - { url = "https://files.pythonhosted.org/packages/ab/f6/7ac4a01adcdecbc7a7587767c776d53d369b8b971382b91211489535acf0/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719", size = 148229 }, - { url = "https://files.pythonhosted.org/packages/9d/be/5708ad18161dee7dc6a0f7e6cf3a88ea6279c3e8484844c0590e50e803ef/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631", size = 151556 }, - { url = "https://files.pythonhosted.org/packages/5a/bb/3d8bc22bacb9eb89785e83e6723f9888265f3a0de3b9ce724d66bd49884e/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b", size = 149772 }, - { url = "https://files.pythonhosted.org/packages/f7/fa/d3fc622de05a86f30beea5fc4e9ac46aead4731e73fd9055496732bcc0a4/charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565", size = 144800 }, - { url = "https://files.pythonhosted.org/packages/9a/65/bdb9bc496d7d190d725e96816e20e2ae3a6fa42a5cac99c3c3d6ff884118/charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7", size = 94836 }, - { url = "https://files.pythonhosted.org/packages/3e/67/7b72b69d25b89c0b3cea583ee372c43aa24df15f0e0f8d3982c57804984b/charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9", size = 102187 }, - { url = "https://files.pythonhosted.org/packages/f3/89/68a4c86f1a0002810a27f12e9a7b22feb198c59b2f05231349fbce5c06f4/charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114", size = 194617 }, - { url = "https://files.pythonhosted.org/packages/4f/cd/8947fe425e2ab0aa57aceb7807af13a0e4162cd21eee42ef5b053447edf5/charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed", size = 125310 }, - { url = "https://files.pythonhosted.org/packages/5b/f0/b5263e8668a4ee9becc2b451ed909e9c27058337fda5b8c49588183c267a/charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250", size = 119126 }, - { url = "https://files.pythonhosted.org/packages/ff/6e/e445afe4f7fda27a533f3234b627b3e515a1b9429bc981c9a5e2aa5d97b6/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920", size = 139342 }, - { url = "https://files.pythonhosted.org/packages/a1/b2/4af9993b532d93270538ad4926c8e37dc29f2111c36f9c629840c57cd9b3/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64", size = 149383 }, - { url = "https://files.pythonhosted.org/packages/fb/6f/4e78c3b97686b871db9be6f31d64e9264e889f8c9d7ab33c771f847f79b7/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23", size = 142214 }, - { url = "https://files.pythonhosted.org/packages/2b/c9/1c8fe3ce05d30c87eff498592c89015b19fade13df42850aafae09e94f35/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc", size = 144104 }, - { url = "https://files.pythonhosted.org/packages/ee/68/efad5dcb306bf37db7db338338e7bb8ebd8cf38ee5bbd5ceaaaa46f257e6/charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d", size = 146255 }, - { url = "https://files.pythonhosted.org/packages/0c/75/1ed813c3ffd200b1f3e71121c95da3f79e6d2a96120163443b3ad1057505/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88", size = 140251 }, - { url = "https://files.pythonhosted.org/packages/7d/0d/6f32255c1979653b448d3c709583557a4d24ff97ac4f3a5be156b2e6a210/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90", size = 148474 }, - { url = "https://files.pythonhosted.org/packages/ac/a0/c1b5298de4670d997101fef95b97ac440e8c8d8b4efa5a4d1ef44af82f0d/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b", size = 151849 }, - { url = "https://files.pythonhosted.org/packages/04/4f/b3961ba0c664989ba63e30595a3ed0875d6790ff26671e2aae2fdc28a399/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d", size = 149781 }, - { url = "https://files.pythonhosted.org/packages/d8/90/6af4cd042066a4adad58ae25648a12c09c879efa4849c705719ba1b23d8c/charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482", size = 144970 }, - { url = "https://files.pythonhosted.org/packages/cc/67/e5e7e0cbfefc4ca79025238b43cdf8a2037854195b37d6417f3d0895c4c2/charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67", size = 94973 }, - { url = "https://files.pythonhosted.org/packages/65/97/fc9bbc54ee13d33dc54a7fcf17b26368b18505500fc01e228c27b5222d80/charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b", size = 102308 }, - { url = "https://files.pythonhosted.org/packages/54/2f/28659eee7f5d003e0f5a3b572765bf76d6e0fe6601ab1f1b1dd4cba7e4f1/charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa", size = 196326 }, - { url = "https://files.pythonhosted.org/packages/d1/18/92869d5c0057baa973a3ee2af71573be7b084b3c3d428fe6463ce71167f8/charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a", size = 125614 }, - { url = "https://files.pythonhosted.org/packages/d6/27/327904c5a54a7796bb9f36810ec4173d2df5d88b401d2b95ef53111d214e/charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0", size = 120450 }, - { url = "https://files.pythonhosted.org/packages/a4/23/65af317914a0308495133b2d654cf67b11bbd6ca16637c4e8a38f80a5a69/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a", size = 140135 }, - { url = "https://files.pythonhosted.org/packages/f2/41/6190102ad521a8aa888519bb014a74251ac4586cde9b38e790901684f9ab/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242", size = 150413 }, - { url = "https://files.pythonhosted.org/packages/7b/ab/f47b0159a69eab9bd915591106859f49670c75f9a19082505ff16f50efc0/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b", size = 142992 }, - { url = "https://files.pythonhosted.org/packages/28/89/60f51ad71f63aaaa7e51a2a2ad37919985a341a1d267070f212cdf6c2d22/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62", size = 144871 }, - { url = "https://files.pythonhosted.org/packages/0c/48/0050550275fea585a6e24460b42465020b53375017d8596c96be57bfabca/charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0", size = 146756 }, - { url = "https://files.pythonhosted.org/packages/dc/b5/47f8ee91455946f745e6c9ddbb0f8f50314d2416dd922b213e7d5551ad09/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd", size = 141034 }, - { url = "https://files.pythonhosted.org/packages/84/79/5c731059ebab43e80bf61fa51666b9b18167974b82004f18c76378ed31a3/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be", size = 149434 }, - { url = "https://files.pythonhosted.org/packages/ca/f3/0719cd09fc4dc42066f239cb3c48ced17fc3316afca3e2a30a4756fe49ab/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d", size = 152443 }, - { url = "https://files.pythonhosted.org/packages/f7/0e/c6357297f1157c8e8227ff337e93fd0a90e498e3d6ab96b2782204ecae48/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3", size = 150294 }, - { url = "https://files.pythonhosted.org/packages/54/9a/acfa96dc4ea8c928040b15822b59d0863d6e1757fba8bd7de3dc4f761c13/charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742", size = 145314 }, - { url = "https://files.pythonhosted.org/packages/73/1c/b10a63032eaebb8d7bcb8544f12f063f41f5f463778ac61da15d9985e8b6/charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2", size = 94724 }, - { url = "https://files.pythonhosted.org/packages/c5/77/3a78bf28bfaa0863f9cfef278dbeadf55efe064eafff8c7c424ae3c4c1bf/charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca", size = 102159 }, - { url = "https://files.pythonhosted.org/packages/bf/9b/08c0432272d77b04803958a4598a51e2a4b51c06640af8b8f0f908c18bf2/charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079", size = 49446 }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, -] - -[[package]] -name = "docutils" -version = "0.21.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, -] - -[[package]] -name = "exceptiongroup" -version = "1.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, -] - -[[package]] -name = "ftfy" -version = "6.3.1" -source = { editable = "." } -dependencies = [ - { name = "wcwidth" }, -] - -[package.dev-dependencies] -dev = [ - { name = "furo" }, - { name = "pytest" }, - { name = "ruff" }, - { name = "sphinx" }, -] - -[package.metadata] -requires-dist = [{ name = "wcwidth" }] - -[package.metadata.requires-dev] -dev = [ - { name = "furo", specifier = ">=2024.7.18" }, - { name = "pytest", specifier = ">=8.3.2,<9" }, - { name = "ruff" }, - { name = "sphinx", specifier = ">=7,<8" }, -] - -[[package]] -name = "furo" -version = "2024.8.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "beautifulsoup4" }, - { name = "pygments" }, - { name = "sphinx" }, - { name = "sphinx-basic-ng" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a0/e2/d351d69a9a9e4badb4a5be062c2d0e87bd9e6c23b5e57337fef14bef34c8/furo-2024.8.6.tar.gz", hash = "sha256:b63e4cee8abfc3136d3bc03a3d45a76a850bada4d6374d24c1716b0e01394a01", size = 1661506 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/27/48/e791a7ed487dbb9729ef32bb5d1af16693d8925f4366befef54119b2e576/furo-2024.8.6-py3-none-any.whl", hash = "sha256:6cd97c58b47813d3619e63e9081169880fbe331f0ca883c871ff1f3f11814f5c", size = 341333 }, -] - -[[package]] -name = "idna" -version = "3.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, -] - -[[package]] -name = "imagesize" -version = "1.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769 }, -] - -[[package]] -name = "importlib-metadata" -version = "8.5.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 }, -] - -[[package]] -name = "iniconfig" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, -] - -[[package]] -name = "jinja2" -version = "3.1.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markupsafe" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/55/39036716d19cab0747a5020fc7e907f362fbf48c984b14e62127f7e68e5d/jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369", size = 240245 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/31/80/3a54838c3fb461f6fec263ebf3a3a41771bd05190238de3486aae8540c36/jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d", size = 133271 }, -] - -[[package]] -name = "markupsafe" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b4/d2/38ff920762f2247c3af5cbbbbc40756f575d9692d381d7c520f45deb9b8f/markupsafe-3.0.1.tar.gz", hash = "sha256:3e683ee4f5d0fa2dde4db77ed8dd8a876686e3fc417655c2ece9a90576905344", size = 20249 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/43/a2/0482d1a157f5f10f72fc4fe8c3be9ffa3651c1f7a12b60a3ab71b2635e13/MarkupSafe-3.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:db842712984e91707437461930e6011e60b39136c7331e971952bb30465bc1a1", size = 14391 }, - { url = "https://files.pythonhosted.org/packages/3b/25/5ea6500d200fd2dc3ea25c765f69dea0a1a8d42ec80a38cd896ad47cb85d/MarkupSafe-3.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3ffb4a8e7d46ed96ae48805746755fadd0909fea2306f93d5d8233ba23dda12a", size = 12414 }, - { url = "https://files.pythonhosted.org/packages/92/41/cf5397dd6bb18895d148aa402cafa71018f2ffc5f6e9d6e90d85b523c741/MarkupSafe-3.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67c519635a4f64e495c50e3107d9b4075aec33634272b5db1cde839e07367589", size = 21787 }, - { url = "https://files.pythonhosted.org/packages/2e/0d/5d91ef2b4f30afa87483a3a7c108c777d144b1c42d7113459296a8a2bfa0/MarkupSafe-3.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48488d999ed50ba8d38c581d67e496f955821dc183883550a6fbc7f1aefdc170", size = 20954 }, - { url = "https://files.pythonhosted.org/packages/f6/de/12a4110c2c7c7b502fe0e6f911367726dbb7a37e03e207495135d064bb48/MarkupSafe-3.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f31ae06f1328595d762c9a2bf29dafd8621c7d3adc130cbb46278079758779ca", size = 21086 }, - { url = "https://files.pythonhosted.org/packages/96/55/59389babc6e8ed206849a9958de9da7c23f3a75d294f46e99624fa38fb79/MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80fcbf3add8790caddfab6764bde258b5d09aefbe9169c183f88a7410f0f6dea", size = 21685 }, - { url = "https://files.pythonhosted.org/packages/3d/cb/cbad5f093e12cd79ceea3e2957ba5bd4c2706810f333d0a3422ab2aef358/MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3341c043c37d78cc5ae6e3e305e988532b072329639007fd408a476642a89fd6", size = 21348 }, - { url = "https://files.pythonhosted.org/packages/8e/70/e19c4f39d68a52406012ee118667b57efb0bbe6e950be21187cd7a1b4b80/MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cb53e2a99df28eee3b5f4fea166020d3ef9116fdc5764bc5117486e6d1211b25", size = 21098 }, - { url = "https://files.pythonhosted.org/packages/30/95/ca809c01624428d427e9b3a4500f9068eca941e0c520328954ce84ad966a/MarkupSafe-3.0.1-cp310-cp310-win32.whl", hash = "sha256:db15ce28e1e127a0013dfb8ac243a8e392db8c61eae113337536edb28bdc1f97", size = 15075 }, - { url = "https://files.pythonhosted.org/packages/23/41/decb99ab07793656821a86f827a394700ce28402ebb02dc6d003210d9859/MarkupSafe-3.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:4ffaaac913c3f7345579db4f33b0020db693f302ca5137f106060316761beea9", size = 15535 }, - { url = "https://files.pythonhosted.org/packages/ce/af/2f5d88a7fc7226bd34c6e15f6061246ad8cff979da9f19d11bdd0addd8e2/MarkupSafe-3.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:26627785a54a947f6d7336ce5963569b5d75614619e75193bdb4e06e21d447ad", size = 14387 }, - { url = "https://files.pythonhosted.org/packages/8d/43/fd588ef5d192308c5e05974bac659bf6ae29c202b7ea2c4194bcf01eacee/MarkupSafe-3.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b954093679d5750495725ea6f88409946d69cfb25ea7b4c846eef5044194f583", size = 12410 }, - { url = "https://files.pythonhosted.org/packages/58/26/78f161d602fb03804118905e5faacafc0ec592bbad71aaee62537529813a/MarkupSafe-3.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:973a371a55ce9ed333a3a0f8e0bcfae9e0d637711534bcb11e130af2ab9334e7", size = 24006 }, - { url = "https://files.pythonhosted.org/packages/ae/1d/7d5ec8bcfd9c2db235d720fa51d818b7e2abc45250ce5f53dd6cb60409ca/MarkupSafe-3.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:244dbe463d5fb6d7ce161301a03a6fe744dac9072328ba9fc82289238582697b", size = 23303 }, - { url = "https://files.pythonhosted.org/packages/26/ce/703ca3b03a709e3bd1fbffa407789e56b9fa664456538092617dd665fc1d/MarkupSafe-3.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d98e66a24497637dd31ccab090b34392dddb1f2f811c4b4cd80c230205c074a3", size = 23205 }, - { url = "https://files.pythonhosted.org/packages/88/60/40be0493decabc2344b12d3a709fd6ccdd15a5ebaee1e8d878315d107ad3/MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ad91738f14eb8da0ff82f2acd0098b6257621410dcbd4df20aaa5b4233d75a50", size = 23684 }, - { url = "https://files.pythonhosted.org/packages/6d/f8/8fd52a66e8f62a9add62b4a0b5a3ab4092027437f2ef027f812d94ae91cf/MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7044312a928a66a4c2a22644147bc61a199c1709712069a344a3fb5cfcf16915", size = 23472 }, - { url = "https://files.pythonhosted.org/packages/d4/0b/998b17b9e06ea45ad1646fea586f1b83d02dfdb14d47dd2fd81fba5a08c9/MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a4792d3b3a6dfafefdf8e937f14906a51bd27025a36f4b188728a73382231d91", size = 23388 }, - { url = "https://files.pythonhosted.org/packages/5a/57/b6b7aa23b2e26d68d601718f8ce3161fbdaf967b31752c7dec52bef828c9/MarkupSafe-3.0.1-cp311-cp311-win32.whl", hash = "sha256:fa7d686ed9883f3d664d39d5a8e74d3c5f63e603c2e3ff0abcba23eac6542635", size = 15106 }, - { url = "https://files.pythonhosted.org/packages/fc/b5/20cb1d714596acb553c810009c8004c809823947da63e13c19a7decfcb6c/MarkupSafe-3.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:9ba25a71ebf05b9bb0e2ae99f8bc08a07ee8e98c612175087112656ca0f5c8bf", size = 15542 }, - { url = "https://files.pythonhosted.org/packages/45/6d/72ed58d42a12bd9fc288dbff6dd8d03ea973a232ac0538d7f88d105b5251/MarkupSafe-3.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8ae369e84466aa70f3154ee23c1451fda10a8ee1b63923ce76667e3077f2b0c4", size = 14322 }, - { url = "https://files.pythonhosted.org/packages/86/f5/241238f89cdd6461ac9f521af8389f9a48fab97e4f315c69e9e0d52bc919/MarkupSafe-3.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40f1e10d51c92859765522cbd79c5c8989f40f0419614bcdc5015e7b6bf97fc5", size = 12380 }, - { url = "https://files.pythonhosted.org/packages/27/94/79751928bca5841416d8ca02e22198672e021d5c7120338e2a6e3771f8fc/MarkupSafe-3.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a4cb365cb49b750bdb60b846b0c0bc49ed62e59a76635095a179d440540c346", size = 24099 }, - { url = "https://files.pythonhosted.org/packages/10/6e/1b8070bbfc467429c7983cd5ffd4ec57e1d501763d974c7caaa0a9a79f4c/MarkupSafe-3.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee3941769bd2522fe39222206f6dd97ae83c442a94c90f2b7a25d847d40f4729", size = 23249 }, - { url = "https://files.pythonhosted.org/packages/66/50/9389ae6cdff78d7481a2a2641830b5eb1d1f62177550e73355a810a889c9/MarkupSafe-3.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62fada2c942702ef8952754abfc1a9f7658a4d5460fabe95ac7ec2cbe0d02abc", size = 23149 }, - { url = "https://files.pythonhosted.org/packages/16/02/5dddff5366fde47133186efb847fa88bddef85914bbe623e25cfeccb3517/MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4c2d64fdba74ad16138300815cfdc6ab2f4647e23ced81f59e940d7d4a1469d9", size = 23864 }, - { url = "https://files.pythonhosted.org/packages/f3/f1/700ee6655561cfda986e03f7afc309e3738918551afa7dedd99225586227/MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fb532dd9900381d2e8f48172ddc5a59db4c445a11b9fab40b3b786da40d3b56b", size = 23440 }, - { url = "https://files.pythonhosted.org/packages/fb/3e/d26623ac7f16709823b4c80e0b4a1c9196eeb46182a6c1d47b5e0c8434f4/MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0f84af7e813784feb4d5e4ff7db633aba6c8ca64a833f61d8e4eade234ef0c38", size = 23610 }, - { url = "https://files.pythonhosted.org/packages/51/04/1f8da0810c39cb9fcff96b6baed62272c97065e9cf11471965a161439e20/MarkupSafe-3.0.1-cp312-cp312-win32.whl", hash = "sha256:cbf445eb5628981a80f54087f9acdbf84f9b7d862756110d172993b9a5ae81aa", size = 15113 }, - { url = "https://files.pythonhosted.org/packages/eb/24/a36dc37365bdd358b1e583cc40475593e36ab02cb7da6b3d0b9c05b0da7a/MarkupSafe-3.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:a10860e00ded1dd0a65b83e717af28845bb7bd16d8ace40fe5531491de76b79f", size = 15611 }, - { url = "https://files.pythonhosted.org/packages/b1/60/4572a8aa1beccbc24b133aa0670781a5d2697f4fa3fecf0a87b46383174b/MarkupSafe-3.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e81c52638315ff4ac1b533d427f50bc0afc746deb949210bc85f05d4f15fd772", size = 14325 }, - { url = "https://files.pythonhosted.org/packages/38/42/849915b99a765ec104bfd07ee933de5fc9c58fa9570efa7db81717f495d8/MarkupSafe-3.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:312387403cd40699ab91d50735ea7a507b788091c416dd007eac54434aee51da", size = 12373 }, - { url = "https://files.pythonhosted.org/packages/ef/82/4caaebd963c6d60b28e4445f38841d24f8b49bc10594a09956c9d73bfc08/MarkupSafe-3.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ae99f31f47d849758a687102afdd05bd3d3ff7dbab0a8f1587981b58a76152a", size = 24059 }, - { url = "https://files.pythonhosted.org/packages/20/15/6b319be2f79fcfa3173f479d69f4e950b5c9b642db4f22cf73ae5ade745f/MarkupSafe-3.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c97ff7fedf56d86bae92fa0a646ce1a0ec7509a7578e1ed238731ba13aabcd1c", size = 23211 }, - { url = "https://files.pythonhosted.org/packages/9d/3f/8963bdf4962feb2154475acb7dc350f04217b5e0be7763a39b432291e229/MarkupSafe-3.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7420ceda262dbb4b8d839a4ec63d61c261e4e77677ed7c66c99f4e7cb5030dd", size = 23095 }, - { url = "https://files.pythonhosted.org/packages/af/93/f770bc70953d32de0c6ce4bcb76271512123a1ead91aaef625a020c5bfaf/MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45d42d132cff577c92bfba536aefcfea7e26efb975bd455db4e6602f5c9f45e7", size = 23901 }, - { url = "https://files.pythonhosted.org/packages/11/92/1e5a33aa0a1190161238628fb68eb1bc5e67b56a5c89f0636328704b463a/MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4c8817557d0de9349109acb38b9dd570b03cc5014e8aabf1cbddc6e81005becd", size = 23463 }, - { url = "https://files.pythonhosted.org/packages/0d/fe/657efdfe385d2a3a701f2c4fcc9577c63c438aeefdd642d0d956c4ecd225/MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6a54c43d3ec4cf2a39f4387ad044221c66a376e58c0d0e971d47c475ba79c6b5", size = 23569 }, - { url = "https://files.pythonhosted.org/packages/cf/24/587dea40304046ace60f846cedaebc0d33d967a3ce46c11395a10e7a78ba/MarkupSafe-3.0.1-cp313-cp313-win32.whl", hash = "sha256:c91b394f7601438ff79a4b93d16be92f216adb57d813a78be4446fe0f6bc2d8c", size = 15117 }, - { url = "https://files.pythonhosted.org/packages/32/8f/d8961d633f26a011b4fe054f3bfff52f673423b8c431553268741dfb089e/MarkupSafe-3.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:fe32482b37b4b00c7a52a07211b479653b7fe4f22b2e481b9a9b099d8a430f2f", size = 15613 }, - { url = "https://files.pythonhosted.org/packages/9e/93/d6367ffbcd0c5c371370767f768eaa32af60bc411245b8517e383c6a2b12/MarkupSafe-3.0.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:17b2aea42a7280db02ac644db1d634ad47dcc96faf38ab304fe26ba2680d359a", size = 14563 }, - { url = "https://files.pythonhosted.org/packages/4a/37/f813c3835747dec08fe19ac9b9eced01fdf93a4b3e626521675dc7f423a9/MarkupSafe-3.0.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:852dc840f6d7c985603e60b5deaae1d89c56cb038b577f6b5b8c808c97580f1d", size = 12505 }, - { url = "https://files.pythonhosted.org/packages/72/bf/800b4d1580298ca91ccd6c95915bbd147142dad1b8cf91d57b93b28670dd/MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0778de17cff1acaeccc3ff30cd99a3fd5c50fc58ad3d6c0e0c4c58092b859396", size = 25358 }, - { url = "https://files.pythonhosted.org/packages/fd/78/26e209abc8f0a379f031f0acc151231974e5b153d7eda5759d17d8f329f2/MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:800100d45176652ded796134277ecb13640c1a537cad3b8b53da45aa96330453", size = 23797 }, - { url = "https://files.pythonhosted.org/packages/09/e1/918496a9390891756efee818880e71c1bbaf587f4dc8ede3f3852357310a/MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d06b24c686a34c86c8c1fba923181eae6b10565e4d80bdd7bc1c8e2f11247aa4", size = 23743 }, - { url = "https://files.pythonhosted.org/packages/cd/c6/26f576cd58d6c2decd9045e4e3f3c5dbc01ea6cb710916e7bbb6ebd95b6b/MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:33d1c36b90e570ba7785dacd1faaf091203d9942bc036118fab8110a401eb1a8", size = 25076 }, - { url = "https://files.pythonhosted.org/packages/b5/fa/10b24fb3b0e15fe5389dc88ecc6226ede08297e0ba7130610efbe0cdfb27/MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:beeebf760a9c1f4c07ef6a53465e8cfa776ea6a2021eda0d0417ec41043fe984", size = 24037 }, - { url = "https://files.pythonhosted.org/packages/c8/81/4b3f5537d9f6cc4f5c80d6c4b78af9a5247fd37b5aba95807b2cbc336b9a/MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bbde71a705f8e9e4c3e9e33db69341d040c827c7afa6789b14c6e16776074f5a", size = 24015 }, - { url = "https://files.pythonhosted.org/packages/5f/07/8e8dcecd53216c5e01a51e84c32a2bce166690ed19c184774b38cd41921d/MarkupSafe-3.0.1-cp313-cp313t-win32.whl", hash = "sha256:82b5dba6eb1bcc29cc305a18a3c5365d2af06ee71b123216416f7e20d2a84e5b", size = 15213 }, - { url = "https://files.pythonhosted.org/packages/0d/87/4c364e0f109eea2402079abecbe33fef4f347b551a11423d1f4e187ea497/MarkupSafe-3.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:730d86af59e0e43ce277bb83970530dd223bf7f2a838e086b50affa6ec5f9295", size = 15741 }, - { url = "https://files.pythonhosted.org/packages/6f/4f/420741fb39fa3d40396fb1731a1ca78e6f9fbb225dcf15e5185b1fa954bc/MarkupSafe-3.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4935dd7883f1d50e2ffecca0aa33dc1946a94c8f3fdafb8df5c330e48f71b132", size = 14376 }, - { url = "https://files.pythonhosted.org/packages/91/71/0c4782b9ce7fb68b140b94e1eb9d2b6292990bda91dc3d3b5a34e8bd41f3/MarkupSafe-3.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e9393357f19954248b00bed7c56f29a25c930593a77630c719653d51e7669c2a", size = 12408 }, - { url = "https://files.pythonhosted.org/packages/3e/3c/cbf30bf7ac1da2e013e3d338e1582db85fc3b27bf9f8863137423ad4b0b6/MarkupSafe-3.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40621d60d0e58aa573b68ac5e2d6b20d44392878e0bfc159012a5787c4e35bc8", size = 21654 }, - { url = "https://files.pythonhosted.org/packages/0b/28/229e797b8727427845b79cbd58019f598e478f974730fa705fa23904b18e/MarkupSafe-3.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f94190df587738280d544971500b9cafc9b950d32efcb1fba9ac10d84e6aa4e6", size = 20817 }, - { url = "https://files.pythonhosted.org/packages/e8/b4/1121f3b2614de93cbb3deec7f44df283df44c2258ea9368bb1302b4a0b45/MarkupSafe-3.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6a387d61fe41cdf7ea95b38e9af11cfb1a63499af2759444b99185c4ab33f5b", size = 20956 }, - { url = "https://files.pythonhosted.org/packages/a8/8b/b4d57bafca01c8b1e1fbb037660869fa4f6725983c4105a02bd1242f0066/MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8ad4ad1429cd4f315f32ef263c1342166695fad76c100c5d979c45d5570ed58b", size = 21548 }, - { url = "https://files.pythonhosted.org/packages/83/87/04806f7096ba1d4f1b8c61f35c1d7c0b507c6a3cf7ed495393bf97eb5af7/MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e24bfe89c6ac4c31792793ad9f861b8f6dc4546ac6dc8f1c9083c7c4f2b335cd", size = 21222 }, - { url = "https://files.pythonhosted.org/packages/e9/96/1ecb2bb5ee7298e628cff95833beba7da6a774df7fe890a6d2f0ec460590/MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2a4b34a8d14649315c4bc26bbfa352663eb51d146e35eef231dd739d54a5430a", size = 20952 }, - { url = "https://files.pythonhosted.org/packages/fd/70/b937a12df7bbff14e1ca3385929f464c7af2ca72c8183c95dad26c3bf754/MarkupSafe-3.0.1-cp39-cp39-win32.whl", hash = "sha256:242d6860f1fd9191aef5fae22b51c5c19767f93fb9ead4d21924e0bcb17619d8", size = 15075 }, - { url = "https://files.pythonhosted.org/packages/e3/c4/262fac0328552da9a75a7786d7c0f43adaba4afb5f295979d33fa0f324c7/MarkupSafe-3.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:93e8248d650e7e9d49e8251f883eed60ecbc0e8ffd6349e18550925e31bd029b", size = 15527 }, -] - -[[package]] -name = "packaging" -version = "24.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/51/65/50db4dda066951078f0a96cf12f4b9ada6e4b811516bf0262c0f4f7064d4/packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002", size = 148788 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124", size = 53985 }, -] - -[[package]] -name = "pluggy" -version = "1.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, -] - -[[package]] -name = "pygments" -version = "2.18.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8e/62/8336eff65bcbc8e4cb5d05b55faf041285951b6e80f33e2bff2024788f31/pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199", size = 4891905 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", size = 1205513 }, -] - -[[package]] -name = "pytest" -version = "8.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "iniconfig" }, - { name = "packaging" }, - { name = "pluggy" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8b/6c/62bbd536103af674e227c41a8f3dcd022d591f6eed5facb5a0f31ee33bbc/pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181", size = 1442487 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/77/7440a06a8ead44c7757a64362dd22df5760f9b12dc5f11b6188cd2fc27a0/pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2", size = 342341 }, -] - -[[package]] -name = "requests" -version = "2.32.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "charset-normalizer" }, - { name = "idna" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, -] - -[[package]] -name = "ruff" -version = "0.6.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/26/0d/6148a48dab5662ca1d5a93b7c0d13c03abd3cc7e2f35db08410e47cef15d/ruff-0.6.9.tar.gz", hash = "sha256:b076ef717a8e5bc819514ee1d602bbdca5b4420ae13a9cf61a0c0a4f53a2baa2", size = 3095355 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/8f/f7a0a0ef1818662efb32ed6df16078c95da7a0a3248d64c2410c1e27799f/ruff-0.6.9-py3-none-linux_armv6l.whl", hash = "sha256:064df58d84ccc0ac0fcd63bc3090b251d90e2a372558c0f057c3f75ed73e1ccd", size = 10440526 }, - { url = "https://files.pythonhosted.org/packages/8b/69/b179a5faf936a9e2ab45bb412a668e4661eded964ccfa19d533f29463ef6/ruff-0.6.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:140d4b5c9f5fc7a7b074908a78ab8d384dd7f6510402267bc76c37195c02a7ec", size = 10034612 }, - { url = "https://files.pythonhosted.org/packages/c7/ef/fd1b4be979c579d191eeac37b5cfc0ec906de72c8bcd8595e2c81bb700c1/ruff-0.6.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:53fd8ca5e82bdee8da7f506d7b03a261f24cd43d090ea9db9a1dc59d9313914c", size = 9706197 }, - { url = "https://files.pythonhosted.org/packages/29/61/b376d775deb5851cb48d893c568b511a6d3625ef2c129ad5698b64fb523c/ruff-0.6.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645d7d8761f915e48a00d4ecc3686969761df69fb561dd914a773c1a8266e14e", size = 10751855 }, - { url = "https://files.pythonhosted.org/packages/13/d7/def9e5f446d75b9a9c19b24231a3a658c075d79163b08582e56fa5dcfa38/ruff-0.6.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eae02b700763e3847595b9d2891488989cac00214da7f845f4bcf2989007d577", size = 10200889 }, - { url = "https://files.pythonhosted.org/packages/6c/d6/7f34160818bcb6e84ce293a5966cba368d9112ff0289b273fbb689046047/ruff-0.6.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d5ccc9e58112441de8ad4b29dcb7a86dc25c5f770e3c06a9d57e0e5eba48829", size = 11038678 }, - { url = "https://files.pythonhosted.org/packages/13/34/a40ff8ae62fb1b26fb8e6fa7e64bc0e0a834b47317880de22edd6bfb54fb/ruff-0.6.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:417b81aa1c9b60b2f8edc463c58363075412866ae4e2b9ab0f690dc1e87ac1b5", size = 11808682 }, - { url = "https://files.pythonhosted.org/packages/2e/6d/25a4386ae4009fc798bd10ba48c942d1b0b3e459b5403028f1214b6dd161/ruff-0.6.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c866b631f5fbce896a74a6e4383407ba7507b815ccc52bcedabb6810fdb3ef7", size = 11330446 }, - { url = "https://files.pythonhosted.org/packages/f7/f6/bdf891a9200d692c94ebcd06ae5a2fa5894e522f2c66c2a12dd5d8cb2654/ruff-0.6.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b118afbb3202f5911486ad52da86d1d52305b59e7ef2031cea3425142b97d6f", size = 12483048 }, - { url = "https://files.pythonhosted.org/packages/a7/86/96f4252f41840e325b3fa6c48297e661abb9f564bd7dcc0572398c8daa42/ruff-0.6.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a67267654edc23c97335586774790cde402fb6bbdb3c2314f1fc087dee320bfa", size = 10936855 }, - { url = "https://files.pythonhosted.org/packages/45/87/801a52d26c8dbf73424238e9908b9ceac430d903c8ef35eab1b44fcfa2bd/ruff-0.6.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:3ef0cc774b00fec123f635ce5c547dac263f6ee9fb9cc83437c5904183b55ceb", size = 10713007 }, - { url = "https://files.pythonhosted.org/packages/be/27/6f7161d90320a389695e32b6ebdbfbedde28ccbf52451e4b723d7ce744ad/ruff-0.6.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:12edd2af0c60fa61ff31cefb90aef4288ac4d372b4962c2864aeea3a1a2460c0", size = 10274594 }, - { url = "https://files.pythonhosted.org/packages/00/52/dc311775e7b5f5b19831563cb1572ecce63e62681bccc609867711fae317/ruff-0.6.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:55bb01caeaf3a60b2b2bba07308a02fca6ab56233302406ed5245180a05c5625", size = 10608024 }, - { url = "https://files.pythonhosted.org/packages/98/b6/be0a1ddcbac65a30c985cf7224c4fce786ba2c51e7efeb5178fe410ed3cf/ruff-0.6.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:925d26471fa24b0ce5a6cdfab1bb526fb4159952385f386bdcc643813d472039", size = 10982085 }, - { url = "https://files.pythonhosted.org/packages/bb/a4/c84bc13d0b573cf7bb7d17b16d6d29f84267c92d79b2f478d4ce322e8e72/ruff-0.6.9-py3-none-win32.whl", hash = "sha256:eb61ec9bdb2506cffd492e05ac40e5bc6284873aceb605503d8494180d6fc84d", size = 8522088 }, - { url = "https://files.pythonhosted.org/packages/74/be/fc352bd8ca40daae8740b54c1c3e905a7efe470d420a268cd62150248c91/ruff-0.6.9-py3-none-win_amd64.whl", hash = "sha256:785d31851c1ae91f45b3d8fe23b8ae4b5170089021fbb42402d811135f0b7117", size = 9359275 }, - { url = "https://files.pythonhosted.org/packages/3e/14/fd026bc74ded05e2351681545a5f626e78ef831f8edce064d61acd2e6ec7/ruff-0.6.9-py3-none-win_arm64.whl", hash = "sha256:a9641e31476d601f83cd602608739a0840e348bda93fec9f1ee816f8b6798b93", size = 8679879 }, -] - -[[package]] -name = "snowballstemmer" -version = "2.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/44/7b/af302bebf22c749c56c9c3e8ae13190b5b5db37a33d9068652e8f73b7089/snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", size = 86699 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ed/dc/c02e01294f7265e63a7315fe086dd1df7dacb9f840a804da846b96d01b96/snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a", size = 93002 }, -] - -[[package]] -name = "soupsieve" -version = "2.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 }, -] - -[[package]] -name = "sphinx" -version = "7.4.7" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "alabaster" }, - { name = "babel" }, - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "docutils" }, - { name = "imagesize" }, - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, - { name = "jinja2" }, - { name = "packaging" }, - { name = "pygments" }, - { name = "requests" }, - { name = "snowballstemmer" }, - { name = "sphinxcontrib-applehelp" }, - { name = "sphinxcontrib-devhelp" }, - { name = "sphinxcontrib-htmlhelp" }, - { name = "sphinxcontrib-jsmath" }, - { name = "sphinxcontrib-qthelp" }, - { name = "sphinxcontrib-serializinghtml" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5b/be/50e50cb4f2eff47df05673d361095cafd95521d2a22521b920c67a372dcb/sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe", size = 8067911 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0d/ef/153f6803c5d5f8917dbb7f7fcf6d34a871ede3296fa89c2c703f5f8a6c8e/sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239", size = 3401624 }, -] - -[[package]] -name = "sphinx-basic-ng" -version = "1.0.0b2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/98/0b/a866924ded68efec7a1759587a4e478aec7559d8165fac8b2ad1c0e774d6/sphinx_basic_ng-1.0.0b2.tar.gz", hash = "sha256:9ec55a47c90c8c002b5960c57492ec3021f5193cb26cebc2dc4ea226848651c9", size = 20736 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/dd/018ce05c532a22007ac58d4f45232514cd9d6dd0ee1dc374e309db830983/sphinx_basic_ng-1.0.0b2-py3-none-any.whl", hash = "sha256:eb09aedbabfb650607e9b4b68c9d240b90b1e1be221d6ad71d61c52e29f7932b", size = 22496 }, -] - -[[package]] -name = "sphinxcontrib-applehelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300 }, -] - -[[package]] -name = "sphinxcontrib-devhelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530 }, -] - -[[package]] -name = "sphinxcontrib-htmlhelp" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 }, -] - -[[package]] -name = "sphinxcontrib-jsmath" -version = "1.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071 }, -] - -[[package]] -name = "sphinxcontrib-qthelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743 }, -] - -[[package]] -name = "sphinxcontrib-serializinghtml" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072 }, -] - -[[package]] -name = "tomli" -version = "2.0.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/35/b9/de2a5c0144d7d75a57ff355c0c24054f965b2dc3036456ae03a51ea6264b/tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed", size = 16096 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/db/ce8eda256fa131af12e0a76d481711abe4681b6923c27efb9a255c9e4594/tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38", size = 13237 }, -] - -[[package]] -name = "urllib3" -version = "2.2.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, -] - -[[package]] -name = "wcwidth" -version = "0.2.13" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, -] - -[[package]] -name = "zipp" -version = "3.20.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/54/bf/5c0000c44ebc80123ecbdddba1f5dcd94a5ada602a9c225d84b5aaa55e86/zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29", size = 24199 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/8b/5ba542fa83c90e09eac972fc9baca7a88e7e7ca4b221a89251954019308b/zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350", size = 9200 }, -]