From 24946ec4e3eb7757d88cdcb6afff03a149429e33 Mon Sep 17 00:00:00 2001 From: "Val Neekman (AvidCoder)" Date: Thu, 25 Jan 2024 11:24:52 -0500 Subject: [PATCH] pre normalize --- CHANGELOG.md | 5 +++-- slugify/__version__.py | 2 +- slugify/slugify.py | 9 ++++++--- test.py | 4 ++++ 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 395e538..015c5bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ ## Work in progress -- Added typing to API and expose `py.typed`. -- Formally support 3.12 +## 8.0.2 + +- Normalize text before converting to unicode. (chuckyblack - thx) ## 8.0.1 diff --git a/slugify/__version__.py b/slugify/__version__.py index a558d9b..dbbff9f 100644 --- a/slugify/__version__.py +++ b/slugify/__version__.py @@ -5,4 +5,4 @@ __url__ = 'https://github.com/un33k/python-slugify' __license__ = 'MIT' __copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.' -__version__ = '8.0.1' +__version__ = '8.0.2' diff --git a/slugify/slugify.py b/slugify/slugify.py index 21bdaeb..9242e3e 100644 --- a/slugify/slugify.py +++ b/slugify/slugify.py @@ -118,8 +118,11 @@ def slugify( # replace quotes with dashes - pre-process text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text) - # decode unicode - if not allow_unicode: + # normalize text, convert to unicode if required + if allow_unicode: + text = unicodedata.normalize('NFKC', text) + else: + text = unicodedata.normalize('NFKD', text) text = unidecode.unidecode(text) # ensure text is still in unicode @@ -144,7 +147,7 @@ def slugify( except Exception: pass - # translate + # re normalize text if allow_unicode: text = unicodedata.normalize('NFKC', text) else: diff --git a/test.py b/test.py index 931f38f..995affa 100644 --- a/test.py +++ b/test.py @@ -36,6 +36,10 @@ def test_phonetic_conversion_of_eastern_scripts(self): self.assertEqual(r, "ying-shi-ma") def test_accented_text(self): + txt = '𝐚́́𝕒́àéé' + r = slugify(txt) + self.assertEqual(r, "aaaee") + txt = 'C\'est déjà l\'été.' r = slugify(txt) self.assertEqual(r, "c-est-deja-l-ete")