diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst index 64a986f2487d5c..9242a1ae3ea6ba 100644 --- a/Doc/whatsnew/3.16.rst +++ b/Doc/whatsnew/3.16.rst @@ -265,6 +265,15 @@ zipfile Optimizations ============= +re +-- + +* Character class escapes (``\d``, ``\D``, ``\s``, ``\S``, ``\w`` and ``\W``) + outside a character set are now compiled to a single ``CATEGORY`` opcode + instead of being wrapped in an ``IN`` block. This speeds up matching of + patterns such as ``\d+`` and reduces the size of the compiled byte code. + (Contributed by Serhiy Storchaka in :gh:`152033`.) + module_name ----------- diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index c2ca8e25abe34d..304f875ea7fe7a 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -20,7 +20,7 @@ _LITERAL_CODES = {LITERAL, NOT_LITERAL} _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} -_UNIT_CODES = _LITERAL_CODES | {ANY, IN} +_UNIT_CODES = _LITERAL_CODES | {ANY, IN, CATEGORY} _REPEATING_CODES = { MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), @@ -495,6 +495,8 @@ def _get_charset_prefix(pattern, flags): if iscased and iscased(av): return None return [(op, av)] + elif op is CATEGORY: + return [(op, av)] elif op is BRANCH: charset = [] charsetappend = charset.append diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index bd189fe0695f80..3c41c43409534b 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -27,6 +27,7 @@ _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) +_SETITEMCODES = frozenset({LITERAL, CATEGORY}) ESCAPES = { r"\a": (LITERAL, ord("\a")), @@ -43,12 +44,12 @@ r"\A": (AT, AT_BEGINNING_STRING), # start of string r"\b": (AT, AT_BOUNDARY), r"\B": (AT, AT_NON_BOUNDARY), - r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), - r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), - r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), - r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), - r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), - r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + r"\d": (CATEGORY, CATEGORY_DIGIT), + r"\D": (CATEGORY, CATEGORY_NOT_DIGIT), + r"\s": (CATEGORY, CATEGORY_SPACE), + r"\S": (CATEGORY, CATEGORY_NOT_SPACE), + r"\w": (CATEGORY, CATEGORY_WORD), + r"\W": (CATEGORY, CATEGORY_NOT_WORD), r"\z": (AT, AT_END_STRING), # end of string r"\Z": (AT, AT_END_STRING), # end of string (obsolete) } @@ -315,7 +316,7 @@ def _class_escape(source, escape): if code: return code code = CATEGORIES.get(escape) - if code and code[0] is IN: + if code and code[0] is CATEGORY: return code try: c = escape[1:2] @@ -493,7 +494,7 @@ def _parse_sub(source, state, verbose, nested): if len(item) != 1: break op, av = item[0] - if op is LITERAL: + if op in _SETITEMCODES: set.append((op, av)) elif op is IN and av[0][0] is not NEGATE: set.extend(av) @@ -590,8 +591,6 @@ def _parse(source, state, verbose, nested, first=False): raise source.error("unterminated character set", source.tell() - here) if that == "]": - if code1[0] is IN: - code1 = code1[1][0] setappend(code1) setappend((LITERAL, _ord("-"))) break @@ -616,8 +615,6 @@ def _parse(source, state, verbose, nested, first=False): raise source.error(msg, len(this) + 1 + len(that)) setappend((RANGE, (lo, hi))) else: - if code1[0] is IN: - code1 = code1[1][0] setappend(code1) set = _uniq(set) diff --git a/Misc/NEWS.d/next/Library/2026-06-23-22-15-00.gh-issue-152033.Ct1Egy.rst b/Misc/NEWS.d/next/Library/2026-06-23-22-15-00.gh-issue-152033.Ct1Egy.rst new file mode 100644 index 00000000000000..7c54a6c16c1a14 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-23-22-15-00.gh-issue-152033.Ct1Egy.rst @@ -0,0 +1,5 @@ +Optimize matching of character class escapes (``\d``, ``\D``, ``\s``, +``\S``, ``\w`` and ``\W``) that occur outside a character set: they are now +compiled to a single ``CATEGORY`` opcode instead of being wrapped in an +``IN`` block. This speeds up patterns such as ``\d+`` and reduces the size +of the compiled byte code. diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 32aa06bed4a409..9964532a7f401c 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -1842,6 +1842,34 @@ _sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template) } while (0) #define GET_SKIP GET_SKIP_ADJ(0) +static int +_validate_category(SRE_CODE arg) +{ + switch (arg) { + case SRE_CATEGORY_DIGIT: + case SRE_CATEGORY_NOT_DIGIT: + case SRE_CATEGORY_SPACE: + case SRE_CATEGORY_NOT_SPACE: + case SRE_CATEGORY_WORD: + case SRE_CATEGORY_NOT_WORD: + case SRE_CATEGORY_LINEBREAK: + case SRE_CATEGORY_NOT_LINEBREAK: + case SRE_CATEGORY_LOC_WORD: + case SRE_CATEGORY_LOC_NOT_WORD: + case SRE_CATEGORY_UNI_DIGIT: + case SRE_CATEGORY_UNI_NOT_DIGIT: + case SRE_CATEGORY_UNI_SPACE: + case SRE_CATEGORY_UNI_NOT_SPACE: + case SRE_CATEGORY_UNI_WORD: + case SRE_CATEGORY_UNI_NOT_WORD: + case SRE_CATEGORY_UNI_LINEBREAK: + case SRE_CATEGORY_UNI_NOT_LINEBREAK: + return 1; + default: + return 0; + } +} + static int _validate_charset(SRE_CODE *code, SRE_CODE *end) { @@ -1894,27 +1922,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end) case SRE_OP_CATEGORY: GET_ARG; - switch (arg) { - case SRE_CATEGORY_DIGIT: - case SRE_CATEGORY_NOT_DIGIT: - case SRE_CATEGORY_SPACE: - case SRE_CATEGORY_NOT_SPACE: - case SRE_CATEGORY_WORD: - case SRE_CATEGORY_NOT_WORD: - case SRE_CATEGORY_LINEBREAK: - case SRE_CATEGORY_NOT_LINEBREAK: - case SRE_CATEGORY_LOC_WORD: - case SRE_CATEGORY_LOC_NOT_WORD: - case SRE_CATEGORY_UNI_DIGIT: - case SRE_CATEGORY_UNI_NOT_DIGIT: - case SRE_CATEGORY_UNI_SPACE: - case SRE_CATEGORY_UNI_NOT_SPACE: - case SRE_CATEGORY_UNI_WORD: - case SRE_CATEGORY_UNI_NOT_WORD: - case SRE_CATEGORY_UNI_LINEBREAK: - case SRE_CATEGORY_UNI_NOT_LINEBREAK: - break; - default: + if (!_validate_category(arg)) { FAIL; } break; @@ -1995,6 +2003,13 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) } break; + case SRE_OP_CATEGORY: + GET_ARG; + if (!_validate_category(arg)) { + FAIL; + } + break; + case SRE_OP_ANY: case SRE_OP_ANY_ALL: /* These have no operands */ diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index df377905bfae0d..6e6ae46f05a50f 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -193,6 +193,7 @@ LOCAL(Py_ssize_t) SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount) { SRE_CODE chr; + SRE_CODE arg; SRE_CHAR c; const SRE_CHAR* ptr = (const SRE_CHAR *)state->ptr; const SRE_CHAR* end = (const SRE_CHAR *)state->end; @@ -302,6 +303,13 @@ SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount) ptr++; break; + case SRE_OP_CATEGORY: + arg = pattern[1]; + TRACE(("|%p|%p|COUNT CATEGORY %d\n", pattern, ptr, arg)); + while (ptr < end && sre_category(arg, *ptr)) + ptr++; + break; + default: /* repeated single character pattern */ TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));