Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from ._casefix import _EXTRA_CASES
from ._optimizer import (
_combine_flags, _compile_charset, _optimize_charset, _compile_info,
_simple, _CHARSET_ALL, _CODEBITS, MAXCODE,
_simple, _CHARSET_ALL, _CODEBITS, MAXCODE, optimize,
)

assert _sre.MAGIC == MAGIC, "SRE module mismatch"
Expand Down Expand Up @@ -219,6 +219,10 @@ def isstring(obj):
def _code(p, flags):

flags = p.state.flags | flags

# run the optimizer passes over the parsed pattern
optimize(p)

code = []

# compile info block
Expand Down
116 changes: 114 additions & 2 deletions Lib/re/_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,39 @@ def _compile_charset(charset, flags, code):
emit(FAILURE)

def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set
# internal: optimize character set.
#
# The engine's charset() walk toggles polarity on every NEGATE (see
# Modules/_sre/sre_lib.h), so NEGATE markers split the set into
# alternating-polarity segments: a leading NEGATE is a complemented class
# [^...], an interior one is set difference (RL1.3). Each segment is a
# plain union, optimized on its own with the NEGATE boundaries kept in place.
negates = [i for i, (op, _av) in enumerate(charset) if op is NEGATE]
if not negates or negates == [0]:
# Fast path: a plain union, optionally complemented as a whole -- every
# charset the parser produces today, optimized as before.
return _optimize_charset_segment(charset, iscased, fixup, fixes)

# Optimize each NEGATE-delimited run on its own. _allow_anyall is off: the
# [\s\S] -> ANY_ALL / [^\s\S] -> empty shortcuts rewrite a whole set and
# would inject or drop a NEGATE mid-segment.
out = []
hascased = False
start = 0
for i in negates + [len(charset)]:
if i > start: # skip an empty run (e.g. a leading NEGATE)
opt, cased = _optimize_charset_segment(
charset[start:i], iscased, fixup, fixes, _allow_anyall=False)
out.extend(opt)
hascased |= cased
if i < len(charset):
out.append((NEGATE, None))
start = i + 1
return out, hascased

def _optimize_charset_segment(charset, iscased=None, fixup=None, fixes=None,
_allow_anyall=True):
# internal: optimize one NEGATE-free union of character-set members
out = []
tail = []
charmap = bytearray(256)
Expand Down Expand Up @@ -94,7 +126,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
charmap[i] = 1
elif op is NEGATE:
out.append((op, av))
elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
elif op is CATEGORY and _allow_anyall and tail and (CATEGORY, CH_NEGATE[av]) in tail:
# Optimize [\s\S] etc.
out = [] if out else _CHARSET_ALL
return out, False
Expand Down Expand Up @@ -395,3 +427,83 @@ def _compile_info(code, pattern, flags):
elif charset:
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip

# Difference-fusion peephole: rewrite [A--B]-style A(?<![B]) into a single
# charset (see the engine's NEGATE polarity toggle).
def _subpatterns(op, av):
# Yield the nested SubPatterns of one item, to recurse into.
if op is BRANCH:
yield from av[1]
elif op in (ASSERT, ASSERT_NOT):
yield av[1]
elif op is SUBPATTERN:
yield av[3]
elif op is ATOMIC_GROUP:
yield av
elif op in (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT):
yield av[2]
elif op is GROUPREF_EXISTS:
yield av[1] # the "yes" branch is always present
if av[2] is not None: # the "no" branch is optional
yield av[2]

def _fuse_branch(av):
# Fold a BRANCH of one-character matchers into a single charset: their union
# is the concatenation of the item lists. charset() lets only the final
# polarity segment subtract, so at most one alternative may be
# complement-bearing (carry a NEGATE) and it must trail; two would cross
# (e.g. [a-z--b]||[a-z--c]) and are left as a BRANCH.
items = []
tail = None
for sp in av[1]:
cs = _parser._flat_items(sp.data, True)
if cs is None:
return None
if any(op is NEGATE for op, _av in cs):
if tail is not None:
return None
tail = cs
else:
items += cs
return items if tail is None else items + tail

def _fuse_difference(data):
# Replace <flat charset A> (?<![B1]) (?<![B2]) ... with the single charset
# [NEGATE] B1 B2 ... [NEGATE] A. Each negative lookbehind over a flat
# charset subtracts its set from the character A matches.
out = []
head = None # _flat_items(A) for the fused difference now at out[-1]
subtrahend = None # its accumulated B items, or None when not fusing
for op, av in data:
if op is ASSERT_NOT and av[0] < 0: # a negative lookbehind
b = _parser._flat_items(av[1].data)
if b is not None:
if subtrahend is None and out:
# the first lookbehind of a run: only now is it worth
# checking whether the preceding item A is a flat charset.
head = _parser._flat_items([out[-1]])
if head is not None:
subtrahend = []
if subtrahend is not None:
subtrahend += b
out[-1] = (IN, [(NEGATE, None)] + subtrahend
+ [(NEGATE, None)] + head)
continue
head = subtrahend = None
out.append((op, av))
data[:] = out

def _walk(seq):
for i, (op, av) in enumerate(seq):
for sub in _subpatterns(op, av):
_walk(sub.data)
if op is BRANCH:
items = _fuse_branch(av)
if items is not None:
seq[i] = (IN, items)
_fuse_difference(seq)

def optimize(pattern):
"""Rewrite a parsed pattern in place and return it."""
_walk(pattern.data)
return pattern
15 changes: 11 additions & 4 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,14 +516,19 @@ def _charset_node(items):
return items[0]
return (IN, items)

def _flat_items(elements):
# The items if `elements` is a single flat charset (no complement), else
# None -- the dual of _charset_node: a lone LITERAL or CATEGORY is an item.
def _flat_items(elements, complement=False):
# The items if `elements` is a single flat charset, else None -- the dual
# of _charset_node: a lone LITERAL or CATEGORY is an item. A complemented
# charset (a NEGATE-bearing IN) qualifies only when `complement` is true.
if len(elements) == 1:
op, av = elements[0]
if op in _SETITEMCODES:
return [elements[0]]
if op is IN and all(o is not NEGATE for o, _av in av):
if op is IN:
if not complement:
for o, _av in av:
if o is NEGATE:
return None
return av
return None

Expand Down Expand Up @@ -677,6 +682,8 @@ def _parse_charset(source, state, nested):
# [A--B] -> A (?<![B]) difference
# [A&&B] -> A (?<=[B]) intersection
# [A||B] -> [AB] or (?:A|B) union
# A flat-operand difference [A--B] is later fused back into a single charset
# by Lib/re/_optimizer.py (see that module).
# Operators chain left-to-right with no precedence. A leading '^' negates by
# De Morgan, pushing the negation into the operands (no lookahead needed):
# [^A--B] -> [^A] | B ; [^A&&B] -> [^A] | [^B] ; [^A||B] -> [^A] && [^B]
Expand Down
Loading