import re from itertools import count from parso.utils import PythonVersionInfo from parso.utils import split_lines from parso.python.tokenize import Token from parso import parser from parso.tree import TypedLeaf, ErrorNode, ErrorLeaf version36 = PythonVersionInfo(3, 6) class TokenNamespace: _c = count() LBRACE = next(_c) RBRACE = next(_c) ENDMARKER = next(_c) COLON = next(_c) CONVERSION = next(_c) PYTHON_EXPR = next(_c) EXCLAMATION_MARK = next(_c) UNTERMINATED_STRING = next(_c) token_map = dict((v, k) for k, v in locals().items() if not k.startswith('_')) @classmethod def generate_token_id(cls, string): if string == '{': return cls.LBRACE elif string == '}': return cls.RBRACE elif string == '!': return cls.EXCLAMATION_MARK elif string == ':': return cls.COLON return getattr(cls, string) GRAMMAR = """ fstring: expression* ENDMARKER format_spec: ':' expression* expression: '{' PYTHON_EXPR [ '!' CONVERSION ] [ format_spec ] '}' """ _prefix = r'((?:[^{}]+)*)' _expr = _prefix + r'(\{|\}|$)' _in_expr = r'([^{}\[\]:"\'!]*)(.?)' # There's only one conversion character allowed. But the rules have to be # checked later anyway, so allow more here. This makes error recovery nicer. _conversion = r'([^={}:]*)(.?)' _compiled_expr = re.compile(_expr) _compiled_in_expr = re.compile(_in_expr) _compiled_conversion = re.compile(_conversion) def tokenize(code, start_pos=(1, 0)): def add_to_pos(string): lines = split_lines(string) l = len(lines[-1]) if len(lines) > 1: start_pos[0] += len(lines) - 1 start_pos[1] = l else: start_pos[1] += l def tok(value, type=None, prefix=''): if type is None: type = TokenNamespace.generate_token_id(value) add_to_pos(prefix) token = Token(type, value, tuple(start_pos), prefix) add_to_pos(value) return token start = 0 recursion_level = 0 added_prefix = '' start_pos = list(start_pos) while True: match = _compiled_expr.match(code, start) prefix = added_prefix + match.group(1) found = match.group(2) start = match.end() if not found: # We're at the end. break if found == '}': if recursion_level == 0 and len(code) > start and code[start] == '}': # This is a }} escape. added_prefix = prefix + '}}' start += 1 continue recursion_level = max(0, recursion_level - 1) yield tok(found, prefix=prefix) added_prefix = '' else: assert found == '{' if recursion_level == 0 and len(code) > start and code[start] == '{': # This is a {{ escape. added_prefix = prefix + '{{' start += 1 continue recursion_level += 1 yield tok(found, prefix=prefix) added_prefix = '' expression = '' squared_count = 0 curly_count = 0 while True: expr_match = _compiled_in_expr.match(code, start) expression += expr_match.group(1) found = expr_match.group(2) start = expr_match.end() if found == '{': curly_count += 1 expression += found elif found == '}' and curly_count > 0: curly_count -= 1 expression += found elif found == '[': squared_count += 1 expression += found elif found == ']': # Use a max function here, because the Python code might # just have syntax errors. squared_count = max(0, squared_count - 1) expression += found elif found == ':' and (squared_count or curly_count): expression += found elif found in ('"', "'"): search = found if len(code) > start + 1 and \ code[start] == found == code[start+1]: search *= 3 start += 2 index = code.find(search, start) if index == -1: yield tok(expression, type=TokenNamespace.PYTHON_EXPR) yield tok( found + code[start:], type=TokenNamespace.UNTERMINATED_STRING, ) start = len(code) break expression += found + code[start:index+1] start = index + 1 elif found == '!' and len(code) > start and code[start] == '=': # This is a python `!=` and not a conversion. expression += found else: yield tok(expression, type=TokenNamespace.PYTHON_EXPR) if found: yield tok(found) break if found == '!': conversion_match = _compiled_conversion.match(code, start) found = conversion_match.group(2) start = conversion_match.end() yield tok(conversion_match.group(1), type=TokenNamespace.CONVERSION) if found: yield tok(found) if found == '}': recursion_level -= 1 # We don't need to handle everything after ':', because that is # basically new tokens. yield tok('', type=TokenNamespace.ENDMARKER, prefix=prefix) class Parser(parser.BaseParser): def parse(self, tokens): node = super(Parser, self).parse(tokens) if isinstance(node, self.default_leaf): # Is an endmarker. # If there's no curly braces we get back a non-module. We always # want an fstring. node = self.default_node('fstring', [node]) return node def convert_leaf(self, pgen_grammar, type, value, prefix, start_pos): # TODO this is so ugly. leaf_type = TokenNamespace.token_map[type].lower() return TypedLeaf(leaf_type, value, start_pos, prefix) def error_recovery(self, pgen_grammar, stack, arcs, typ, value, start_pos, prefix, add_token_callback): if not self._error_recovery: return super(Parser, self).error_recovery( pgen_grammar, stack, arcs, typ, value, start_pos, prefix, add_token_callback ) token_type = TokenNamespace.token_map[typ].lower() if len(stack) == 1: error_leaf = ErrorLeaf(token_type, value, start_pos, prefix) stack[0][2][1].append(error_leaf) else: dfa, state, (type_, nodes) = stack[1] stack[0][2][1].append(ErrorNode(nodes)) stack[1:] = [] add_token_callback(typ, value, start_pos, prefix)