forked from Boris-code/feapder
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathselector.py
More file actions
155 lines (125 loc) · 5.53 KB
/
selector.py
File metadata and controls
155 lines (125 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# -*- coding: utf-8 -*-
"""
Created on 2018-10-08 15:33:37
---------
@summary: Redefines parsel's Selector and SelectorList (regex extraction with flags, custom root-node creation)
---------
@author: Boris
@email: boris_liu@foxmail.com
"""
import re
import six
from lxml import etree
from parsel import Selector as ParselSelector
from parsel import SelectorList as ParselSelectorList
from w3lib.html import replace_entities as w3lib_replace_entities
def extract_regex(regex, text, replace_entities=True, flags=0):
    """Run ``regex`` against ``text`` and return the extracted strings.

    Extraction policy:

    * if the regex has a named group called "extract", only the first match
      is looked up and the content of that group is returned as a one-item
      list (an empty list when nothing matches or the group is ``None``);
    * otherwise the result of ``findall`` is returned as is: whole matches
      when the regex has no group, group contents when it has one group, and
      one tuple per match when it has several groups (NOT flattened).

    :param regex: pattern string or already compiled pattern; ``flags`` is
        only used when a string has to be compiled here.
    :param text: the text to search in.
    :param replace_entities: when true, every result is passed through
        ``w3lib_replace_entities`` with ``keep=["lt", "amp"]``.
    :param flags: ``re`` flags applied when compiling a string ``regex``.
    :return: list of results; with ``replace_entities`` enabled, multi-group
        matches are returned as lists of strings instead of tuples.
    """
    pattern = (
        re.compile(regex, flags=flags)
        if isinstance(regex, six.string_types)
        else regex
    )

    if "extract" not in pattern.groupindex:
        # whole match or numbered groups
        found = pattern.findall(text)
    else:
        # named group: only the first match is considered
        match = pattern.search(text)
        captured = match.group("extract") if match is not None else None
        found = [] if captured is None else [captured]

    if not replace_entities:
        return found

    def _unescape(item):
        # w3lib_replace_entities takes a single string, so the groups of a
        # multi-group match are converted one by one.
        if isinstance(item, (list, tuple)):
            return [w3lib_replace_entities(part, keep=["lt", "amp"]) for part in item]
        return w3lib_replace_entities(item, keep=["lt", "amp"])

    return [_unescape(item) for item in found]
def create_root_node(text, parser_cls, base_url=None):
    """Build the root node of ``text`` with a new ``parser_cls`` instance.

    The text is stripped, NUL characters are removed and the result is
    encoded as UTF-8 before parsing. If nothing is left to parse, or the
    parser yields no root, an empty ``<html/>`` document is parsed instead.

    :param text: markup to parse (a ``str``).
    :param parser_cls: parser class, instantiated with ``recover=True``,
        ``encoding="utf8"`` and ``huge_tree=True``.
    :param base_url: forwarded to ``etree.fromstring``.
    :return: the root element of the parsed document.
    """
    cleaned = text.strip().replace("\x00", "")
    body = cleaned.encode("utf8")
    if not body:
        # nothing to parse: fall back to an empty document
        body = b"<html/>"

    parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
    root = etree.fromstring(body, parser=parser, base_url=base_url)
    if root is not None:
        return root
    # the parser returned no root for this input: use an empty document
    return etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
class SelectorList(ParselSelectorList):
    """
    Subclass of parsel's ``SelectorList`` whose ``re()`` / ``re_first()``
    accept regex ``flags`` and forward them to every contained selector.
    """

    def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
        """
        Return the first regex match found in the selectors of this list.

        The selectors are tried in order and the first match of the first
        selector that matches is returned. If the list is empty or no
        selector matches, ``default`` is returned (``None`` if the argument
        is not provided).

        By default, character entity references are replaced by their
        corresponding character (except for ``&`` and ``<``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        # Do not go through self.re(): for several selectors it returns one
        # list per selector, so taking its first item would hand back a whole
        # (possibly empty) list instead of a single match.
        for selector in self:
            matches = selector.re(
                regex, replace_entities=replace_entities, flags=flags
            )
            if matches:
                return matches[0]
        return default

    def re(self, regex, replace_entities=True, flags=re.S):
        """
        Call the ``.re()`` method for each element in this list.

        The results are NOT flattened: when the list holds exactly one
        selector its list of matches is returned directly, otherwise a list
        with one list of matches per selector is returned (an empty list for
        an empty selector list).

        By default, character entity references are replaced by their
        corresponding character (except for ``&`` and ``<``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        datas = [
            x.re(regex, replace_entities=replace_entities, flags=flags) for x in self
        ]
        return datas[0] if len(datas) == 1 else datas
class Selector(ParselSelector):
    """
    Subclass of parsel's ``Selector`` that normalizes non-breaking spaces in
    the input text, accepts regex ``flags`` in ``re()`` / ``re_first()`` and
    builds its root node through ``create_root_node``.
    """

    # list type used for the results of this selector
    selectorlist_cls = SelectorList

    def __str__(self):
        """Return a debug representation with the expression and the data."""
        data = repr(self.get())
        return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)

    __repr__ = __str__

    def __init__(self, text=None, *args, **kwargs):
        """
        Create the selector, converting non-breaking spaces to plain spaces.

        :param text: the markup to parse; left untouched when empty or None.
        :param args: forwarded to the parent selector's ``__init__``.
        :param kwargs: forwarded to the parent selector's ``__init__``.
        """
        # Convert the "&nbsp;" entity to a plain space before parsing,
        # otherwise the selector turns it into "\xa0". A literal "\xa0"
        # character is normalized as well.
        if text:
            text = re.sub("&nbsp;|\xa0", "\x20", text)
        super(Selector, self).__init__(text, *args, **kwargs)

    def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
        """
        Apply the given regex and return the first match. If there is no
        match, return the default value (``None`` if the argument is not
        provided).

        By default, character entity references are replaced by their
        corresponding character (except for ``&`` and ``<``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        datas = self.re(regex, replace_entities=replace_entities, flags=flags)
        return datas[0] if datas else default

    def re(self, regex, replace_entities=True, flags=re.S):
        """
        Apply the given regex to the data of this selector and return the
        list of matches produced by ``extract_regex``.

        ``regex`` can be either a compiled regular expression or a string
        which will be compiled with ``re.compile(regex, flags=flags)``;
        ``flags`` is ignored for an already compiled pattern.

        By default, character entity references are replaced by their
        corresponding character (except for ``&`` and ``<``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        return extract_regex(
            regex, self.get(), replace_entities=replace_entities, flags=flags
        )

    def _get_root(self, text, base_url=None):
        """Build the root node of ``text`` with this selector's parser class."""
        return create_root_node(text, self._parser, base_url=base_url)