forked from Boris-code/feapder
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebdriver.py
More file actions
235 lines (194 loc) · 6.94 KB
/
webdriver.py
File metadata and controls
235 lines (194 loc) · 6.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# -*- coding: utf-8 -*-
"""
Created on 2021/3/18 4:59 PM
---------
@summary: Selenium WebDriver wrapper (Chrome / PhantomJS) and a singleton pool of reusable drivers
---------
@author: Boris
@email: boris_liu@foxmail.com
"""
import queue
import threading
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from feapder.utils.log import log
from feapder.utils.tools import Singleton
class WebDriver:
    """Thin wrapper that builds a configured selenium driver and proxies to it.

    Any attribute that is not defined on the wrapper itself is looked up on the
    underlying selenium driver (see ``__getattr__``), so an instance can be used
    wherever the raw driver would be, e.g. ``wrapper.get(url)``.

    The wrapper is also a context manager: leaving the ``with`` block logs any
    exception, quits the browser and suppresses the exception.
    """

    CHROME = "CHROME"
    PHANTOMJS = "PHANTOMJS"

    # Used when the caller does not supply a user agent.
    _DEFAULT_USER_AGENT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.36"
    )

    def __init__(
        self,
        load_images=True,
        user_agent=None,
        proxy=None,
        headless=False,
        driver_type=PHANTOMJS,
        timeout=16,
        window_size=(1024, 800),
        executable_path=None,
    ):
        """
        @param load_images: whether the browser should load images
        @param user_agent: user-agent string, or a callable returning one;
            a built-in desktop Chrome user agent is used when falsy
        @param proxy: "xxx.xxx.xxx.xxx:xxxx", or a callable returning such a
            string; no proxy is configured when falsy
        @param headless: whether to start Chrome in headless mode
            (only read by chrome_driver)
        @param driver_type: WebDriver.CHROME or WebDriver.PHANTOMJS
        @param timeout: page-load and script timeout in seconds, default 16
        @param window_size: browser window size as (width, height)
        @param executable_path: path to the driver executable; when falsy the
            argument is not passed and selenium's own default applies
        @raise TypeError: if driver_type is not one of the supported constants
        """
        self._load_images = load_images
        self._user_agent = user_agent or self._DEFAULT_USER_AGENT
        self._proxy = proxy
        self._headless = headless
        self._timeout = timeout
        self._window_size = window_size
        self._executable_path = executable_path

        # NOTE(review): these two attributes are never read inside this class;
        # they are kept because external callers may rely on them.
        self.proxies = {}
        self.user_agent = None

        if driver_type == WebDriver.CHROME:
            self.driver = self.chrome_driver()
        elif driver_type == WebDriver.PHANTOMJS:
            self.driver = self.phantomjs_driver()
        else:
            raise TypeError(
                "driver_type must be one of CHROME or PHANTOMJS, but received {!r}".format(
                    driver_type
                )
            )

        # driver.get(url) can hang forever without raising; a page-load timeout
        # makes it fail instead of blocking the program.
        self.driver.set_page_load_timeout(self._timeout)
        # Apply the same timeout to script execution.
        self.driver.set_script_timeout(self._timeout)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Log the in-flight exception (if any), quit the browser, and
        suppress the exception by returning True."""
        if exc_val:
            log.error(exc_val)

        self.quit()
        return True

    def get_driver(self):
        """Return the underlying selenium driver."""
        return self.driver

    @staticmethod
    def _resolve(value):
        """Return value() when value is callable, otherwise value itself."""
        return value() if callable(value) else value

    def chrome_driver(self):
        """Build and return a Chrome driver configured from the constructor
        arguments."""
        chrome_options = webdriver.ChromeOptions()
        # Important: drop the automation switches so that sites have a harder
        # time detecting that the browser is driven by Selenium.
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)

        if self._proxy:
            chrome_options.add_argument(
                "--proxy-server={}".format(self._resolve(self._proxy))
            )
        if self._user_agent:
            chrome_options.add_argument(
                "user-agent={}".format(self._resolve(self._user_agent))
            )
        if not self._load_images:
            # Content-setting value 2 blocks images.
            chrome_options.add_experimental_option(
                "prefs", {"profile.managed_default_content_settings.images": 2}
            )
        if self._headless:
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-gpu")
        if self._window_size:
            chrome_options.add_argument(
                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
            )

        # Only pass executable_path when the caller supplied one.
        driver_kwargs = {"chrome_options": chrome_options}
        if self._executable_path:
            driver_kwargs["executable_path"] = self._executable_path
        driver = webdriver.Chrome(**driver_kwargs)

        # Hide navigator.webdriver on every new document, before page scripts run.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {
                "source": """
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined
                    })
                """
            },
        )

        return driver

    def _phantomjs_service_args(self):
        """Return the PhantomJS command-line arguments for proxy and image loading."""
        service_args = []
        if self._proxy:
            # Resolve the proxy first, then format it. Writing
            # `"--proxy=%s" % f() if callable(f) else f` applies `%` before the
            # conditional and drops the "--proxy=" prefix for plain strings.
            service_args.append("--proxy=%s" % self._resolve(self._proxy))
        if not self._load_images:
            service_args.append("--load-images=no")
        return service_args

    def phantomjs_driver(self):
        """Build and return a PhantomJS driver configured from the constructor
        arguments."""
        import warnings

        # Installs a process-wide "ignore" filter; presumably meant to silence
        # the warning selenium emits when PhantomJS is started - TODO confirm.
        warnings.filterwarnings("ignore")

        # Work on a copy so the user agent is not written into selenium's
        # shared DesiredCapabilities.PHANTOMJS dict.
        dcap = DesiredCapabilities.PHANTOMJS.copy()
        if self._user_agent:
            dcap["phantomjs.page.settings.userAgent"] = self._resolve(self._user_agent)

        # Only pass executable_path when the caller supplied one.
        driver_kwargs = {
            "service_args": self._phantomjs_service_args(),
            "desired_capabilities": dcap,
        }
        if self._executable_path:
            driver_kwargs["executable_path"] = self._executable_path
        driver = webdriver.PhantomJS(**driver_kwargs)

        if self._window_size:
            driver.set_window_size(self._window_size[0], self._window_size[1])

        return driver

    @property
    def cookies(self):
        """Return the driver's cookies as a flat {name: value} dict."""
        return {
            cookie["name"]: cookie["value"] for cookie in self.driver.get_cookies()
        }

    def __getattr__(self, name):
        """Delegate unknown attributes to the underlying selenium driver.

        @raise AttributeError: if no driver has been created yet
        """
        # Read `driver` from __dict__ directly: `self.driver` would re-enter
        # __getattr__ when the attribute is missing and recurse without bound.
        driver = self.__dict__.get("driver")
        if driver is None:
            raise AttributeError(
                "{!r} object has no attribute {!r}".format(type(self).__name__, name)
            )
        return getattr(driver, name)
# def __del__(self):
# self.quit()
@Singleton
class WebDriverPool:
    """Pool of reusable WebDriver instances, created lazily up to pool_size.

    Drivers are handed out with get() and must be given back with put(), or
    discarded with remove() when they should not be reused.
    """

    def __init__(self, pool_size=5, **kwargs):
        """
        @param pool_size: maximum number of drivers the pool will create.
            Must be positive: queue.Queue treats maxsize <= 0 as unbounded,
            while is_full would then always be true, so get() would never
            create a driver and would block.
        @param kwargs: keyword arguments forwarded to WebDriver(...) for every
            driver the pool creates
        """
        self.queue = queue.Queue(maxsize=pool_size)
        self.kwargs = kwargs
        self.lock = threading.RLock()
        # Number of live drivers created by this pool (idle or handed out).
        self.driver_count = 0

    @property
    def is_full(self):
        """True once the pool has created as many drivers as the queue holds."""
        return self.driver_count >= self.queue.maxsize

    def get(self):
        """Return a driver, creating one first if the pool is not yet full.

        Blocks until a driver is available in the queue.
        """
        if not self.is_full:
            with self.lock:
                # Re-check under the lock so concurrent callers do not create
                # more than pool_size drivers.
                if not self.is_full:
                    driver = WebDriver(**self.kwargs)
                    self.queue.put(driver)
                    self.driver_count += 1

        return self.queue.get()

    def put(self, driver):
        """Give a driver back to the pool so it can be reused."""
        self.queue.put(driver)

    def remove(self, driver):
        """Quit a driver and free its slot so get() can create a replacement.

        The slot is freed even if quit() raises; the exception still propagates.
        """
        try:
            driver.quit()
        finally:
            # Same lock as get(), so the counter update cannot interleave with
            # the increment there.
            with self.lock:
                self.driver_count -= 1

    def close(self):
        """Quit every idle driver currently sitting in the queue.

        All idle drivers are attempted even if one fails to quit; the first
        failure is logged and re-raised after the queue has been drained.
        Drivers that are currently handed out are not affected.
        """
        first_error = None
        while True:
            try:
                # Non-blocking: another thread may take the last driver between
                # an empty() check and a blocking get(), which would hang here.
                driver = self.queue.get_nowait()
            except queue.Empty:
                break

            try:
                self.remove(driver)
            except Exception as e:
                log.error(e)
                if first_error is None:
                    first_error = e

        if first_error is not None:
            raise first_error