forked from JavaDevTeam/notes
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdazhong.py
More file actions
52 lines (37 loc) · 1.26 KB
/
dazhong.py
File metadata and controls
52 lines (37 loc) · 1.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import queue
from concurrent.futures import ThreadPoolExecutor
import threading
from urllib import request
from bs4 import BeautifulSoup
# Search-results URL the crawl starts from.
url = 'https://www.dianping.com/search/keyword/9/0_%E8%8A%B1%E5%BA%97'

# Total page count; stays None until the first results page has been parsed.
total = None

# Opener used by the top-level script for the listing-page requests.
opener = request.build_opener()

# Work queue, bounded at 100 pending items.
work_queue = queue.Queue(100)

# Thread pool that runs the per-item workers.
exector = ThreadPoolExecutor()
def worker(item):
    """Download one queued page and print its street-address spans.

    Args:
        item: Mapping whose ``'href'`` entry is the URL to fetch. Other
            entries (the producer also stores ``'name'``) are not read here.
    """
    # Each call builds its own opener instead of using the module-level one.
    opener = request.build_opener()
    with opener.open(item['href']) as rep:
        html = BeautifulSoup(rep.read(), 'html.parser')
    print(html)
    # The attribute-dict filter of find_all() is spelled ``attrs``. The
    # previous ``attr=`` keyword was treated as a filter on an HTML attribute
    # literally named "attr", so the class/itemprop filter was never applied.
    address = html.find_all(
        'span', attrs={'class': 'item', 'itemprop': 'street-address'})
    print(address)
def start():
    """Hand queued items to the thread pool, forever.

    Blocks on ``work_queue.get()`` until an item is available, then submits
    ``worker(item)`` to ``exector``. The loop has no exit condition, so this
    function never returns.
    """
    while True:
        exector.submit(worker, work_queue.get())
# Start the dispatcher thread that moves queued shops into the thread pool.
# NOTE(review): start() loops forever on a non-daemon thread, so the process
# will not exit on its own once the crawl is finished.
threading.Thread(target=start).start()

# Fetch the first results page only to find out how many pages there are.
with opener.open(url) as rep:
    html = BeautifulSoup(rep.read(), 'html.parser')

# Read the total page count from the pager.
# Presumably the last pager link is "next page", which is why the
# second-to-last link is taken as the highest page number -- TODO confirm.
page_div = html.find('div', class_='page')
page_links = page_div.find_all('a') if page_div is not None else []
# With no usable pager, treat the result set as a single page instead of
# crashing on a missing element.
total = int(page_links[-2].string) if len(page_links) >= 2 else 1

for page_no in range(1, total + 1):
    with opener.open(url + '/p' + str(page_no)) as rep:
        # Parse the page that was just downloaded. The earlier code re-used
        # the first page's soup here, so every iteration enqueued the same
        # shops.
        page_html = BeautifulSoup(rep.read(), 'html.parser')

    for title_div in page_html.find_all('div', class_='tit'):
        link = title_div.find('a')
        heading = title_div.find('h4')
        # Skip title blocks that lack a link or a heading rather than
        # aborting the whole crawl on one malformed entry.
        if link is None or heading is None:
            continue
        href = link.get('href')
        if not href:
            continue
        work_queue.put({'href': href, 'name': heading.string})