forked from netnr/zoning
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathzoning.js
More file actions
310 lines (295 loc) · 9.71 KB
/
zoning.js
File metadata and controls
310 lines (295 loc) · 9.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
/*
* https://github.com/netnr/zoning
* https://gitee.com/netnr/zoning
*
* 2019-04-26
* netnr
*
*
* 文件:
* 0.json 根数据
* 12.json 二级数据
* 1234.json 三级数据
* 123456.json 四级数据
* 123456789.json 五级数据
*
* 其他:
* zoning-*.json 所有数据,* 代表级数
* catch-*.json 抓取异常记录(有异常时)
*
* 测试:
* Chrome比较快,会出现几个链接抓取失败;
* Firefox比较稳定,抓取有保障,内存占用高(推荐)
*
*/
/**
 * Browser-side crawler: fetches linked HTML pages level by level, extracts
 * {path, id, text} entries from each page, and packages the collected data as
 * JSON files inside a downloadable zip.
 *
 * Reads its settings from `zoning.config`, which must be assigned before `run()`.
 * Declared with `var` so the script can be evaluated more than once in the same
 * page without a redeclaration error.
 */
var zoning = {
    // Version number
    version: "2.0.18",

    // Set to true to suspend queue consumption and progress logging
    pause: false,
    // Set to true to stop both timers and trigger the zip download on the next progress tick
    stop: false,

    /**
     * Load an external script by appending a <script> element to <head>.
     * @param {string} src - URL of the script.
     * @param {Function} [success] - Invoked at most once after the script has loaded.
     */
    getScript: function (src, success) {
        const ele = document.createElement("SCRIPT");
        ele.src = src;
        ele.type = "text/javascript";
        // A failed download would otherwise go unnoticed and the callback would simply never run
        ele.onerror = function () {
            console.error("failed to load script: " + src);
        };
        if (success != undefined) {
            let done = false;
            ele.onload = ele.onreadystatechange = function () {
                // onload and onreadystatechange can both fire for one load; run the callback once only
                if (done) {
                    return;
                }
                if (!this.readyState || this.readyState == "loaded" || this.readyState == "complete") {
                    done = true;
                    success();
                }
            };
        }
        // Handlers are registered before the element is attached so no load event can be missed
        document.getElementsByTagName("HEAD")[0].appendChild(ele);
    },

    /**
     * Fetch one page, parse it, and queue its children for the next level.
     * Any failure (network error, non-2xx status, read error, parse error) is
     * recorded in `zoning.catchdata` instead of being lost.
     * @param {string} urlprefix - Base URL every page address starts with.
     * @param {number} deep - Level of the page being fetched.
     * @param {{id: string, path: ?string}} item - Parent entry whose page is fetched.
     * @returns {(boolean|undefined)} false when the item has no path (nothing to fetch), otherwise undefined.
     */
    grab: function (urlprefix, deep, item) {
        if (item.path == null) {
            return false;
        }
        let url = urlprefix;
        // At level 4 and 5 the page sits below directories derived from the leading digits of the id
        switch (deep) {
            case 4:
                url += item.id.slice(0, 2) + "/";
                break;
            case 5:
                url += item.id.slice(0, 2) + "/" + item.id.slice(2, 4) + "/";
                break;
        }
        url += item.path + ".html";
        // Total number of requests issued
        zoning.taskcount++;
        fetch(url).then(function (res) {
            // fetch only rejects on network failure; an HTTP error page must not be parsed as data
            if (!res.ok) {
                throw new Error("HTTP " + res.status + " " + res.statusText);
            }
            return res.blob();
        }).then(function (blob) {
            // Decode the body as GBK text; wrapped in a promise so read errors reach the catch below
            return new Promise(function (resolve, reject) {
                const reader = new FileReader();
                reader.onload = function () {
                    resolve(reader.result);
                };
                reader.onerror = function () {
                    reject(reader.error || new Error("FileReader failed"));
                };
                reader.readAsText(blob, 'GBK');
            });
        }).then(function (text) {
            let list = zoning.matcharray(text, item, deep, url);
            // Optional filter: keep only the configured codes, unless none of them occurs on this page
            const wanted = zoning.config.fetchcode;
            if (wanted.length) {
                const fetchlist = list.filter(function (x) {
                    return wanted.indexOf(x.id) > -1;
                });
                if (fetchlist.length) {
                    list = fetchlist;
                }
            }
            if (deep < zoning.config.deepmax) {
                list.forEach(function (li) {
                    zoning.queue.add({
                        urlprefix: urlprefix,
                        deep: deep + 1,
                        item: li
                    });
                });
            }
        }).catch(function (e) {
            zoning.catchdata.push({
                item: item,
                url: url,
                path: item.path,
                deep: deep,
                error: e + ""
            });
        });
    },

    /**
     * Retry previously failed requests.
     * When the live `zoning.catchdata` list is passed, it is emptied first so that
     * afterwards it only contains the entries that failed again.
     * @param {Array<{item: Object, path: string, deep: number}>} catchdata - Failure records to retry.
     */
    grabcatch: function (catchdata) {
        // Work on a snapshot: failed retries append to zoning.catchdata while this runs
        const pending = catchdata.slice();
        if (catchdata === zoning.catchdata) {
            zoning.catchdata.length = 0;
        }
        pending.forEach(function (cdi) {
            // The record keeps its own copy of the path in case the item no longer carries one
            cdi.item.path = cdi.path;
            zoning.grab(zoning.config.urlprefix, cdi.deep, cdi.item);
        });
    },

    // Task queue: pages waiting to be fetched
    queue: {
        // Pending tasks, each {urlprefix, deep, item}
        list: [],

        /**
         * Append a task to the end of the queue.
         * @param {{urlprefix: string, deep: number, item: Object}} task
         */
        add: function (task) {
            zoning.queue.list.push(task);
        },

        /**
         * Remove and return tasks from the front of the queue.
         * @param {number} [n] - How many tasks to take; defaults to 1.
         * @returns {?Array<Object>} The removed tasks, or null when the queue is empty.
         */
        use: function (n) {
            const len = zoning.queue.list.length;
            if (len) {
                return zoning.queue.list.splice(0, Math.min(n || 1, len));
            }
            return null;
        },

        /**
         * (Re)start the timer that takes one task per `zoning.config.gap` milliseconds
         * and fetches it, unless `zoning.pause` is set.
         */
        run: function () {
            clearInterval(zoning.taskdefer.queuerun);
            zoning.taskdefer.queuerun = setInterval(function () {
                if (zoning.pause) {
                    return;
                }
                const task = zoning.queue.use();
                if (task) {
                    task.forEach(x => zoning.grab(x.urlprefix, x.deep, x.item));
                }
            }, zoning.config.gap);
        }
    },

    // Timer handles: `run` (progress logger) and `queuerun` (queue consumer)
    taskdefer: {},
    // Number of requests issued
    taskcount: 0,
    // Number of pages parsed successfully
    matchcount: 0,
    // Failure records: {item, url, path, deep, error}
    catchdata: [],
    // Parsed entries per page, keyed by the parent item's id ("0" for the root)
    matchdata: {},

    /**
     * Extract entries from a page and store them in `zoning.matchdata`.
     *
     * Three row shapes are recognised: rows of three plain cells (12-digit code,
     * 3-digit code, name), rows of two plain cells (12-digit code, name), and
     * linked entries. Above level 1 links are expected in pairs, the first link
     * holding the code and the second the name.
     *
     * @param {string} data - Page HTML.
     * @param {{id: string}} item - Parent entry; its id is the key under which the result is stored.
     * @param {number} deep - Level of the page.
     * @returns {Array<{path: ?string, id: string, text: string}>} Parsed entries, plain rows first.
     * @throws {Error} When deep > 1 and the links are not paired or have an unexpected href;
     *     nothing is stored in that case.
     */
    matcharray: function (data, item, deep) {
        const arr = [];
        // Normalise single quotes to double quotes and drop <br/> tags
        data = data.replace(/'/g, '"').replace(/<br\/>/g, "");
        // All anchor tags
        const reg = /<a[^>]*href=['"]([^"]*)['"][^>]*>(.*?)<\/a>/g;
        const matchs = data.match(reg);
        let clen;
        if (matchs && deep > 1) {
            // Length of the id used by links on this page; ids from plain rows are cut to the same length
            const firstid = matchs[0].split('.')[0].split('/')[1];
            if (firstid === undefined) {
                throw new Error("matcharray: cannot derive code length from link " + matchs[0]);
            }
            clen = firstid.length;
            if (matchs.length % 2 !== 0) {
                throw new Error("matcharray: unpaired link on level " + deep + " page");
            }
        }
        const seen = new Set();
        // Rows without links that have three cells (last level)
        (data.match(/<td>[0-9]{12}<\/td><td>[0-9]{3}<\/td><td>.*?<\/td>/g) || []).forEach(function (row) {
            const cells = row.split('</td><td>');
            const id = cells[0].split('>')[1];
            arr.push({ path: null, id: id, text: cells[2].split('<')[0] });
            seen.add(id);
        });
        // Rows without links that have two cells; three-cell rows match this pattern too and are skipped via `seen`
        (data.match(/<td>[0-9]{12}<\/td><td>.*?<\/td>/g) || []).forEach(function (row) {
            const cells = row.split('</td><td>');
            let id = cells[0].split('>')[1];
            if (clen) {
                id = id.slice(0, clen);
            }
            if (!seen.has(id)) {
                arr.push({ path: null, id: id, text: cells[1].split('<')[0] });
            }
        });
        // Linked entries
        if (matchs) {
            // Above level 1 each entry spans two links (code, name); at level 1 a single link
            const step = deep > 1 ? 2 : 1;
            for (let i = 0; i < matchs.length; i += step) {
                const codelink = matchs[i];
                const textlink = matchs[i + step - 1];
                const obj = {};
                obj.path = codelink.split('"')[1].split('.')[0];
                // When the href contains a slash the id is the part after it
                const hpre = codelink.split('.')[0];
                if (hpre.indexOf('/') >= 0) {
                    obj.id = hpre.split('/')[1];
                } else {
                    obj.id = hpre.split('"')[1];
                }
                obj.text = textlink.split('>')[1].split('<')[0];
                arr.push(obj);
            }
        }
        // File name (code) the entries are stored under
        const filename = item.id || "0";
        zoning.matchdata[filename] = arr;
        zoning.matchcount += 1;
        return arr;
    },

    /**
     * Build the zip from the data collected so far and start the download.
     */
    zip: function () {
        zoning.ziping(zoning.matchdata, zoning.catchdata);
    },

    /**
     * Compute the files that go into the zip without touching the input data.
     * Entries are copied without their `path` property, so objects that are still
     * referenced by the queue keep their path and can be fetched after a zip.
     * @param {Object<string, Array<Object>>} matchdata - Parsed entries keyed by code.
     * @param {Array<Object>} catchdata - Failure records.
     * @returns {Array<{name: string, content: string}>} One file per code, then the
     *     combined `zoning-<deepmax>.json`, then `catch-<deepmax>.json` if there are failures.
     */
    zipentries: function (matchdata, catchdata) {
        const entries = [];
        const all = {};
        for (const code in matchdata) {
            const cleaned = matchdata[code].map(function (entry) {
                const copy = {};
                for (const key in entry) {
                    if (key !== "path") {
                        copy[key] = entry[key];
                    }
                }
                return copy;
            });
            all[code] = cleaned;
            // Codes longer than one character are grouped in a folder named after their first two characters
            const name = code.length > 1
                ? code.slice(0, 2) + "/" + code + ".json"
                : code + ".json";
            entries.push({ name: name, content: JSON.stringify(cleaned) });
        }
        entries.push({
            name: "zoning-" + zoning.config.deepmax + ".json",
            content: JSON.stringify(all)
        });
        if (catchdata.length) {
            entries.push({
                name: "catch-" + zoning.config.deepmax + ".json",
                content: JSON.stringify(catchdata)
            });
        }
        return entries;
    },

    /**
     * Load JSZip and FileSaver from the configured URLs, then build and download the zip.
     * @param {Object<string, Array<Object>>} matchdata - Parsed entries keyed by code.
     * @param {Array<Object>} catchdata - Failure records.
     */
    ziping: function (matchdata, catchdata) {
        zoning.getScript(zoning.config.urljszip, function () {
            zoning.getScript(zoning.config.urlfilesaver, function () {
                const zip = new JSZip();
                zoning.zipentries(matchdata, catchdata).forEach(function (entry) {
                    zip.file(entry.name, entry.content);
                });
                zip.generateAsync({ type: "blob" }).then(function (content) {
                    saveAs(content, "zoning-" + zoning.config.deepmax + ".zip");
                });
            });
        });
    },

    /**
     * Start crawling: begin progress logging every 3 seconds, fetch the start page
     * and start consuming the queue. Setting `zoning.stop` ends both timers and
     * triggers the zip download.
     */
    run: function () {
        zoning.startTime = new Date().valueOf();
        // Calling run() again must not leave an earlier progress timer running
        clearInterval(zoning.taskdefer.run);
        zoning.taskdefer.run = setInterval(function () {
            if (zoning.stop) {
                clearInterval(zoning.taskdefer.run);
                clearInterval(zoning.taskdefer.queuerun);
                zoning.zip();
            } else if (!zoning.pause) {
                console.log("fetch: " + zoning.matchcount + " catch: " + zoning.catchdata.length);
            }
        }, 1000 * 3);
        console.log('fetching ... please see the network tab');
        // Fetch the start page; its children are put on the queue
        zoning.grab(zoning.config.urlprefix, zoning.config.deep, zoning.config.item);
        // Consume the queue
        zoning.queue.run();
    }
};
//任务总量
//zoning.taskcount
//抓取数量
//zoning.matchcount
//抓取异常记录
//zoning.catchdata
//抓取结果数据
//zoning.matchdata
// Settings read by the crawler; assigned before the crawl is started below
zoning.config = {
    // CDN address of jszip
    urljszip: "https://lib.baomitu.com/jszip/3.1.4/jszip.min.js",

    // CDN address of FileSaver
    urlfilesaver: "https://lib.baomitu.com/FileSaver.js/2014-11-29/FileSaver.min.js",

    // Base URL that every fetched page address starts with
    urlprefix: "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/",

    // Entry the crawl starts from
    item: {
        // Parent code
        id: "0",
        // Page address relative to urlprefix, without the ".html" suffix
        path: "index"
    },

    // Level the start page belongs to
    deep: 1,

    // Deepest level to crawl; original note: 4 = street level (approx. 3380), 5 = village level (approx. 46800)
    deepmax: 5,

    // Milliseconds between queue ticks (original note: 200 ms tested as stable)
    gap: 250,

    // Codes to restrict the crawl to; empty crawls everything.
    // Example: ["11", "50"] crawls only Beijing and Chongqing
    fetchcode: []
};

// Start crawling
zoning.run();

// Once crawling has finished, download the zip with:
//zoning.zip();

// Note: the site is rate limited; requesting too fast leads to captchas and IP blocking.
//       Consider crawling specific codes only, changing the interval, or requesting
//       at high speed only after the browser has cached the pages.