From 6bd1eed25e0f9b8275909eb3798005e3391ec1b0 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Jun 2017 10:26:55 +0800 Subject: [PATCH 001/257] fix duplicate call of onSuccess and onError #605 --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 528e1988c..690961881 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -410,7 +410,6 @@ private void processRequest(Request request) { } private void onDownloadSuccess(Request request, Page page) { - onSuccess(request); if (site.getAcceptStatCode().contains(page.getStatusCode())){ pageProcessor.process(page); extractAndAddRequests(page, spawnUrl); @@ -431,7 +430,6 @@ private void onDownloaderFail(Request request) { // for cycle retry doCycleRetry(request); } - onError(request); } private void doCycleRetry(Request request) { From 4111b07263579e4e7f714547f1b5bc2e0fa46b4f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Jun 2017 10:36:41 +0800 Subject: [PATCH 002/257] more error log on page code error #601 --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 ++ .../us/codecraft/webmagic/downloader/HttpClientDownloader.java | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 690961881..62c989f1d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -418,6 +418,8 @@ private void onDownloadSuccess(Request request, Page page) { pipeline.process(page.getResultItems(), this); } } + } else { + logger.info("page status code error, page {} , code: {}", request.getUrl(), 
page.getStatusCode()); } sleep(site.getSleepTime()); return; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 7119d107c..6d4442a6f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -76,7 +76,6 @@ public Page download(Request request, Task task) { if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } - logger.debug("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; @@ -86,7 +85,7 @@ public Page download(Request request, Task task) { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, task.getSite().getCharset(), httpResponse, task); onSuccess(request); - logger.debug("downloading page success {}", page); + logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); From 3ee00015c2feae21838756fe9155e18288518e11 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Jun 2017 16:08:39 +0800 Subject: [PATCH 003/257] change header from Authorization to Proxy-Authorization for Proxy Authorization #596 --- .../downloader/HttpUriRequestConverter.java | 3 ++- .../downloader/HttpClientDownloaderTest.java | 23 +++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java 
index 354f29efb..af2fef46d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -2,6 +2,7 @@ import org.apache.http.HttpHost; import org.apache.http.auth.AuthState; +import org.apache.http.auth.ChallengeState; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.CookieStore; import org.apache.http.client.config.CookieSpecs; @@ -41,7 +42,7 @@ private HttpClientContext convertHttpClientContext(Request request, Site site, P HttpClientContext httpContext = new HttpClientContext(); if (proxy != null && proxy.getUsername() != null) { AuthState authState = new AuthState(); - authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); + authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); } if (request.getCookies() != null && !request.getCookies().isEmpty()) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 3f9c83882..1c8efc57f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -10,7 +10,6 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; @@ -255,15 +254,21 @@ public void run() throws Exception { }); } - @Ignore("need proxy server") @Test - public void 
test_download_by_SimpleProxyProvider(){ - HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); - httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("127.0.0.1", 1087))); - Request request = new Request(); - request.setUrl("https://www.baidu.com"); - Page page = httpClientDownloader.download(request, Site.me().toTask()); - assertThat(page.isDownloadSuccess()); + public void test_download_auth_by_SimpleProxyProvider() throws Exception { + HttpServer server = httpServer(13423); + server.get(eq(header("Proxy-Authorization"), "Basic dXNlcm5hbWU6cGFzc3dvcmQ=")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("127.0.0.1", 13423, "username", "password"))); + Request request = new Request(); + request.setUrl("http://www.baidu.com"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); } } From 1b6394bef95e84fb50e177a89186ad2e4e079534 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Jun 2017 16:10:10 +0800 Subject: [PATCH 004/257] version 0.7.2 --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 69ea3e7b1..89852910e 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.7.2-SNAPSHOT + 0.7.2 4.0.0 pom @@ -233,7 +233,7 @@ 2.10.4 UTF-8 - WebMagic 0.7.2-SNAPSHOT + WebMagic 0.7.2 en_US diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index b10b78e8c..dac9bb994 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 
0.7.2-SNAPSHOT + 0.7.2 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 3a4571647..19f1ad792 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.2-SNAPSHOT + 0.7.2 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 5e4a6019c..4bb6ae7b7 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.2-SNAPSHOT + 0.7.2 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 26e3e6260..2e467829f 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.2-SNAPSHOT + 0.7.2 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 5c2f3fde4..821db623e 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.2-SNAPSHOT + 0.7.2 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 9db23209e..290fa7ac9 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.2-SNAPSHOT + 0.7.2 4.0.0 From db65ec26f9d4139e9835eff047d41b066b38cd8c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Jun 2017 18:54:28 +0800 Subject: [PATCH 005/257] update version in readme --- README-zh.md | 4 ++-- README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README-zh.md b/README-zh.md index a5953127c..aa04c0697 100644 --- a/README-zh.md +++ b/README-zh.md @@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.1 + 0.7.2 us.codecraft webmagic-extension - 0.7.1 + 0.7.2 ``` diff --git a/README.md b/README.md index ef6897ee3..3829aa1f4 100644 --- a/README.md +++ b/README.md @@ -23,12 +23,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.1 + 0.7.2 us.codecraft webmagic-extension - 0.7.1 + 
0.7.2 ``` From 2d69fcf85060c6af8dc65eb470687ec1e61e22d9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 18 Jun 2017 20:45:59 +0800 Subject: [PATCH 006/257] qq qun --- README-zh.md | 3 ++- README.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README-zh.md b/README-zh.md index aa04c0697..be471d31f 100644 --- a/README-zh.md +++ b/README-zh.md @@ -178,7 +178,8 @@ QQ: ### QQ群: -373225642 +373225642(已满) +542327088 ### 相关项目: diff --git a/README.md b/README.md index 3829aa1f4..f2a1115ff 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ To write webmagic, I refered to the projects below : [http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) -QQ Group: 373225642 +QQ Group: 373225642 542327088 ### Related Project From 1fd9480496941f261af8792c1f817b7cd86af764 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 18 Jun 2017 20:51:08 +0800 Subject: [PATCH 007/257] remove qq qun 2 --- README-zh.md | 3 +-- README.md | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/README-zh.md b/README-zh.md index be471d31f..aa04c0697 100644 --- a/README-zh.md +++ b/README-zh.md @@ -178,8 +178,7 @@ QQ: ### QQ群: -373225642(已满) -542327088 +373225642 ### 相关项目: diff --git a/README.md b/README.md index f2a1115ff..3829aa1f4 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ To write webmagic, I refered to the projects below : [http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) -QQ Group: 373225642 542327088 +QQ Group: 373225642 ### Related Project From faca38d4ec91b7f09b292c453d53577ed0a2ff41 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 24 Jun 2017 11:01:23 +0800 Subject: [PATCH 008/257] update jsonpath to 2.2.0 #606 --- pom.xml | 2 +- 
.../webmagic/selector/JsonPathSelector.java | 16 +++++++++++++--- .../webmagic/selector/JsonPathSelectorTest.java | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 89852910e..1a0139f9b 100644 --- a/pom.xml +++ b/pom.xml @@ -83,7 +83,7 @@ com.jayway.jsonpath json-path - 0.8.1 + 2.2.0 org.slf4j diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index b0b90f9bf..f5c0baeb5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,9 +1,11 @@ package us.codecraft.webmagic.selector; +import com.alibaba.fastjson.JSON; import com.jayway.jsonpath.JsonPath; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** * JsonPath selector.
@@ -32,12 +34,20 @@ public String select(String text) { if (object instanceof List) { List list = (List) object; if (list != null && list.size() > 0) { - return list.iterator().next().toString(); + return toString(list.iterator().next()); } } return object.toString(); } + private String toString(Object object) { + if (object instanceof Map) { + return JSON.toJSONString(object); + } else { + return String.valueOf(object); + } + } + @Override public List selectList(String text) { List list = new ArrayList(); @@ -48,10 +58,10 @@ public List selectList(String text) { if (object instanceof List) { List items = (List) object; for (Object item : items) { - list.add(String.valueOf(item)); + list.add(toString(item)); } } else { - list.add(String.valueOf(object)); + list.add(toString(object)); } return list; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java index b88e51ee6..6dff0faaf 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java @@ -52,4 +52,5 @@ public void testJsonPath() { JSONObject object2=JSON.parseObject("{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\",\"category\":\"reference\",\"price\":8.95}"); assertThat(object1).isEqualTo(object2); } + } From eb376fca745d159029c7204051ecbccf89123fd9 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 24 Jun 2017 11:29:35 +0800 Subject: [PATCH 009/257] update jsoup to 1.10.3 #608 --- pom.xml | 2 +- .../us/codecraft/webmagic/selector/Html.java | 18 +----------------- .../java/us/codecraft/webmagic/HtmlTest.java | 1 - 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/pom.xml b/pom.xml index 1a0139f9b..e4b2c841c 100644 --- a/pom.xml +++ b/pom.xml @@ -146,7 +146,7 @@ org.jsoup jsoup - 1.8.3 + 1.10.3 org.mockito diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 7b22639a6..f2218f126 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -3,7 +3,6 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.nodes.Entities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,25 +19,12 @@ public class Html extends HtmlNode { private Logger logger = LoggerFactory.getLogger(getClass()); - private static volatile boolean INITED = false; - /** * Disable jsoup html entity escape. It can be set just before any Html instance is created. + * @deprecated */ public static boolean DISABLE_HTML_ENTITY_ESCAPE = false; - /** - * Disable jsoup html entity escape. It is a hack way only for jsoup 1.7.2. - */ - private void disableJsoupHtmlEntityEscape() { - if (DISABLE_HTML_ENTITY_ESCAPE && !INITED) { - Entities.EscapeMode.base.getMap().clear(); - Entities.EscapeMode.extended.getMap().clear(); - Entities.EscapeMode.xhtml.getMap().clear(); - INITED = true; - } - } - /** * Store parsed document for better performance when only one text exist. 
*/ @@ -46,7 +32,6 @@ private void disableJsoupHtmlEntityEscape() { public Html(String text, String url) { try { - disableJsoupHtmlEntityEscape(); this.document = Jsoup.parse(text, url); } catch (Exception e) { this.document = null; @@ -56,7 +41,6 @@ public Html(String text, String url) { public Html(String text) { try { - disableJsoupHtmlEntityEscape(); this.document = Jsoup.parse(text); } catch (Exception e) { this.document = null; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index faf249fac..f42f68d74 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -30,7 +30,6 @@ public void testDisableJsoupHtmlEntityEscape() throws Exception { @Test public void testEnableJsoupHtmlEntityEscape() throws Exception { - Html.DISABLE_HTML_ENTITY_ESCAPE = false; Html html = new Html("aaaaaaa&b"); assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b"); } From f405e642c0c7e0e1181716e14db8309b9c1864a2 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 24 Jun 2017 11:31:17 +0800 Subject: [PATCH 010/257] add default constructor for HttpRequestBody #609 --- .../webmagic/model/HttpRequestBody.java | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 9d5f85510..abd3d5bd7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -29,11 +29,14 @@ public static abstract class ContentType { public static final String MULTIPART = "multipart/form-data"; } - private final byte[] body; + private byte[] body; - private final String contentType; + private String contentType; - 
private final String encoding; + private String encoding; + + public HttpRequestBody() { + } public HttpRequestBody(byte[] body, String contentType, String encoding) { this.body = body; @@ -49,6 +52,18 @@ public String getEncoding() { return encoding; } + public void setBody(byte[] body) { + this.body = body; + } + + public void setContentType(String contentType) { + this.contentType = contentType; + } + + public void setEncoding(String encoding) { + this.encoding = encoding; + } + public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } From 486a6d5c9301f5ef3fac563677669a7bf5d11759 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 27 Jun 2017 17:37:48 +0800 Subject: [PATCH 011/257] =?UTF-8?q?qq=E7=BE=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-zh.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README-zh.md b/README-zh.md index aa04c0697..88fa0d2e7 100644 --- a/README-zh.md +++ b/README-zh.md @@ -178,7 +178,7 @@ QQ: ### QQ群: -373225642 +373225642(已满) 542327088 ### 相关项目: diff --git a/README.md b/README.md index 3829aa1f4..f2a1115ff 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ To write webmagic, I refered to the projects below : [http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) -QQ Group: 373225642 +QQ Group: 373225642 542327088 ### Related Project From c3bdb204580cf7618bf70e67992ab14e227156dd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 22 Jul 2017 11:49:23 +0800 Subject: [PATCH 012/257] #631 remove IllegalArgumentException of HttpRequestBody.json and so on --- .../webmagic/model/HttpRequestBody.java | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff 
--git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index abd3d5bd7..7d3b30785 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -64,24 +64,36 @@ public void setEncoding(String encoding) { this.encoding = encoding; } - public static HttpRequestBody json(String json, String encoding) throws UnsupportedEncodingException { - return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); + public static HttpRequestBody json(String json, String encoding) { + try { + return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("illegal encoding " + encoding, e); + } } - public static HttpRequestBody xml(String xml, String encoding) throws UnsupportedEncodingException { - return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); + public static HttpRequestBody xml(String xml, String encoding) { + try { + return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("illegal encoding " + encoding, e); + } } - public static HttpRequestBody custom(byte[] body, String contentType, String encoding) throws UnsupportedEncodingException { + public static HttpRequestBody custom(byte[] body, String contentType, String encoding) { return new HttpRequestBody(body, contentType, encoding); } - public static HttpRequestBody form(Map params, String encoding) throws UnsupportedEncodingException { + public static HttpRequestBody form(Map params, String encoding){ List nameValuePairs = new ArrayList(params.size()); for (Map.Entry entry : params.entrySet()) { nameValuePairs.add(new BasicNameValuePair(entry.getKey(), 
String.valueOf(entry.getValue()))); } - return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); + try { + return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("illegal encoding " + encoding, e); + } } public byte[] getBody() { From 2183ba9b61a766f94d23ee62d6ee07b219a75f9d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 22 Jul 2017 12:11:00 +0800 Subject: [PATCH 013/257] #571 add getBytes to Page --- .../main/java/us/codecraft/webmagic/Page.java | 10 ++++++++ .../java/us/codecraft/webmagic/Request.java | 14 +++++++++++ .../downloader/HttpClientDownloader.java | 23 +++++++++++-------- .../downloader/HttpClientDownloaderTest.java | 18 +++++++++++++++ 4 files changed, 55 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index a945607b2..758e4c681 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -46,6 +46,8 @@ public class Page { private boolean downloadSuccess = true; + private byte[] bytes; + private List targetRequests = new ArrayList(); public Page() { @@ -228,6 +230,14 @@ public void setDownloadSuccess(boolean downloadSuccess) { this.downloadSuccess = downloadSuccess; } + public byte[] getBytes() { + return bytes; + } + + public void setBytes(byte[] bytes) { + this.bytes = bytes; + } + @Override public String toString() { return "Page{" + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index f29ccb32b..a41de900e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -45,6 +45,12 @@ public class Request implements Serializable { */ private long priority; + /** + * When it is set to TRUE, the downloader will not try to parse response body to text. + * + */ + private boolean binarayContent = false; + public Request() { } @@ -162,6 +168,14 @@ public void setRequestBody(HttpRequestBody requestBody) { this.requestBody = requestBody; } + public boolean isBinarayContent() { + return binarayContent; + } + + public void setBinarayContent(boolean binarayContent) { + this.binarayContent = binarayContent; + } + @Override public String toString() { return "Request{" + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 6d4442a6f..5d0b033e8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -108,9 +108,13 @@ public void setThread(int thread) { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = getResponseContent(charset, httpResponse); + byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); + String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); - page.setRawText(content); + page.setBytes(bytes); + if (!request.isBinarayContent()){ + page.setRawText(getResponseContent(charset, contentType, bytes)); + } page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); @@ -121,22 +125,21 @@ protected Page handleResponse(Request request, String charset, HttpResponse http return page; } - private String getResponseContent(String charset, HttpResponse httpResponse) throws IOException { + private String getResponseContent(String charset, String contentType, byte[] bytes) throws IOException { if (charset == null) { - byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); - String htmlCharset = getHtmlCharset(httpResponse, contentBytes); + String htmlCharset = getHtmlCharset(contentType, bytes); if (htmlCharset != null) { - return new String(contentBytes, htmlCharset); + return new String(bytes, htmlCharset); } else { logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); - return new String(contentBytes); + return new String(bytes); } } else { - return IOUtils.toString(httpResponse.getEntity().getContent(), charset); + return new String(bytes, charset); } } - private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { - return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(), contentBytes); + private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + return CharsetUtils.detectCharset(contentType, contentBytes); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 1c8efc57f..cbb7abc0a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -271,4 +271,22 @@ public void run() throws Exception { }); } + @Test + public void test_download_binary_content() throws Exception { + HttpServer server = httpServer(13423); + server.response("binary"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setBinarayContent(true); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isNull(); + assertThat(page.getBytes()).isEqualTo("binary".getBytes()); + } + }); + } + } From 5daf92e8b294fbf75169050b98db6a12a06acdf4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 22 Jul 2017 17:27:36 +0800 Subject: [PATCH 014/257] #610 CASE_INSENSITIVE for charset detect in Content-Type --- .../src/main/java/us/codecraft/webmagic/utils/UrlUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 686460662..87a6a5670 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -108,7 +108,7 @@ public static List convertToUrls(Collection requests) { return urlList; } - private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); + private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE); public static String getCharset(String contentType) { Matcher matcher = patternForCharset.matcher(contentType); From 3266ea15cab0218551bc7577ccc754f2feb30799 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 22 Jul 2017 17:40:43 +0800 Subject: [PATCH 015/257] #629 correct illegal url in HttpUriRequestConverter --- .../downloader/HttpUriRequestConverter.java | 2 +- .../us/codecraft/webmagic/utils/UrlUtils.java | 9 ++++-- .../HttpUriRequestConverterTest.java | 31 +++++++++++++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index af2fef46d..28a7ce5ea 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -58,7 +58,7 @@ private HttpClientContext convertHttpClientContext(Request request, Site site, P } private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) { - RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); + RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl())); if (site.getHeaders() != null) { for (Map.Entry headerEntry : site.getHeaders().entrySet()) { 
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 87a6a5670..c61483a39 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -43,7 +43,7 @@ public static String canonicalizeUrl(String url, String refer) { if (url.startsWith("?")) url = base.getPath() + url; URL abs = new URL(base, url); - return encodeIllegalCharacterInUrl(abs.toExternalForm()); + return abs.toExternalForm(); } catch (MalformedURLException e) { return ""; } @@ -53,12 +53,17 @@ public static String canonicalizeUrl(String url, String refer) { * * @param url url * @return new url + * @deprecated */ public static String encodeIllegalCharacterInUrl(String url) { - //TODO more charator support return url.replace(" ", "%20"); } + public static String fixIllegalCharacterInUrl(String url) { + //TODO more charator support + return url.replace(" ", "%20").replaceAll("#+", "#"); + } + public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java new file mode 100644 index 000000000..15902e86d --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Test; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.net.URI; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/7/22 + * Time: 下午5:29 + */ 
+public class HttpUriRequestConverterTest { + + @Test(expected = IllegalArgumentException.class) + public void test_illegal_uri() throws Exception { + HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + httpUriRequestConverter.convert(new Request("http://bj.zhongkao.com/beikao/yimo/##"), Site.me(), null); + } + + @Test + public void test_illegal_uri_correct() throws Exception { + HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + HttpClientRequestContext requestContext = httpUriRequestConverter.convert(new Request(UrlUtils.fixIllegalCharacterInUrl("http://bj.zhongkao.com/beikao/yimo/##")), Site.me(), null); + assertThat(requestContext.getHttpUriRequest().getURI()).isEqualTo(new URI("http://bj.zhongkao.com/beikao/yimo/#")); + } +} \ No newline at end of file From f375b9fdbae7db265b05a536720e2f591d1a8e9d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 25 Jul 2017 18:27:10 +0800 Subject: [PATCH 016/257] #629 fix ut --- .../webmagic/downloader/HttpUriRequestConverterTest.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java index 15902e86d..e7da1b9ab 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java @@ -16,12 +16,6 @@ */ public class HttpUriRequestConverterTest { - @Test(expected = IllegalArgumentException.class) - public void test_illegal_uri() throws Exception { - HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); - httpUriRequestConverter.convert(new Request("http://bj.zhongkao.com/beikao/yimo/##"), Site.me(), null); - } - @Test public void test_illegal_uri_correct() throws Exception { HttpUriRequestConverter 
httpUriRequestConverter = new HttpUriRequestConverter(); From 65049baca47b77ce77d65235ef8a5e566406191d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 29 Jul 2017 11:06:17 +0800 Subject: [PATCH 017/257] #571 fix spell mistake --- .../src/main/java/us/codecraft/webmagic/Page.java | 15 ++++++++++++++- .../main/java/us/codecraft/webmagic/Request.java | 10 +++++----- .../webmagic/downloader/HttpClientDownloader.java | 2 +- .../downloader/HttpClientDownloaderTest.java | 2 +- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 758e4c681..c11df693c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -8,6 +8,7 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -49,6 +50,8 @@ public class Page { private byte[] bytes; private List targetRequests = new ArrayList(); + + private String charset; public Page() { } @@ -238,6 +241,14 @@ public void setBytes(byte[] bytes) { this.bytes = bytes; } + public String getCharset() { + return charset; + } + + public void setCharset(String charset) { + this.charset = charset; + } + @Override public String toString() { return "Page{" + @@ -249,8 +260,10 @@ public String toString() { ", url=" + url + ", headers=" + headers + ", statusCode=" + statusCode + - ", success=" + downloadSuccess + + ", downloadSuccess=" + downloadSuccess + ", targetRequests=" + targetRequests + + ", charset='" + charset + '\'' + + ", bytes=" + Arrays.toString(bytes) + '}'; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index a41de900e..9d0b9ccf4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -49,7 +49,7 @@ public class Request implements Serializable { * When it is set to TRUE, the downloader will not try to parse response body to text. * */ - private boolean binarayContent = false; + private boolean binaryContent = false; public Request() { } @@ -168,12 +168,12 @@ public void setRequestBody(HttpRequestBody requestBody) { this.requestBody = requestBody; } - public boolean isBinarayContent() { - return binarayContent; + public boolean isBinaryContent() { + return binaryContent; } - public void setBinarayContent(boolean binarayContent) { - this.binarayContent = binarayContent; + public void setBinaryContent(boolean binaryContent) { + this.binaryContent = binaryContent; } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 5d0b033e8..13175fc44 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -112,7 +112,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinarayContent()){ + if (!request.isBinaryContent()){ page.setRawText(getResponseContent(charset, contentType, bytes)); } page.setUrl(new PlainText(request.getUrl())); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index cbb7abc0a..6a1c8319b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -280,7 +280,7 @@ public void test_download_binary_content() throws Exception { public void run() throws Exception { final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Request request = new Request(); - request.setBinarayContent(true); + request.setBinaryContent(true); request.setUrl("http://127.0.0.1:13423/"); Page page = httpClientDownloader.download(request, Site.me().toTask()); assertThat(page.getRawText()).isNull(); From 32f1f2cf44e76a3b1dea048dd37f5cedcfeae80b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 29 Jul 2017 11:16:09 +0800 Subject: [PATCH 018/257] #613 add charset to page --- .../downloader/HttpClientDownloader.java | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 13175fc44..4e19e7ccb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -113,7 +113,11 @@ protected Page handleResponse(Request request, String charset, HttpResponse http Page page = new 
Page(); page.setBytes(bytes); if (!request.isBinaryContent()){ - page.setRawText(getResponseContent(charset, contentType, bytes)); + if (charset == null) { + charset = getHtmlCharset(contentType, bytes); + } + page.setCharset(charset); + page.setRawText(new String(bytes, charset)); } page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); @@ -125,21 +129,12 @@ protected Page handleResponse(Request request, String charset, HttpResponse http return page; } - private String getResponseContent(String charset, String contentType, byte[] bytes) throws IOException { + private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { - String htmlCharset = getHtmlCharset(contentType, bytes); - if (htmlCharset != null) { - return new String(bytes, htmlCharset); - } else { - logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); - return new String(bytes); - } - } else { - return new String(bytes, charset); + charset = Charset.defaultCharset().name(); + logger.warn("Charset autodetect failed, use {} as charset. 
Please specify charset in Site.setCharset()", Charset.defaultCharset()); } - } - - private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { - return CharsetUtils.detectCharset(contentType, contentBytes); + return charset; } } From 6f5b9e448e022ad4e72bc5a1e60a2bb71d422a37 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 29 Jul 2017 11:27:56 +0800 Subject: [PATCH 019/257] #627 set charset to request --- .../java/us/codecraft/webmagic/Request.java | 10 ++++++ .../downloader/HttpClientDownloader.java | 2 +- .../downloader/HttpClientDownloaderTest.java | 33 +++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9d0b9ccf4..938f0e870 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -51,6 +51,8 @@ public class Request implements Serializable { */ private boolean binaryContent = false; + private String charset; + public Request() { } @@ -176,6 +178,14 @@ public void setBinaryContent(boolean binaryContent) { this.binaryContent = binaryContent; } + public String getCharset() { + return charset; + } + + public void setCharset(String charset) { + this.charset = charset; + } + @Override public String toString() { return "Request{" + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 4e19e7ccb..fff7c7cfa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -83,7 +83,7 @@ public Page download(Request request, Task task) { Page page = Page.fail(); try { httpResponse = 
httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); - page = handleResponse(request, task.getSite().getCharset(), httpResponse, task); + page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(request); logger.info("downloading page success {}", request.getUrl()); return page; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 6a1c8319b..04a45a020 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -289,4 +289,37 @@ public void run() throws Exception { }); } + @Test + public void test_download_set_charset() throws Exception { + HttpServer server = httpServer(13423); + server.response(header("Content-Type","text/html; charset=utf-8")).response("hello world!"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getCharset()).isEqualTo("utf-8"); + } + }); + } + + @Test + public void test_download_set_request_charset() throws Exception { + HttpServer server = httpServer(13423); + server.response("hello world!"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setCharset("utf-8"); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, 
Site.me().setCharset("gbk").toTask()); + assertThat(page.getCharset()).isEqualTo("utf-8"); + } + }); + } + } From a7e309071156c99a17e0b681aa0141f8fc4ff88c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E4=BE=83?= Date: Sat, 29 Jul 2017 19:07:40 +0800 Subject: [PATCH 020/257] =?UTF-8?q?setBinaryContent,=20setCharset=20?= =?UTF-8?q?=E6=96=B9=E6=B3=95=E8=BF=94=E5=9B=9Ethis,=20=E6=96=B9=E4=BE=BF?= =?UTF-8?q?=E9=93=BE=E5=BC=8F=E8=B0=83=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/Request.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 938f0e870..eefd91bb5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -174,16 +174,18 @@ public boolean isBinaryContent() { return binaryContent; } - public void setBinaryContent(boolean binaryContent) { + public Request setBinaryContent(boolean binaryContent) { this.binaryContent = binaryContent; + return this; } public String getCharset() { return charset; } - public void setCharset(String charset) { + public Request setCharset(String charset) { this.charset = charset; + return this; } @Override From f3847b7cd4cac66ae05d318a821ba82217432928 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 30 Jul 2017 15:27:41 +0800 Subject: [PATCH 021/257] #upgrade jsonpath to 2.4.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e4b2c841c..866b26a39 100644 --- a/pom.xml +++ b/pom.xml @@ -83,7 +83,7 @@ com.jayway.jsonpath json-path - 2.2.0 + 2.4.0 org.slf4j From e276b11ea9be727bdf4babb1f1b78155ef295377 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 30 Jul 2017 15:29:22 +0800 Subject: [PATCH 022/257] vesion 0.7.3 --- 
README-zh.md | 4 ++-- README.md | 4 ++-- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README-zh.md b/README-zh.md index 88fa0d2e7..cd1b090c7 100644 --- a/README-zh.md +++ b/README-zh.md @@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.2 + 0.7.3 us.codecraft webmagic-extension - 0.7.2 + 0.7.3 ``` diff --git a/README.md b/README.md index f2a1115ff..73cb48833 100644 --- a/README.md +++ b/README.md @@ -23,12 +23,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.2 + 0.7.3 us.codecraft webmagic-extension - 0.7.2 + 0.7.3 ``` diff --git a/pom.xml b/pom.xml index 866b26a39..0765ae132 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.7.2 + 0.7.3 4.0.0 pom @@ -233,7 +233,7 @@ 2.10.4 UTF-8 - WebMagic 0.7.2 + WebMagic 0.7.3 en_US diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index dac9bb994..e889cd491 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.2 + 0.7.3 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 19f1ad792..7e949ca6f 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.2 + 0.7.3 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 4bb6ae7b7..072bb3fd5 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.2 + 0.7.3 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 2e467829f..95f706ed5 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.2 + 0.7.3 4.0.0 diff --git a/webmagic-scripts/pom.xml 
b/webmagic-scripts/pom.xml index 821db623e..22956cb55 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.2 + 0.7.3 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 290fa7ac9..1cbf59216 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.2 + 0.7.3 4.0.0 From a67f60b01b54f6789cba4d227d9292dda5b25ac0 Mon Sep 17 00:00:00 2001 From: Yao Date: Wed, 30 Aug 2017 18:42:00 +0800 Subject: [PATCH 023/257] fix the typo --- README-zh.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README-zh.md b/README-zh.md index cd1b090c7..65d5d1729 100644 --- a/README-zh.md +++ b/README-zh.md @@ -93,7 +93,7 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: ```java -public class OschinaBlogPageProcesser implements PageProcessor { +public class OschinaBlogPageProcessor implements PageProcessor { private Site site = Site.me().setDomain("my.oschina.net"); @@ -113,7 +113,7 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog") .addPipeline(new ConsolePipeline()).run(); } } From b539522ca8431d804b95b2ced414e5b43415e9f6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 29 Nov 2017 13:36:19 +0800 Subject: [PATCH 024/257] #701 support to tls1.2 --- pom.xml | 5 ++++ .../downloader/HttpClientGenerator.java | 5 +++- .../downloader/HttpClientDownloaderTest.java | 1 + .../downloader/SSLCompatibilityTest.java | 26 +++++++++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java diff --git 
a/pom.xml b/pom.xml index 0765ae132..84ce1152f 100644 --- a/pom.xml +++ b/pom.xml @@ -75,6 +75,11 @@ httpclient 4.5.2 + + org.apache.httpcomponents + httpcore + 4.5.2 + com.google.guava guava diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 562f36f6f..28a16f41d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -9,6 +9,7 @@ import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; +import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; @@ -49,7 +50,9 @@ public HttpClientGenerator() { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书 + return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, + null, + new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 04a45a020..ece060003 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -322,4 +322,5 @@ public void run() throws Exception { }); } + } diff 
--git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java new file mode 100644 index 000000000..861b315a6 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/11/29 + * Time: 下午1:32 + */ +public class SSLCompatibilityTest { + + @Test + public void test_tls12() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Task task = Site.me().setCycleRetryTimes(5).toTask(); + Request request = new Request("https://juejin.im/"); + Page page = httpClientDownloader.download(request, task); + assertThat(page.isDownloadSuccess()).isTrue(); + } +} From e5db538c19188902592ea2f702e0860fc3eba600 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 29 Nov 2017 13:49:40 +0800 Subject: [PATCH 025/257] #647 remove ThreadSafe annotation --- .../us/codecraft/webmagic/downloader/HttpClientDownloader.java | 2 -- .../main/java/us/codecraft/webmagic/pipeline/FilePipeline.java | 3 --- .../us/codecraft/webmagic/scheduler/PriorityScheduler.java | 2 -- .../java/us/codecraft/webmagic/scheduler/QueueScheduler.java | 2 -- .../us/codecraft/webmagic/downloader/PhantomJSDownloader.java | 2 -- 5 files changed, 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fff7c7cfa..24889c88b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java 
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -2,7 +2,6 @@ import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; -import org.apache.http.annotation.ThreadSafe; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; @@ -30,7 +29,6 @@ * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 57d6eea3f..be9fd7cc2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,10 +1,8 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; @@ -21,7 +19,6 @@ * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index 8fa1b9ea2..14cbaff32 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scheduler; -import org.apache.http.annotation.ThreadSafe; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.NumberUtils; @@ -16,7 +15,6 @@ * @author code4crafter@gmail.com
* @since 0.2.1 */ -@ThreadSafe public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { public static final int INITIAL_CAPACITY = 5; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 078506c6f..f9ad0e98f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scheduler; -import org.apache.http.annotation.ThreadSafe; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; @@ -15,7 +14,6 @@ * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { private BlockingQueue queue = new LinkedBlockingQueue(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 0fda351b9..6055bdb0f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; @@ -16,7 +15,6 @@ * @author dolphineor@gmail.com * @version 0.5.3 */ -@ThreadSafe public class PhantomJSDownloader extends AbstractDownloader { private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); From 266083fa074819232a02d359566be81ff687da87 Mon Sep 17 00:00:00 2001 From: yihy <2100087178@qq.com> Date: Wed, 29 Nov 2017 20:19:00 +0800 Subject: [PATCH 026/257] =?UTF-8?q?[Fix]=20#698=C2=A0=20Repair=20using=20r?= =?UTF-8?q?edis,Request=20additional=20information=20is=20lost?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/scheduler/RedisScheduler.java | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index ce1111f24..1e94971f3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -2,6 +2,7 @@ import com.alibaba.fastjson.JSON; import 
org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -60,7 +61,7 @@ protected void pushWhenNoDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (request.getExtras() != null) { + if (CheckForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); @@ -70,6 +71,33 @@ protected void pushWhenNoDuplicate(Request request, Task task) { } } + private boolean CheckForAdditionalInfo(Request request) { + if (request == null) { + return false; + } + + if (!request.getHeaders().isEmpty() || !request.getCookies().isEmpty()) { + return true; + } + + if (StringUtils.isNotBlank(request.getCharset()) || StringUtils.isNotBlank(request.getMethod())) { + return true; + } + + if (request.isBinaryContent() || request.getRequestBody() != null) { + return true; + } + + if (request.getExtras() != null && !request.getExtras().isEmpty()) { + return true; + } + if (request.getPriority() != 0L) { + return true; + } + + return false; + } + @Override public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); @@ -85,7 +113,7 @@ public synchronized Request poll(Task task) { Request o = JSON.parseObject(new String(bytes), Request.class); return o; } - Request request = new Request(url); + Request request = new Request(url); return request; } finally { pool.returnResource(jedis); @@ -100,8 +128,7 @@ protected String getQueueKey(Task task) { return QUEUE_PREFIX + task.getUUID(); } - protected String getItemKey(Task task) - { + protected String getItemKey(Task task) { return ITEM_PREFIX + task.getUUID(); } From adf545483797392333135a3d7900b31b71110d9d Mon Sep 17 00:00:00 2001 From: yihy <2100087178@qq.com> Date: Thu, 30 Nov 
2017 11:35:12 +0800 Subject: [PATCH 027/257] =?UTF-8?q?[Fix]=20=E4=BF=AE=E6=AD=A3=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E6=96=B9=E6=B3=95=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/scheduler/RedisScheduler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 1e94971f3..ee04f35cf 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -61,7 +61,7 @@ protected void pushWhenNoDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (CheckForAdditionalInfo(request)) { + if (checkForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); @@ -71,7 +71,7 @@ protected void pushWhenNoDuplicate(Request request, Task task) { } } - private boolean CheckForAdditionalInfo(Request request) { + private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } From c701fe8d38c8060e97df3efab64ae4a0d94c0245 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 30 Nov 2017 11:50:52 +0800 Subject: [PATCH 028/257] #702 Refactor: rename CheckForAdditionalInfo to checkForAdditionalInfo --- .../us/codecraft/webmagic/scheduler/RedisScheduler.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 1e94971f3..c70d88507 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -61,17 +61,17 @@ protected void pushWhenNoDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (CheckForAdditionalInfo(request)) { + if (checkForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } } finally { - pool.returnResource(jedis); + jedis.close(); } } - private boolean CheckForAdditionalInfo(Request request) { + private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } From c7d1ed7d201515fbf479dcb62c612711af56070a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 2 Dec 2017 10:50:49 +0800 Subject: [PATCH 029/257] #fix httpcore version: change to 4.4.4 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 84ce1152f..2b2384fd8 100644 --- a/pom.xml +++ b/pom.xml @@ -78,7 +78,7 @@ org.apache.httpcomponents httpcore - 4.5.2 + 4.4.4 com.google.guava From be892b80bf6682cd063d30ac25a79be0c079a901 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 2 Dec 2017 10:57:06 +0800 Subject: [PATCH 030/257] update travis ci to openjdk --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a9f233f37..9e6f78d38 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,3 @@ language: java jdk: - - oraclejdk7 + - openjdk7 From b74264be6981e72b6785fc7fc02b5e22feabe7a2 Mon Sep 17 00:00:00 2001 From: singinginwind Date: Wed, 28 Mar 2018 17:08:26 +0800 Subject: [PATCH 031/257] =?UTF-8?q?:=20=E6=98=AF=E9=9D=9E=E6=B3=95?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=EF=BC=8C=E6=97=A0=E6=B3=95=E4=BD=9C=E4=B8=BA?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=90=8D?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit : 是非法字符,无法作为文件名 --- .../java/us/codecraft/webmagic/model/samples/GithubRepo.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java index e8998eca0..422dffd35 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java @@ -48,7 +48,7 @@ public static void main(String[] args) { @Override public String key() { - return author+":"+name; + return author+"_"+name; } public String getName() { From f1b3a29d6ff09efefef343c7f3d697e51a3eeea5 Mon Sep 17 00:00:00 2001 From: snyk-test Date: Fri, 28 Jun 2019 01:31:36 +0000 Subject: [PATCH 032/257] fix: webmagic-selenium/pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGAPACHEHTTPCOMPONENTS-31517 --- webmagic-selenium/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 1cbf59216..e88cce5c9 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -13,7 +13,7 @@ org.seleniumhq.selenium selenium-java - 2.41.0 + 3.0.0 us.codecraft From 2fd0e192fdef9316d93d101b4fc333ac9ae13fd2 Mon Sep 17 00:00:00 2001 From: Thomas Perkins Date: Mon, 29 Jul 2019 15:58:52 +0100 Subject: [PATCH 033/257] Add unit tests for us.codecraft.webmagic.utils.NumberUtils These tests were written using Diffblue Cover. 
--- .../webmagic/utils/NumberUtilsTest.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java new file mode 100644 index 000000000..f9e725e29 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Assert; +import org.junit.Test; + +public class NumberUtilsTest { + + @Test + public void testCompareLong() { + Assert.assertEquals(0, NumberUtils.compareLong(0L, 0L)); + Assert.assertEquals(1, NumberUtils.compareLong(9L, 0L)); + Assert.assertEquals(-1, NumberUtils.compareLong(0L, 9L)); + Assert.assertEquals(-1, NumberUtils.compareLong(-9L, 0L)); + Assert.assertEquals(1, NumberUtils.compareLong(0L, -9L)); + } +} From 0b8fab1bfa87429e2be6af6375e21be953bbd713 Mon Sep 17 00:00:00 2001 From: snyk-test Date: Mon, 2 Sep 2019 02:58:48 +0000 Subject: [PATCH 034/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-174736 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-32043 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-32044 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-32111 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-450207 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-450917 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-455617 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72445 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72446 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72447 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72882 - 
https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72883 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72884 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-30433 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-31515 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 7fd58a94a8c3f566a78afddc06124609e916b6a2 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 3 Oct 2019 02:59:35 +0000 Subject: [PATCH 035/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-469674 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-469676 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From fd6037af26c71a3b30d82382fcf680897ac6acc3 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 8 Oct 2019 02:58:45 +0000 Subject: [PATCH 036/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-471943 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 09fb39f431e84eb4f87aaaef7abe506659ed8231 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 15 Oct 2019 02:58:57 +0000 Subject: [PATCH 037/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-472980 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml 
index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 8815934f88a49d8a96b82b9e6918d0f02cf9c7ff Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 4 Jan 2020 02:58:56 +0000 Subject: [PATCH 038/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-540500 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 8335d18c79decf58f41fe94b418dc720d9886f91 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Wed, 12 Feb 2020 02:58:51 +0000 Subject: [PATCH 039/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-548451 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From cc2d7af70a5a8adba06071494c1f9d87df7853ca Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 3 Mar 2020 02:59:56 +0000 Subject: [PATCH 040/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-559094 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From c79cd1dfa07304dee4ac299fc131c7c888024315 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sun, 22 Mar 2020 02:58:48 +0000 Subject: [PATCH 041/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - 
https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-560762 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-560766 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 95da41f08b1c68e2c0ff04bd0d770b35db278e6c Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 28 Mar 2020 02:58:44 +0000 Subject: [PATCH 042/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-560762 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-560766 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561362 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561373 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 38eedabef99b7c5e639f3fac6c408fa59ee8b091 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Wed, 1 Apr 2020 02:58:50 +0000 Subject: [PATCH 043/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561585 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561586 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561587 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 8327e482c9baa1038fcd89bcbe60d7ec625b7303 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 9 Apr 2020 02:58:45 +0000 Subject: [PATCH 044/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: 
- https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-564887 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-564888 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..a93ebb3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From fe3d52e2a439d76025c26d258bd7f74472947548 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 11 Apr 2020 18:00:04 +0800 Subject: [PATCH 045/257] Add TLSv1.3 support. --- .../us/codecraft/webmagic/downloader/HttpClientGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 28a16f41d..6409f568e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -50,7 +50,7 @@ public HttpClientGenerator() { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, + return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { From b98a87e45a2cc51f75a386f3939b01679a5fd347 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 11 Apr 2020 20:21:20 +0800 Subject: [PATCH 046/257] Serialize requests in FileCacheQueueScheduler, so that the extra info of request could be restored. 
--- .../scheduler/FileCacheQueueScheduler.java | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 6ca982853..37310e6f0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.scheduler; +import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.math.NumberUtils; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; @@ -141,7 +143,7 @@ private void readUrlFile() throws IOException { urls.add(line.trim()); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(new Request(line)); + queue.add(deserializeRequest(line)); } } } finally { @@ -183,7 +185,7 @@ protected void pushWhenNoDuplicate(Request request, Task task) { init(task); } queue.add(request); - fileUrlWriter.println(request.getUrl()); + fileUrlWriter.println(serializeRequest(request)); } @Override @@ -204,4 +206,22 @@ public int getLeftRequestsCount(Task task) { public int getTotalRequestsCount(Task task) { return getDuplicateRemover().getTotalRequestsCount(task); } + + protected String serializeRequest(Request request) { + String line = String.format("%1$s\t%2$s", request.getUrl(), + Base64.encodeBase64String(SerializationUtils.serialize(request))); + return line; + } + + protected Request deserializeRequest(String line) { + Request request; + String[] sections = line.split("\t"); + if (sections.length >= 2) { + request = (Request) SerializationUtils.deserialize(Base64.decodeBase64(sections[1])); + } else { + request = new Request(sections[0]); + } + 
return request; + } + } From c46400d126998dbe043d1495c839d52409941c94 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 12 Apr 2020 01:30:57 +0800 Subject: [PATCH 047/257] Fix javadoc of sleep time. Fixes #918. --- webmagic-core/src/main/java/us/codecraft/webmagic/Site.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index b6963ca43..72cc7d058 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -203,7 +203,7 @@ public Set getAcceptStatCode() { /** * Set the interval between the processing of two pages.
- * Time unit is micro seconds.
+ * Time unit is milliseconds.
* * @param sleepTime sleepTime * @return this @@ -215,7 +215,7 @@ public Site setSleepTime(int sleepTime) { /** * Get the interval between the processing of two pages.
- * Time unit is micro seconds.
+ * Time unit is milliseconds.
* * @return the interval between the processing of two pages, */ From e7476cb8dc67439159f7ffbf85d4e56f87810eea Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 19 Apr 2020 22:44:06 +0800 Subject: [PATCH 048/257] Make Request#getExtra be generic. --- .../src/main/java/us/codecraft/webmagic/Request.java | 7 ++++--- .../src/main/java/us/codecraft/webmagic/ResultItems.java | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index eefd91bb5..5c26d20dc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -78,14 +78,15 @@ public Request setPriority(long priority) { return this; } - public Object getExtra(String key) { + @SuppressWarnings("unchecked") + public T getExtra(String key) { if (extras == null) { return null; } - return extras.get(key); + return (T) extras.get(key); } - public Request putExtra(String key, Object value) { + public Request putExtra(String key, T value) { if (extras == null) { extras = new HashMap(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 7b543613d..488c81e77 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; @@ -21,6 +20,7 @@ public class ResultItems { private boolean skip; + @SuppressWarnings("unchecked") public T get(String key) { Object o = fields.get(key); if (o == null) { From 30667f468705c61d78a91288046ca317dd9f94a8 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 21 May 2020 18:51:37 +0800 Subject: [PATCH 049/257] Remove oss-parent setting as it is no 
longer active. --- pom.xml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pom.xml b/pom.xml index 2b2384fd8..b44377084 100644 --- a/pom.xml +++ b/pom.xml @@ -1,10 +1,5 @@ - - org.sonatype.oss - oss-parent - 7 - us.codecraft 0.7.3 4.0.0 From ba1b4017a7399d7fa073b309d5376e5f3463214d Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 21 May 2020 19:59:29 +0800 Subject: [PATCH 050/257] Mark slf4j-log4j12 as optional. --- pom.xml | 2 +- webmagic-core/pom.xml | 1 + .../codecraft/webmagic/example/PatternProcessorExample.java | 6 ++++-- .../java/us/codecraft/webmagic/selector/Xpath2Selector.java | 5 +++-- webmagic-scripts/pom.xml | 4 ++++ .../webmagic/downloader/selenium/SeleniumDownloader.java | 6 ++++-- .../webmagic/downloader/selenium/WebDriverPool.java | 5 +++-- 7 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index b44377084..161d62f83 100644 --- a/pom.xml +++ b/pom.xml @@ -34,7 +34,7 @@ scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git webmagic-parent-0.6.1 - + Apache License, Version 2.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index e889cd491..66e455d34 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -48,6 +48,7 @@ org.slf4j slf4j-log4j12 + true diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java index 8ecb08fe6..9406abfd2 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.example; -import org.apache.log4j.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.*; import us.codecraft.webmagic.handler.CompositePageProcessor; import 
us.codecraft.webmagic.handler.CompositePipeline; @@ -15,7 +17,7 @@ */ public class PatternProcessorExample { - private static Logger log = Logger.getLogger(PatternProcessorExample.class); + private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class); public static void main(String... args) { diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 98b1efe4b..d8aab6cce 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -2,11 +2,12 @@ import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; -import org.apache.log4j.Logger; import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.DomSerializer; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @@ -40,7 +41,7 @@ public class Xpath2Selector implements Selector { private XPathExpression xPathExpression; - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 22956cb55..9dbc7b393 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -49,6 +49,10 @@ webmagic-core ${project.version} + + org.slf4j + slf4j-log4j12 + us.codecraft webmagic-extension diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index f45f7e2a8..cce293fc9 100644 --- 
a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.downloader.selenium; -import org.apache.log4j.Logger; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -29,7 +31,7 @@ public class SeleniumDownloader implements Downloader, Closeable { private volatile WebDriverPool webDriverPool; - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private int sleepTime = 0; diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index 1472cb32c..e1d9dd039 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.downloader.selenium; -import org.apache.log4j.Logger; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxDriver; @@ -8,6 +7,8 @@ import org.openqa.selenium.phantomjs.PhantomJSDriverService; import org.openqa.selenium.remote.DesiredCapabilities; import org.openqa.selenium.remote.RemoteWebDriver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileReader; import java.io.IOException; @@ -27,7 +28,7 @@ * Time: 下午1:41
*/ class WebDriverPool { - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private final static int DEFAULT_CAPACITY = 5; From 436af973465797f3a2e865cce4bf6d1b0701362e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 13:59:05 +0800 Subject: [PATCH 051/257] Use spaces as indent. --- .../downloader/HttpClientGenerator.java | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 6409f568e..9c3891656 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -34,9 +34,9 @@ * @since 0.4.0 */ public class HttpClientGenerator { - + private transient Logger logger = LoggerFactory.getLogger(getClass()); - + private PoolingHttpClientConnectionManager connectionManager; public HttpClientGenerator() { @@ -48,43 +48,43 @@ public HttpClientGenerator() { connectionManager.setDefaultMaxPerRoute(100); } - private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { - try { + private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { + try { return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { + } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } - return SSLConnectionSocketFactory.getSocketFactory(); + return SSLConnectionSocketFactory.getSocketFactory(); } - private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { - // 
实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 - X509TrustManager trustManager = new X509TrustManager() { - - @Override - public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { - } - - @Override - public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - - }; - - SSLContext sc = SSLContext.getInstance("SSLv3"); - sc.init(null, new TrustManager[] { trustManager }, null); - return sc; + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { + // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + X509TrustManager trustManager = new X509TrustManager() { + + @Override + public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { + } + + @Override + public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { + } + + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + + }; + + SSLContext sc = SSLContext.getInstance("SSLv3"); + sc.init(null, new TrustManager[] { trustManager }, null); + return sc; } - + public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); return this; @@ -96,7 +96,7 @@ public CloseableHttpClient getClient(Site site) { private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); - + httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); From 3e425231414d41bd2d7ce1500d3e36fb610754fb Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 14:14:16 +0800 Subject: [PATCH 052/257] TLSv1.3 requires Java 11 at least. Fixes #927. 
--- pom.xml | 2 +- .../downloader/HttpClientGenerator.java | 40 +++++++++++++------ 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/pom.xml b/pom.xml index 161d62f83..eca252349 100644 --- a/pom.xml +++ b/pom.xml @@ -131,7 +131,7 @@ org.apache.commons commons-lang3 - 3.1 + 3.10 commons-collections diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 9c3891656..d932de948 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,5 +1,18 @@ package us.codecraft.webmagic.downloader; +import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.Map; + +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; + +import org.apache.commons.lang3.JavaVersion; +import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; @@ -11,23 +24,18 @@ import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.impl.client.*; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import 
org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Site; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; -import java.io.IOException; -import java.security.KeyManagementException; -import java.security.NoSuchAlgorithmException; -import java.security.cert.CertificateException; -import java.security.cert.X509Certificate; -import java.util.Map; +import us.codecraft.webmagic.Site; /** * @author code4crafter@gmail.com
@@ -50,7 +58,15 @@ public HttpClientGenerator() { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}, + SSLContext sslContext = createIgnoreVerifySSL(); + String[] supportedProtocols; + if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) { + supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" }; + } else { + supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; + } + logger.info("supportedProtocols: {}", String.join(", ", supportedProtocols)); + return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { From 3510e74d3f024a30aaf7355be7eb3b035c53fc3e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 14:18:53 +0800 Subject: [PATCH 053/257] Travis supports openjdk relase number is 9 to 15 now. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9e6f78d38..8f79da0cb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,3 @@ language: java jdk: - - openjdk7 + - openjdk9 From 4078766d0e0edd510ff5f7071772e9de96420ee3 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 14:30:14 +0800 Subject: [PATCH 054/257] Change log level of supportedProtocols. 
--- .../us/codecraft/webmagic/downloader/HttpClientGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index d932de948..ee94581ad 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -65,7 +65,7 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { } else { supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; } - logger.info("supportedProtocols: {}", String.join(", ", supportedProtocols)); + logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols)); return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 From ab5ac9d7969ec73354700670741e02098b23a597 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 19:05:17 +0800 Subject: [PATCH 055/257] Fix test failure and javadoc failure. 
--- pom.xml | 19 +++++++++++-------- .../webmagic/selector/RegexSelector.java | 2 +- webmagic-scripts/pom.xml | 16 ---------------- 3 files changed, 12 insertions(+), 25 deletions(-) diff --git a/pom.xml b/pom.xml index eca252349..06a32130b 100644 --- a/pom.xml +++ b/pom.xml @@ -7,8 +7,8 @@ UTF-8 UTF-8 + 1.8 4.0.0.RELEASE - webmagic-parent webmagic-parent @@ -103,7 +103,7 @@ com.alibaba fastjson - 1.2.28 + 1.2.56 com.github.dreamhead @@ -162,7 +162,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.18 + 3.0.0-M4 0 @@ -170,11 +170,10 @@ org.apache.maven.plugins maven-compiler-plugin - 3.1 + 3.7.0 - 1.6 - 1.6 - UTF-8 + ${java.version} + ${java.version} @@ -230,11 +229,15 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.4 + 3.2.0 UTF-8 WebMagic 0.7.3 en_US + + + false + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 9ae538c0f..fb0a161d2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -41,7 +41,7 @@ private void compileRegex(String regexStr) { /** * Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1. - * @param regexStr + * @param regexStr the regular expression. 
*/ public RegexSelector(String regexStr) { this.compileRegex(regexStr); diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 9dbc7b393..94f08f02a 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -7,7 +7,6 @@ 4.0.0 - us.codecraft webmagic-scripts 1.1.2-2 @@ -63,21 +62,6 @@ ${project.basedir}/src/main/java - - maven-compiler-plugin - - 1.6 - 1.6 - UTF-8 - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - org.apache.maven.plugins maven-jar-plugin From fe9dca12477ca86588de79522aa3ed1a9e26f9dd Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 29 May 2020 14:28:34 +0800 Subject: [PATCH 056/257] Upgrade guava from 15.0 to 29.0. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index aefa8b07b..d6e226ee8 100644 --- a/pom.xml +++ b/pom.xml @@ -78,7 +78,7 @@ com.google.guava guava - 15.0 + 29.0-jre com.jayway.jsonpath From 98281ab26e67f4309a4a0b7b4b41ca1a66e74de7 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 29 May 2020 14:43:59 +0800 Subject: [PATCH 057/257] Upgrade httpclient from 4.5.2 to 4.5.12. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d6e226ee8..87c47ff52 100644 --- a/pom.xml +++ b/pom.xml @@ -68,7 +68,7 @@ org.apache.httpcomponents httpclient - 4.5.2 + 4.5.12 org.apache.httpcomponents From a7c4e701e417ba54d8df25912b48c69f290dfdab Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 29 May 2020 18:12:05 +0800 Subject: [PATCH 058/257] Specify the required minimum maven version. 
--- pom.xml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pom.xml b/pom.xml index 87c47ff52..ee086c85b 100644 --- a/pom.xml +++ b/pom.xml @@ -159,6 +159,26 @@ + + org.apache.maven.plugins + maven-enforcer-plugin + 3.0.0-M3 + + + enforce-maven + + enforce + + + + + 3.0.5 + + + + + + org.apache.maven.plugins maven-surefire-plugin From 6c05dd8b725982eb78d8f0042c2ca2aa04f70cf8 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 29 May 2020 18:19:45 +0800 Subject: [PATCH 059/257] Upgrade maven plugins. --- pom.xml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index ee086c85b..2186b742d 100644 --- a/pom.xml +++ b/pom.xml @@ -190,7 +190,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.7.0 + 3.8.1 ${java.version} ${java.version} @@ -219,10 +219,7 @@ org.apache.maven.plugins maven-resources-plugin - 2.6 - - UTF-8 - + 3.1.0 org.apache.maven.plugins @@ -236,7 +233,7 @@ org.apache.maven.plugins maven-source-plugin - 2.2.1 + 3.2.1 attach-sources @@ -278,7 +275,7 @@ org.apache.maven.plugins maven-release-plugin - 2.4.1 + 3.0.0-M1 @@ -333,7 +330,7 @@ org.sonatype.plugins nexus-staging-maven-plugin - 1.6 + 1.6.8 true sonatype-nexus-staging From 71aa04c89f27f6a74ca981f19b3f9cc38a7c29d5 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 30 May 2020 02:10:01 +0800 Subject: [PATCH 060/257] Upgrade dependencies. 
--- pom.xml | 61 +++++++++++++++++++++++++++++++------- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 6 ++-- webmagic-samples/pom.xml | 4 +-- webmagic-saxon/pom.xml | 4 +-- webmagic-scripts/pom.xml | 12 +++----- webmagic-selenium/pom.xml | 7 +---- 7 files changed, 61 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index 2186b742d..1a5853ad0 100644 --- a/pom.xml +++ b/pom.xml @@ -56,7 +56,7 @@ junit junit - 4.11 + 4.13 test @@ -73,7 +73,7 @@ org.apache.httpcomponents httpcore - 4.4.4 + 4.4.13 com.google.guava @@ -88,12 +88,12 @@ org.slf4j slf4j-api - 1.7.6 + 1.7.30 org.slf4j slf4j-log4j12 - 1.7.6 + 1.7.30 us.codecraft @@ -103,12 +103,12 @@ com.alibaba fastjson - 1.2.56 + 1.2.68 com.github.dreamhead moco-core - 1.0.0 + 1.1.0 test @@ -125,7 +125,7 @@ org.assertj assertj-core - 1.5.0 + 3.16.1 test @@ -143,16 +143,55 @@ commons-io 1.3.2 + + org.codehaus.groovy + groovy-all + 2.4.19 + + + org.jruby + jruby + 9.2.11.1 + org.jsoup jsoup 1.10.3 - org.mockito - mockito-all - 1.9.5 - test + org.python + jython + 2.7.2 + + + org.seleniumhq.selenium + selenium-java + 3.141.59 + + + net.sf.saxon + Saxon-HE + 10.1 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.5 + + + com.github.detro + phantomjsdriver + 1.2.0 + + + commons-cli + commons-cli + 1.4 + + + redis.clients + jedis + 2.9.3 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 66e455d34..4bc074da4 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -83,4 +83,4 @@ -
\ No newline at end of file + diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 7e949ca6f..bf7ff05d6 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -13,16 +13,14 @@ redis.clients jedis - 2.9.0 com.google.guava guava - 15.0 true - us.codecraft + ${project.groupId} webmagic-core ${project.version} @@ -32,4 +30,4 @@
- \ No newline at end of file + diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 072bb3fd5..44fee7c0d 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -11,12 +11,12 @@ - us.codecraft + ${project.groupId} webmagic-core ${project.version} - us.codecraft + ${project.groupId} webmagic-extension ${project.version} diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 95f706ed5..ae1454b56 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -11,19 +11,17 @@ - us.codecraft + ${project.groupId} webmagic-core ${project.version} net.sourceforge.htmlcleaner htmlcleaner - 2.5 net.sf.saxon Saxon-HE - 9.5.1-1 junit diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 94f08f02a..9f4219d6c 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -16,27 +16,23 @@ org.jruby jruby - 1.7.6 org.jetbrains.kotlin kotlin-stdlib ${kotlin.version} - org.codehaus.groovy groovy-all - 2.1.6 - org.python + + org.python jython - 2.5.3 commons-cli commons-cli - 1.2 junit @@ -44,7 +40,7 @@ test - us.codecraft + ${project.groupId} webmagic-core ${project.version} @@ -53,7 +49,7 @@ slf4j-log4j12 - us.codecraft + ${project.groupId} webmagic-extension ${project.version} diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index e88cce5c9..b5d096954 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -13,21 +13,16 @@ org.seleniumhq.selenium selenium-java - 3.0.0 - us.codecraft + ${project.groupId} webmagic-core ${project.version} com.github.detro phantomjsdriver - 1.2.0 - - - junit junit From 5d14efc50f5b81281819036c65bd7e81b04e10b0 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 14 Jun 2020 00:20:39 +0800 Subject: [PATCH 061/257] Serialize request URL only in FileCacheQueueScheduler. 
--- .../scheduler/FileCacheQueueScheduler.java | 37 +++++++++---------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 37310e6f0..fec3c1db9 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,14 +1,13 @@ package us.codecraft.webmagic.scheduler; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.SerializationUtils; -import org.apache.commons.lang3.math.NumberUtils; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.scheduler.component.DuplicateRemover; - -import java.io.*; +import java.io.BufferedReader; +import java.io.Closeable; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; import java.util.LinkedHashSet; import java.util.Set; import java.util.concurrent.BlockingQueue; @@ -19,6 +18,13 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + /** * Store urls and cursor in files so that a Spider can resume the status when shutdown.
@@ -208,20 +214,11 @@ public int getTotalRequestsCount(Task task) { } protected String serializeRequest(Request request) { - String line = String.format("%1$s\t%2$s", request.getUrl(), - Base64.encodeBase64String(SerializationUtils.serialize(request))); - return line; + return request.getUrl(); } protected Request deserializeRequest(String line) { - Request request; - String[] sections = line.split("\t"); - if (sections.length >= 2) { - request = (Request) SerializationUtils.deserialize(Base64.decodeBase64(sections[1])); - } else { - request = new Request(sections[0]); - } - return request; + return new Request(line); } } From 2413366adb6df0f27b5f806f0228d5a41fb90935 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 15 Jun 2020 20:01:14 +0800 Subject: [PATCH 062/257] Format code, no actual code changed. --- .../us/codecraft/webmagic/proxy/Proxy.java | 136 +++++++++--------- .../webmagic/proxy/ProxyProvider.java | 2 +- .../webmagic/proxy/SimpleProxyProvider.java | 1 + 3 files changed, 70 insertions(+), 69 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index c5f100732..4b49557b5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -1,73 +1,73 @@ package us.codecraft.webmagic.proxy; -/** - * - */ - public class Proxy { - private String host; - private int port; - private String username; - private String password; - - public Proxy(String host, int port) { - this.host = host; - this.port = port; - } - - public Proxy(String host, int port, String username, String password) { - this.host = host; - this.port = port; - this.username = username; - this.password = password; - } - - public String getHost() { - return host; - } - - public int getPort() { - return port; - } - - public String getUsername() { - return username; - } - - public String getPassword() 
{ - return password; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Proxy proxy = (Proxy) o; - - if (port != proxy.port) return false; - if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; - if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; - return password != null ? password.equals(proxy.password) : proxy.password == null; - } - - @Override - public int hashCode() { - int result = host != null ? host.hashCode() : 0; - result = 31 * result + port; - result = 31 * result + (username != null ? username.hashCode() : 0); - result = 31 * result + (password != null ? password.hashCode() : 0); - return result; - } - - @Override - public String toString() { - return "Proxy{" + - "host='" + host + '\'' + - ", port=" + port + - ", username='" + username + '\'' + - ", password='" + password + '\'' + - '}'; - } + private String host; + + private int port; + + private String username; + + private String password; + + public Proxy(String host, int port) { + this.host = host; + this.port = port; + } + + public Proxy(String host, int port, String username, String password) { + this.host = host; + this.port = port; + this.username = username; + this.password = password; + } + + public String getHost() { + return host; + } + + public int getPort() { + return port; + } + + public String getUsername() { + return username; + } + + public String getPassword() { + return password; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Proxy proxy = (Proxy) o; + + if (port != proxy.port) return false; + if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; + if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; + return password != null ? 
password.equals(proxy.password) : proxy.password == null; + } + + @Override + public int hashCode() { + int result = host != null ? host.hashCode() : 0; + result = 31 * result + port; + result = 31 * result + (username != null ? username.hashCode() : 0); + result = 31 * result + (password != null ? password.hashCode() : 0); + return result; + } + + @Override + public String toString() { + return "Proxy{" + + "host='" + host + '\'' + + ", port=" + port + + ", username='" + username + '\'' + + ", password='" + password + '\'' + + '}'; + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 5b61a993a..0cef4ed42 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -25,5 +25,5 @@ public interface ProxyProvider { * @return proxy */ Proxy getProxy(Task task); - + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index d8f47fe44..ddef6a88c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -59,4 +59,5 @@ private int incrForLoop() { } return p % size; } + } From 791323a5b0730f483a5a488dff149995f6722c75 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 16 Jun 2020 14:45:29 +0800 Subject: [PATCH 063/257] Add Proxy#scheme. 
--- .../downloader/HttpUriRequestConverter.java | 2 +- .../us/codecraft/webmagic/proxy/Proxy.java | 27 ++++++++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 28a7ce5ea..4baaf4a4a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -74,7 +74,7 @@ private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy p } if (proxy != null) { - requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort())); + requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme())); } requestBuilder.setConfig(requestConfigBuilder.build()); HttpUriRequest httpUriRequest = requestBuilder.build(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 4b49557b5..fe3f78d97 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -2,6 +2,8 @@ public class Proxy { + private String scheme; + private String host; private int port; @@ -11,8 +13,13 @@ public class Proxy { private String password; public Proxy(String host, int port) { + this(host, port, null); + } + + public Proxy(String host, int port, String scheme) { this.host = host; this.port = port; + this.scheme = scheme; } public Proxy(String host, int port, String username, String password) { @@ -22,7 +29,15 @@ public Proxy(String host, int port, String username, String password) { this.password = password; } - public String getHost() { + public String getScheme() { + return scheme; + } + + public void setScheme(String 
scheme) { + this.scheme = scheme; + } + + public String getHost() { return host; } @@ -47,6 +62,7 @@ public boolean equals(Object o) { if (port != proxy.port) return false; if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; + if (scheme != null ? !scheme.equals(proxy.scheme) : proxy.scheme != null) return false; if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; return password != null ? password.equals(proxy.password) : proxy.password == null; } @@ -55,6 +71,7 @@ public boolean equals(Object o) { public int hashCode() { int result = host != null ? host.hashCode() : 0; result = 31 * result + port; + result = 31 * result + (scheme != null ? scheme.hashCode() : 0); result = 31 * result + (username != null ? username.hashCode() : 0); result = 31 * result + (password != null ? password.hashCode() : 0); return result; @@ -62,12 +79,8 @@ public int hashCode() { @Override public String toString() { - return "Proxy{" + - "host='" + host + '\'' + - ", port=" + port + - ", username='" + username + '\'' + - ", password='" + password + '\'' + - '}'; + return String.format("Proxy{scheme='%1$s', host='%2$s', port=%3$d, username='%4$s', password='%5$s'}", + scheme, host, port, username, password); } } From 236e5ade44b24ac7db2e7821444db923a5f5da33 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 17 Jun 2020 11:19:37 +0800 Subject: [PATCH 064/257] Update Proxy#toString(). 
--- .../us/codecraft/webmagic/proxy/Proxy.java | 37 ++++++++++++++++++- .../codecraft/webmagic/proxy/ProxyTest.java | 16 +++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index fe3f78d97..179761cc6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -1,5 +1,13 @@ package us.codecraft.webmagic.proxy; +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.lang3.StringUtils; + public class Proxy { private String scheme; @@ -53,6 +61,28 @@ public String getPassword() { return password; } + public URI toURI() throws URISyntaxException { + final StringBuilder userInfoBuffer = new StringBuilder(); + if (username != null) { + userInfoBuffer.append(urlencode(username)); + } + if (password != null) { + userInfoBuffer.append(":").append(urlencode(password)); + } + final String userInfo = StringUtils.defaultIfEmpty(userInfoBuffer.toString(), null); + final URI uri = new URI(scheme, userInfo, host, port, null, null, null); + return uri; + } + + private String urlencode(String s) { + String enc = StandardCharsets.UTF_8.name(); + try { + return URLEncoder.encode(s, enc); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException(e); + } + } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -79,8 +109,11 @@ public int hashCode() { @Override public String toString() { - return String.format("Proxy{scheme='%1$s', host='%2$s', port=%3$d, username='%4$s', password='%5$s'}", - scheme, host, port, username, password); + try { + return this.toURI().toString(); + } catch (URISyntaxException e) { + throw new 
IllegalArgumentException(e); + } } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 86af36720..894670131 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -1,11 +1,14 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; -import org.junit.BeforeClass; +import static org.junit.Assert.assertEquals; import java.util.ArrayList; import java.util.List; +import org.apache.http.HttpHost; +import org.junit.BeforeClass; +import org.junit.Test; + /** * @author yxssfxwzy@sina.com May 30, 2014 * @@ -42,4 +45,13 @@ public void run() { } } + @Test + public void testToString() { + assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); + assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); + assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); + assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString()); + assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString()); + } + } From 6d3f2d9b641b2c99f5b5b244d7ed86e4ee23ca13 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 24 Jun 2020 13:24:45 +0800 Subject: [PATCH 065/257] Wrap URISyntaxException as IllegalArgumentException for Proxy#toURI. 
--- .../java/us/codecraft/webmagic/proxy/Proxy.java | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 179761cc6..dffadba8b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -61,7 +61,7 @@ public String getPassword() { return password; } - public URI toURI() throws URISyntaxException { + public URI toURI() { final StringBuilder userInfoBuffer = new StringBuilder(); if (username != null) { userInfoBuffer.append(urlencode(username)); @@ -70,7 +70,12 @@ public URI toURI() throws URISyntaxException { userInfoBuffer.append(":").append(urlencode(password)); } final String userInfo = StringUtils.defaultIfEmpty(userInfoBuffer.toString(), null); - final URI uri = new URI(scheme, userInfo, host, port, null, null, null); + URI uri; + try { + uri = new URI(scheme, userInfo, host, port, null, null, null); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(e.getMessage(), e); + } return uri; } @@ -109,11 +114,7 @@ public int hashCode() { @Override public String toString() { - try { - return this.toURI().toString(); - } catch (URISyntaxException e) { - throw new IllegalArgumentException(e); - } + return this.toURI().toString(); } } From 48bc73fbfff3c9bc38493ac262a4aa61720dcd80 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 24 Jun 2020 13:43:16 +0800 Subject: [PATCH 066/257] New method Proxy#create. 
--- .../us/codecraft/webmagic/proxy/Proxy.java | 15 +++ .../codecraft/webmagic/proxy/ProxyTest.java | 116 ++++++++++++------ 2 files changed, 93 insertions(+), 38 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index dffadba8b..6554fab51 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -20,6 +20,21 @@ public class Proxy { private String password; + public static Proxy create(final URI uri) { + Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme()); + String userInfo = uri.getUserInfo(); + if (userInfo != null) { + String[] up = userInfo.split(":"); + if (up.length == 1) { + proxy.username = up[0].isEmpty() ? null : up[0]; + } else { + proxy.username = up[0].isEmpty() ? null : up[0]; + proxy.password = up[1].isEmpty() ? null : up[1]; + } + } + return proxy; + } + public Proxy(String host, int port) { this(host, port, null); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 894670131..8e4c82026 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -1,7 +1,9 @@ package us.codecraft.webmagic.proxy; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import java.net.URI; import java.util.ArrayList; import java.util.List; @@ -15,43 +17,81 @@ */ public class ProxyTest { - private static List httpProxyList = new ArrayList(); - - @BeforeClass - public static void before() { - // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", - // "0.0.0.4:0" }; - String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; - for (String line : source) { - 
httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] }); - } - } - - class Fetch extends Thread { - HttpHost hp; - - public Fetch(HttpHost hp) { - this.hp = hp; - } - - @Override - public void run() { - try { - System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort()); - sleep(500); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - - @Test - public void testToString() { - assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); - assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); - assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); - assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString()); - assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString()); - } + private static List httpProxyList = new ArrayList(); + + @BeforeClass + public static void before() { + // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", + // "0.0.0.4:0" }; + String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; + for (String line : source) { + httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] }); + } + } + + class Fetch extends Thread { + HttpHost hp; + + public Fetch(HttpHost hp) { + this.hp = hp; + } + + @Override + public void run() { + try { + System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort()); + sleep(500); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + @Test + public void testCreate() { + Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertNull(proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + 
assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("http://127.0.0.1:8080")); + assertEquals("http", proxy.getScheme()); + assertNull(proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//username:password@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertEquals("username", proxy.getUsername()); + assertEquals("password", proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//username@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertEquals("username", proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//:password@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertNull(proxy.getUsername()); + assertEquals("password", proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + } + + @Test + public void testToString() { + assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); + assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); + assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); + assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString()); + assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString()); + } } From 9aab25f339b5aae1cb87bed7e1c30fae3bb5aef8 Mon Sep 17 00:00:00 2001 From: leeyazhou Date: Fri, 7 Aug 2020 16:36:32 +0800 Subject: [PATCH 067/257] build: manage plugin version & remove build WARNING ## use the new dependency of commons-io [WARNING] The artifact org.apache.commons:commons-io:jar:1.3.2 has been relocated 
to commons-io:commons-io:jar:1.3.2 ## manage plugin version of maven-jar-plugin and maven-deploy-plugin [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-core:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-extension:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-scripts:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ line 61, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-selenium:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-deploy-plugin is missing. @ line 34, column 12 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-saxon:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-deploy-plugin is missing. 
@ line 34, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-samples:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-parent:pom:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ line 263, column 21 [WARNING] [WARNING] It is highly recommended to fix these problems because they threaten the stability of your build. [WARNING] [WARNING] For this reason, future Maven versions might no longer support building such malformed projects. --- pom.xml | 9 +++++---- webmagic-core/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 ++ webmagic-selenium/pom.xml | 2 ++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 1a5853ad0..d016d0a92 100644 --- a/pom.xml +++ b/pom.xml @@ -139,10 +139,10 @@ 3.2.2
- org.apache.commons - commons-io - 1.3.2 - + commons-io + commons-io + 2.7 +
org.codehaus.groovy groovy-all @@ -263,6 +263,7 @@ org.apache.maven.plugins maven-jar-plugin + 3.2.0 log4j.xml diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4bc074da4..44fb7fa4d 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -67,7 +67,7 @@ - org.apache.commons + commons-io commons-io diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index ae1454b56..da0c5f202 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -32,7 +32,9 @@ + org.apache.maven.plugins maven-deploy-plugin + 3.0.0-M1 true diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index b5d096954..dfc4a1958 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -32,7 +32,9 @@ + org.apache.maven.plugins maven-deploy-plugin + 3.0.0-M1 true From e3b3b9afdd475ffcb7f82167062c00e9fa85b34b Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Wed, 9 Sep 2020 02:59:41 +0000 Subject: [PATCH 068/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMALIBABA-570967 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d016d0a92..088e153e6 100644 --- a/pom.xml +++ b/pom.xml @@ -103,7 +103,7 @@ com.alibaba fastjson - 1.2.68 + 1.2.69 com.github.dreamhead From b4b1df85a089e3f0321036c2eb5230a7ad8aa4f2 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 21 Sep 2020 17:47:06 +0800 Subject: [PATCH 069/257] Fix TLSv1.3. Maybe we should expose a API to allow user to use org.apache.http.ssl.SSLContextBuilder. Fixes #948. 
--- .../us/codecraft/webmagic/downloader/HttpClientGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index ee94581ad..80e0f1085 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -96,7 +96,7 @@ public X509Certificate[] getAcceptedIssuers() { }; - SSLContext sc = SSLContext.getInstance("SSLv3"); + SSLContext sc = SSLContext.getInstance("TLS"); sc.init(null, new TrustManager[] { trustManager }, null); return sc; } From 7f737626b1abd9afa2c81a81708309f1e8dd00ee Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 10 Oct 2020 02:58:58 +0000 Subject: [PATCH 070/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGAPACHEHTTPCOMPONENTS-1016906 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 088e153e6..d77a04a56 100644 --- a/pom.xml +++ b/pom.xml @@ -68,7 +68,7 @@ org.apache.httpcomponents httpclient - 4.5.12 + 4.5.13 org.apache.httpcomponents From 2223552aebad4557685f28dfe1da80808d8ef9fe Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Wed, 14 Oct 2020 00:46:32 +0000 Subject: [PATCH 071/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-JUNIT-1017047 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d77a04a56..887bf20f7 100644 --- a/pom.xml +++ b/pom.xml @@ -56,7 +56,7 @@ junit junit - 4.13 + 4.13.1 test From c87af365d6c83a3864dce94c88c8280605d008a1 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 24 Oct 2020 03:02:20 +0000 Subject: [PATCH 072/257] fix: pom.xml to 
reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMGOOGLEGUAVA-1015415 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 887bf20f7..fb1998df6 100644 --- a/pom.xml +++ b/pom.xml @@ -78,7 +78,7 @@ com.google.guava guava - 29.0-jre + 30.0-android com.jayway.jsonpath From 4b902270b4d6358fbbfeb56695e7ee89f967d38d Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 27 Oct 2020 09:01:21 +0800 Subject: [PATCH 073/257] Bump version number from 0.7.3 to 0.7.4. --- README-zh.md | 4 ++-- README.md | 4 ++-- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README-zh.md b/README-zh.md index 65d5d1729..c5ebe15bf 100644 --- a/README-zh.md +++ b/README-zh.md @@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.3 + 0.7.4 us.codecraft webmagic-extension - 0.7.3 + 0.7.4 ``` diff --git a/README.md b/README.md index 73cb48833..e5cd511d2 100644 --- a/README.md +++ b/README.md @@ -23,12 +23,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.3 + 0.7.4 us.codecraft webmagic-extension - 0.7.3 + 0.7.4 ``` diff --git a/pom.xml b/pom.xml index fb1998df6..6341bc0b5 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.3 + 0.7.4 4.0.0 pom @@ -289,7 +289,7 @@ 3.2.0 UTF-8 - WebMagic 0.7.3 + WebMagic 0.7.4 en_US diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 44fb7fa4d..4b89cac10 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.3 + 0.7.4 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index bf7ff05d6..87900efda 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ 
-3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.3 + 0.7.4 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 44fee7c0d..3699fa66e 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.3 + 0.7.4 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index da0c5f202..d3a57f256 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.3 + 0.7.4 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 9f4219d6c..121aafaf6 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.3 + 0.7.4 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index dfc4a1958..d0cb77c06 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.3 + 0.7.4 4.0.0 From 9a71f0ac924615d21882e1faa4bbda0c2e5eb7d7 Mon Sep 17 00:00:00 2001 From: yao Date: Tue, 15 Dec 2020 17:05:16 +0800 Subject: [PATCH 074/257] =?UTF-8?q?pageCount=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/Spider.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 62c989f1d..1a03bbfae 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -24,6 +24,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.LongAdder; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; @@ -102,7 +103,7 @@ public class 
Spider implements Runnable, Task { private List spiderListeners; - private final AtomicLong pageCount = new AtomicLong(0); + private final LongAdder pageCount = new LongAdder(); private Date startTime; @@ -323,7 +324,7 @@ public void run() { onError(request); logger.error("process request " + request + " error", e); } finally { - pageCount.incrementAndGet(); + pageCount.increment(); signalNewUrl(); } } @@ -335,7 +336,7 @@ public void run() { if (destroyWhenExit) { close(); } - logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get()); + logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.sumThenReset()); } protected void onError(Request request) { @@ -645,7 +646,7 @@ public boolean isSpawnUrl() { * @since 0.4.1 */ public long getPageCount() { - return pageCount.get(); + return pageCount.sum(); } /** From fc7ae9ce283748e839a8badcc873a99d3d946a94 Mon Sep 17 00:00:00 2001 From: itranlin <1010609304@qq.com> Date: Sat, 19 Dec 2020 17:59:52 +0800 Subject: [PATCH 075/257] =?UTF-8?q?=E5=AD=90=E4=BB=BB=E5=8A=A1=E5=8F=AF?= =?UTF-8?q?=E4=BB=A5=E4=BD=BF=E7=94=A8=E4=B8=8D=E5=90=8C=E7=9A=84=E4=B8=8B?= =?UTF-8?q?=E8=BD=BD=E5=99=A8=E3=80=82=E3=80=82=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/us/codecraft/webmagic/Request.java | 14 ++++++++++++++ .../main/java/us/codecraft/webmagic/Spider.java | 7 ++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 5c26d20dc..9fc286192 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.model.HttpRequestBody; import us.codecraft.webmagic.utils.Experimental; @@ 
-26,6 +27,11 @@ public class Request implements Serializable { private HttpRequestBody requestBody; + /** + * this req use this downloader + */ + private Downloader downloader; + /** * Store additional information in extras. */ @@ -175,6 +181,14 @@ public boolean isBinaryContent() { return binaryContent; } + public Downloader getDownloader() { + return downloader; + } + + public void setDownloader(Downloader downloader) { + this.downloader = downloader; + } + public Request setBinaryContent(boolean binaryContent) { this.binaryContent = binaryContent; return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 62c989f1d..886e74a92 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -401,7 +401,12 @@ public void test(String... urls) { } private void processRequest(Request request) { - Page page = downloader.download(request, this); + Page page; + if (null != request.getDownloader()){ + page = request.getDownloader().download(request,this); + }else { + page = downloader.download(request, this); + } if (page.isDownloadSuccess()){ onDownloadSuccess(request, page); } else { From ba69eba669d32fadbbe8b021b85b9b458d2db6aa Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 14:36:44 +0800 Subject: [PATCH 076/257] =?UTF-8?q?=E4=BB=A3=E7=90=86=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=E7=9A=84=E4=BF=AE=E6=94=B9=EF=BC=8C=E6=8F=90=E4=BE=9B=E5=88=B7?= =?UTF-8?q?=E6=98=9F=E4=BB=A3=E7=90=86API=E3=80=82downloader=20=E4=B8=8B?= =?UTF-8?q?=E8=BD=BD=E9=94=99=E8=AF=AF=E6=97=B6=EF=BC=8C=E6=8F=90=E4=BE=9B?= =?UTF-8?q?request,exception,proxyProvider=E4=B8=89=E4=B8=AA=E5=8F=82?= =?UTF-8?q?=E6=95=B0=EF=BC=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../codecraft/webmagic/downloader/AbstractDownloader.java | 3 ++- 
.../webmagic/downloader/HttpClientDownloader.java | 2 +- .../java/us/codecraft/webmagic/proxy/ProxyProvider.java | 7 +++++++ .../us/codecraft/webmagic/proxy/SimpleProxyProvider.java | 5 +++++ 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d09..05f5686af 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -3,6 +3,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.Html; /** @@ -38,7 +39,7 @@ public Html download(String url, String charset) { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable throwable, ProxyProvider proxyProvider) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 24889c88b..757cdd32d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public Page download(Request request, Task task) { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request,e,proxyProvider); return page; } finally { if (httpResponse != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 
0cef4ed42..da3bec96a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -19,6 +19,13 @@ public interface ProxyProvider { */ void returnProxy(Proxy proxy, Page page, Task task); + /** + * 代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了 + * + * @param task 下载任务 + */ + void refreshProxy(Task task); + /** * Get a proxy for task by some strategy. * @param task the download task diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index ddef6a88c..fd80b3009 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -30,6 +30,11 @@ private SimpleProxyProvider(List proxies, AtomicInteger pointer) { this.pointer = pointer; } + @Override + public void refreshProxy(Task task) { + + } + public static SimpleProxyProvider from(Proxy... 
proxies) { List proxiesTemp = new ArrayList(proxies.length); for (Proxy proxy : proxies) { From 4a6441e7c5923c14d889c7f54af0ef15e5a05cb9 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 14:52:25 +0800 Subject: [PATCH 077/257] =?UTF-8?q?=E6=8F=90=E4=BE=9B=E5=87=BA=E7=8E=B0?= =?UTF-8?q?=E6=9F=90=E7=A7=8D=E5=BC=82=E5=B8=B8=E5=88=B7=E6=96=B0=E4=BB=A3?= =?UTF-8?q?=E7=90=86=EF=BC=8C=E5=BC=82=E5=B8=B8=E5=8F=AF=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/downloader/HttpClientDownloader.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 757cdd32d..2dd340fc7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -21,6 +21,7 @@ import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.function.Predicate; /** @@ -43,6 +44,14 @@ public class HttpClientDownloader extends AbstractDownloader { private boolean responseHeader = true; + private volatile boolean refreshProxyOnError = false; + + private Predicate throwablePredicate = t->false; + + public void setThrowablePredicate(Predicate predicate){ + this.throwablePredicate = predicate; + } + public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } @@ -88,6 +97,9 @@ public Page download(Request request, Task task) { } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); onError(request,e,proxyProvider); + if(proxyProvider != null && refreshProxyOnError && throwablePredicate.test(e)){ + proxyProvider.refreshProxy(task); + } return page; } finally { 
if (httpResponse != null) { From 9cc5287743de9715ec3ac10a20636377be41d060 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 14:58:01 +0800 Subject: [PATCH 078/257] =?UTF-8?q?=E7=AE=80=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../downloader/HttpClientDownloader.java | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 2dd340fc7..5684114e7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -32,24 +32,21 @@ */ public class HttpClientDownloader extends AbstractDownloader { - private Logger logger = LoggerFactory.getLogger(getClass()); - private final Map httpClients = new HashMap(); - - private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); - + private ProxyProvider proxyProvider; - private boolean responseHeader = true; + private final boolean responseHeader = true; - private volatile boolean refreshProxyOnError = false; - private Predicate throwablePredicate = t->false; + private Predicate refreshProxyOnError = t -> false; - public void setThrowablePredicate(Predicate predicate){ - this.throwablePredicate = predicate; + public void setRefreshProxyOnError(Predicate proxyOnError) { + this.refreshProxyOnError = refreshProxyOnError; } public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { @@ -96,8 +93,8 @@ public Page 
download(Request request, Task task) { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request,e,proxyProvider); - if(proxyProvider != null && refreshProxyOnError && throwablePredicate.test(e)){ + onError(request, e, proxyProvider); + if (proxyProvider != null && refreshProxyOnError.test(e)) { proxyProvider.refreshProxy(task); } return page; @@ -122,7 +119,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } From 19465089c3ad254e6f35b96cbe707bc6dd33ec62 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 16:02:35 +0800 Subject: [PATCH 079/257] =?UTF-8?q?=E6=8F=90=E4=BE=9B=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E5=88=B7=E6=96=B0httpClient=EF=BC=8C=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E5=8F=AF=E9=85=8D=E7=BD=AE=EF=BC=8C=E9=87=8D=E5=86=99getHttpCl?= =?UTF-8?q?ient=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../downloader/HttpClientDownloader.java | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 5684114e7..f9f8c829f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -21,6 +21,7 @@ import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import 
java.util.function.Predicate; @@ -32,7 +33,7 @@ */ public class HttpClientDownloader extends AbstractDownloader { - private final Map httpClients = new HashMap(); + private final Map httpClients = new ConcurrentHashMap<>(); private final Logger logger = LoggerFactory.getLogger(getClass()); private final HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); @@ -45,6 +46,13 @@ public class HttpClientDownloader extends AbstractDownloader { private Predicate refreshProxyOnError = t -> false; + + private Predicate refreshClientOnError = t -> false; + + + public void setRefreshClientOnError(Predicate clientOnError){ + this.refreshClientOnError = clientOnError; + } public void setRefreshProxyOnError(Predicate proxyOnError) { this.refreshProxyOnError = refreshProxyOnError; } @@ -62,17 +70,8 @@ private CloseableHttpClient getHttpClient(Site site) { return httpClientGenerator.getClient(null); } String domain = site.getDomain(); - CloseableHttpClient httpClient = httpClients.get(domain); - if (httpClient == null) { - synchronized (this) { - httpClient = httpClients.get(domain); - if (httpClient == null) { - httpClient = httpClientGenerator.getClient(site); - httpClients.put(domain, httpClient); - } - } - } - return httpClient; + return httpClients.computeIfAbsent(domain,k->httpClientGenerator.getClient(site)); + } @Override @@ -97,6 +96,9 @@ public Page download(Request request, Task task) { if (proxyProvider != null && refreshProxyOnError.test(e)) { proxyProvider.refreshProxy(task); } + if(refreshClientOnError.test(e)) { + httpClients.remove(task.getSite().getDomain()); + } return page; } finally { if (httpResponse != null) { From 2e2a0fdf3e8e614d9a3af146dfa462d0e299ceb5 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 21 Dec 2020 18:08:55 +0800 Subject: [PATCH 080/257] =?UTF-8?q?=20Downloader=20=E6=8F=90=E4=BE=9B?= =?UTF-8?q?=E5=88=B7=E6=96=B0=E7=BB=84=E4=BB=B6=E7=9A=84api,=E6=96=B9?= =?UTF-8?q?=E4=BE=BF=E5=9C=A8spider=E4=B8=AD=E6=93=8D=E4=BD=9C?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/us/codecraft/webmagic/Site.java | 15 +++++++++++++++ .../main/java/us/codecraft/webmagic/Spider.java | 5 ++++- .../webmagic/downloader/Downloader.java | 10 +++++++--- .../downloader/HttpClientDownloader.java | 11 +++++++++++ .../webmagic/downloader/HttpClientGenerator.java | 16 +++++++++++----- .../codecraft/webmagic/utils/HttpConstant.java | 1 + .../java/us/codecraft/webmagic/SpiderTest.java | 5 +++++ .../downloader/MockGithubDownloader.java | 5 +++++ .../webmagic/downloader/PhantomJSDownloader.java | 7 ++++++- .../downloader/MockGithubDownloader.java | 4 ++++ .../downloader/selenium/SeleniumDownloader.java | 5 +++++ 11 files changed, 74 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 72cc7d058..bf603b3ce 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import com.sun.org.apache.regexp.internal.RE; import us.codecraft.webmagic.utils.HttpConstant; import java.util.*; @@ -35,8 +36,12 @@ public class Site { private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); + private static final Set DEFAULT_REFRESH_CODE_SET = new HashSet<>(); + + private Set refreshCode = DEFAULT_REFRESH_CODE_SET; private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; + private Map headers = new HashMap(); private boolean useGzip = true; @@ -44,6 +49,7 @@ public class Site { private boolean disableCookieManagement = false; static { + DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN); DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); } @@ -192,6 +198,15 @@ public Site setAcceptStatCode(Set acceptStatCode) { return this; } + public Site setRefreshCode(Set refreshCode){ + this.refreshCode = refreshCode; + return 
this; + } + public Set getRefreshCode(){ + return refreshCode; + + } + /** * get acceptStatCode * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 1a03bbfae..d1ad6a6f0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -419,7 +419,10 @@ private void onDownloadSuccess(Request request, Page page) { pipeline.process(page.getResultItems(), this); } } - } else { + } else if(site.getRefreshCode().contains(page.getStatusCode())) { + logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); + downloader.refreshComponent(this); + }else { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index f7ced4932..50955012b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -18,14 +18,18 @@ public interface Downloader { * Downloads web pages and store in Page object. * * @param request request - * @param task task + * @param task task * @return page */ - public Page download(Request request, Task task); + Page download(Request request, Task task); /** * Tell the downloader how many threads the spider used. 
+ * * @param threadNum number of threads */ - public void setThread(int threadNum); + void setThread(int threadNum); + + + void refreshComponent(Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index f9f8c829f..eed49fb4a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -111,6 +111,17 @@ public Page download(Request request, Task task) { } } + + @Override + public void refreshComponent(Task task) { + if (proxyProvider != null ) { + proxyProvider.refreshProxy(task); + } + + httpClients.remove(task.getSite().getDomain()); + + } + @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 80e0f1085..1f20c5a58 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,13 +1,17 @@ package us.codecraft.webmagic.downloader; +import java.io.File; import java.io.IOException; import java.security.KeyManagementException; +import java.security.KeyStore; +import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map; import javax.net.ssl.SSLContext; +import javax.net.ssl.SSLContextSpi; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; @@ -24,6 +28,7 @@ import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; 
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.conn.ssl.TrustSelfSignedStrategy; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; @@ -32,6 +37,7 @@ import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; +import org.apache.http.ssl.SSLContexts; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,7 +75,7 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { + } catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); @@ -77,8 +83,8 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { return SSLConnectionSocketFactory.getSocketFactory(); } - private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { - // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException { +// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 X509TrustManager trustManager = new X509TrustManager() { @Override @@ -96,10 +102,10 @@ public X509Certificate[] getAcceptedIssuers() { }; - SSLContext sc = SSLContext.getInstance("TLS"); + SSLContext sc = SSLContext.getInstance("SSLv3"); sc.init(null, new TrustManager[] { trustManager }, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index 2d6b8fe2a..bfacec351 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -28,6 +28,7 @@ public static abstract class Method { public static abstract class StatusCode { public static final int CODE_200 = 200; + public static final int FORBIDDEN = 403; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 4f4a2806d..6b9c4232b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -57,6 +57,11 @@ public Site getSite() { return Site.me().setSleepTime(0); } }).setDownloader(new Downloader() { + @Override + public void refreshComponent(Task task) { + + } + @Override public Page download(Request request, Task task) { return new Page().setRawText(""); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 3aa742c10..6d764a595 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -28,6 +28,11 @@ public Page download(Request request, Task task) { return page; } + @Override + public void refreshComponent(Task task) { + + } + @Override public void setThread(int threadNum) { } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb0f..f3751d650 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -42,7 +42,12 @@ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + + @Override + public void refreshComponent(Task task) { + + } + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 91e3698cf..774469292 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -9,6 +9,10 @@
  * @author code4crafter@gmail.com
  */
 public class MockGithubDownloader implements Downloader{
+    @Override
+    public void refreshComponent(Task task) {
+        // Intentionally a no-op for this mock downloader.
+    }
 
     private String html = "\n" +
             "\n" +
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index cce293fc9..11b235620 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -59,6 +59,11 @@ public SeleniumDownloader() {
 		// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
 	}
 
+	@Override
+	public void refreshComponent(Task task) {
+		// Intentionally a no-op: this downloader does nothing on refresh.
+	}
+
 	/**
 	 * set sleep time to wait until load success
 	 *

From 0aa2c3949d29e4c02c199eb30c7adae8f244e1ee Mon Sep 17 00:00:00 2001
From: yao 
Date: Tue, 22 Dec 2020 18:19:37 +0800
Subject: [PATCH 081/257] =?UTF-8?q?=20=E5=88=B7=E6=96=B0=E4=BB=A3=E7=90=86?=
 =?UTF-8?q?api=E9=87=8D=E6=9E=84=EF=BC=8C=E9=9C=80=E8=A6=81=E6=8F=90?=
 =?UTF-8?q?=E4=BE=9B=E6=97=A7=E4=BB=A3=E7=90=86=EF=BC=8C=E5=A6=82=E6=9E=9C?=
 =?UTF-8?q?=E4=BE=9D=E7=84=B6=E6=98=AF=E6=97=A7=E4=BB=A3=E7=90=86=EF=BC=8C?=
 =?UTF-8?q?=E6=89=8D=E8=BF=9B=E8=A1=8C=E5=88=B7=E6=96=B0=EF=BC=8C=E9=98=B2?=
 =?UTF-8?q?=E6=AD=A2=E5=BA=94=E5=BB=B6=E8=BF=9F=E5=93=8D=E5=BA=94=E9=80=A0?=
 =?UTF-8?q?=E6=88=90=E7=9A=84=E8=BF=87=E5=BA=A6=E5=88=B7=E6=96=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/main/java/us/codecraft/webmagic/Spider.java |  7 ++++++-
 .../webmagic/downloader/HttpClientDownloader.java   |  6 +++---
 .../webmagic/downloader/HttpClientGenerator.java    |  1 +
 .../us/codecraft/webmagic/proxy/ProxyProvider.java  | 13 ++++++++++++-
 .../webmagic/proxy/SimpleProxyProvider.java         |  7 ++++++-
 5 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index d1ad6a6f0..bc07651a2 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -421,7 +421,7 @@ private void onDownloadSuccess(Request request, Page page) {
             }
         } else if(site.getRefreshCode().contains(page.getStatusCode())) {
             logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
-            downloader.refreshComponent(this);
+            failHandler(request);
         }else {
             logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
         }
@@ -430,6 +430,11 @@ private void onDownloadSuccess(Request request, Page page) {
     }
 
     private void onDownloaderFail(Request request) {
+       failHandler(request);
+    }
+
+    private void failHandler(Request request){
+        downloader.refreshComponent(this);
         if (site.getCycleRetryTimes() == 0) {
             sleep(site.getSleepTime());
         } else {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index eed49fb4a..ace817554 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -54,7 +54,7 @@ public void setRefreshClientOnError(Predicate clientOnError){
         this.refreshClientOnError = clientOnError;
     }
     public void setRefreshProxyOnError(Predicate proxyOnError) {
-        this.refreshProxyOnError = refreshProxyOnError;
+        this.refreshProxyOnError = proxyOnError;
     }
 
     public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
@@ -94,7 +94,7 @@ public Page download(Request request, Task task) {
             logger.warn("download page {} error", request.getUrl(), e);
             onError(request, e, proxyProvider);
             if (proxyProvider != null  && refreshProxyOnError.test(e)) {
-                proxyProvider.refreshProxy(task);
+                proxyProvider.refreshProxy(task,proxy);
             }
             if(refreshClientOnError.test(e)) {
                 httpClients.remove(task.getSite().getDomain());
@@ -115,7 +115,7 @@ public Page download(Request request, Task task) {
     @Override
     public void refreshComponent(Task task) {
         if (proxyProvider != null ) {
-            proxyProvider.refreshProxy(task);
+            proxyProvider.refreshProxy(task,proxyProvider.getCurrentProxy(task));
         }
 
             httpClients.remove(task.getSite().getDomain());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index 1f20c5a58..2d27b79a2 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -143,6 +143,7 @@ public void process(
         SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
         socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
         socketConfigBuilder.setSoTimeout(site.getTimeOut());
+
         SocketConfig socketConfig = socketConfigBuilder.build();
         httpClientBuilder.setDefaultSocketConfig(socketConfig);
         connectionManager.setDefaultSocketConfig(socketConfig);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
index da3bec96a..b4e7b484d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
@@ -23,8 +23,19 @@ public interface ProxyProvider {
      *  代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了
      *
      * @param task  下载任务
+     * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行
      */
-    void refreshProxy(Task task);
+    void refreshProxy(Task task,Proxy proxy);
+
+
+    /**
+     *
+     * 获取当前正在提供的代理
+     *
+     * @param task
+     * @return
+     */
+    Proxy getCurrentProxy(Task task);
 
     /**
      * Get a proxy for task by some strategy.
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
index fd80b3009..8ad9ce7b1 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
@@ -31,7 +31,12 @@ private SimpleProxyProvider(List proxies, AtomicInteger pointer) {
     }
 
     @Override
-    public void refreshProxy(Task task) {
+    public Proxy getCurrentProxy(Task task) {
+        return null;
+    }
+
+    @Override
+    public void refreshProxy(Task task,Proxy proxy) {
 
     }
 

From 33906e36f48588f8d1a44331d1a21fbcd3a5f9d7 Mon Sep 17 00:00:00 2001
From: yao 
Date: Tue, 29 Dec 2020 16:18:43 +0800
Subject: [PATCH 082/257] =?UTF-8?q?=20=E4=BB=A3=E7=90=86=E5=8A=9F=E8=83=BD?=
 =?UTF-8?q?=E6=89=A9=E5=B1=95=EF=BC=8C=E5=AF=B9=E5=8E=9F=E4=BB=A3=E7=90=86?=
 =?UTF-8?q?=E6=8F=90=E4=BE=9B=E5=95=86=E8=BF=9B=E8=A1=8C=E6=8B=86=E5=88=86?=
 =?UTF-8?q?=EF=BC=8C=E5=8A=A0=E5=85=A5lombok?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 webmagic-core/pom.xml                         |   9 +-
 .../java/us/codecraft/webmagic/Spider.java    |   2 +-
 .../downloader/HttpClientDownloader.java      |  15 +-
 .../AbstractRefreshableProxyProvider.java     | 135 ++++++++++++++++++
 .../webmagic/proxy/ExpirableProxy.java        |  34 +++++
 .../us/codecraft/webmagic/proxy/Proxy.java    |  66 ++++-----
 .../webmagic/proxy/ProxyProvider.java         |  27 ----
 .../proxy/RefreshableProxyProvider.java       |  30 ++++
 .../proxy/ReturnableProxyProvider.java        |  22 +++
 .../webmagic/proxy/SimpleProxyProvider.java   |  15 --
 10 files changed, 273 insertions(+), 82 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java

diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 4b89cac10..0cea05fe8 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -1,5 +1,6 @@
 
-
+
     
         us.codecraft
         webmagic-parent
@@ -24,6 +25,12 @@
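             <groupId>org.apache.commons</groupId>
             <artifactId>commons-lang3</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>1.18.10</version>
+            <scope>provided</scope>
+        </dependency>
 
         <dependency>
             <groupId>us.codecraft</groupId>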
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index bc07651a2..dfca9dd9d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -421,6 +421,7 @@ private void onDownloadSuccess(Request request, Page page) {
             }
         } else if(site.getRefreshCode().contains(page.getStatusCode())) {
             logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode());
+            downloader.refreshComponent(this);
             failHandler(request);
         }else {
             logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
@@ -434,7 +435,6 @@ private void onDownloaderFail(Request request) {
     }
 
     private void failHandler(Request request){
-        downloader.refreshComponent(this);
         if (site.getCycleRetryTimes() == 0) {
             sleep(site.getSleepTime());
         } else {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index ace817554..8e8676d0f 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -13,6 +13,8 @@
 import us.codecraft.webmagic.Task;
 import us.codecraft.webmagic.proxy.Proxy;
 import us.codecraft.webmagic.proxy.ProxyProvider;
+import us.codecraft.webmagic.proxy.RefreshableProxyProvider;
+import us.codecraft.webmagic.proxy.ReturnableProxyProvider;
 import us.codecraft.webmagic.selector.PlainText;
 import us.codecraft.webmagic.utils.CharsetUtils;
 import us.codecraft.webmagic.utils.HttpClientUtils;
@@ -93,8 +95,8 @@ public Page download(Request request, Task task) {
         } catch (IOException e) {
             logger.warn("download page {} error", request.getUrl(), e);
             onError(request, e, proxyProvider);
-            if (proxyProvider != null  && refreshProxyOnError.test(e)) {
-                proxyProvider.refreshProxy(task,proxy);
+            if (proxyProvider != null && proxy != null && proxyProvider instanceof RefreshableProxyProvider && refreshProxyOnError.test(e)) {
+                ((RefreshableProxyProvider)proxyProvider).refreshProxy(task,proxy);
             }
             if(refreshClientOnError.test(e)) {
                 httpClients.remove(task.getSite().getDomain());
@@ -105,8 +107,9 @@ public Page download(Request request, Task task) {
                 //ensure the connection is released back to pool
                 EntityUtils.consumeQuietly(httpResponse.getEntity());
             }
-            if (proxyProvider != null && proxy != null) {
-                proxyProvider.returnProxy(proxy, page, task);
+            if (proxyProvider != null && proxy != null && proxyProvider instanceof ReturnableProxyProvider) {
+                ((ReturnableProxyProvider) proxyProvider).returnProxy(proxy, page, task);
+
             }
         }
     }
@@ -114,8 +117,8 @@ public Page download(Request request, Task task) {
 
     @Override
     public void refreshComponent(Task task) {
-        if (proxyProvider != null ) {
-            proxyProvider.refreshProxy(task,proxyProvider.getCurrentProxy(task));
+        if (proxyProvider != null && proxyProvider instanceof RefreshableProxyProvider) {
+            ((RefreshableProxyProvider) proxyProvider).refreshProxy(task, ((RefreshableProxyProvider) proxyProvider).getCurrentProxy(task));
         }
 
             httpClients.remove(task.getSite().getDomain());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java
new file mode 100644
index 000000000..8e7cb08ac
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java
@@ -0,0 +1,177 @@
+package us.codecraft.webmagic.proxy;
+
+import lombok.extern.slf4j.Slf4j;
+import us.codecraft.webmagic.Task;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+import java.util.Comparator;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.PriorityBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.concurrent.atomic.LongAdder;
+
+/**
+ * Base class for proxy providers whose current proxy can be refreshed on demand.
+ * <p>
+ * Subclasses feed proxies in through {@link #doPut(ExpirableProxy)}. The queued proxy that
+ * expires soonest is handed out first and stays current until it expires or is refreshed.
+ *
+ * @author yaoqiang
+ */
+@Slf4j
+public abstract class AbstractRefreshableProxyProvider implements RefreshableProxyProvider {
+
+    /** Number of proxies taken from the queue so far, expired ones included. */
+    private final LongAdder totalGet = new LongAdder();
+
+    /** Number of proxies taken from the queue that were still unexpired. */
+    private final LongAdder canUse = new LongAdder();
+
+    /** Lookup of the proxy currently handed out; holds null when a new lookup must be started. */
+    private final AtomicReference<FutureTask<Proxy>> usedProxyCache = new AtomicReference<>();
+
+    /** Proxies waiting to be used, soonest expiry first. */
+    private final PriorityBlockingQueue<ExpirableProxy> ipQueue = new PriorityBlockingQueue<>(1000, Comparator.comparing(ExpirableProxy::getExpireTime));
+
+    /** Upper bound on the number of proxies waiting in the queue. */
+    private final int maxHostNum;
+
+    public AbstractRefreshableProxyProvider(int maxHostNum) {
+        this.maxHostNum = maxHostNum;
+    }
+
+    /**
+     * Queues a proxy for later use; it is dropped when {@code maxHostNum} proxies are already waiting.
+     *
+     * @param expirableProxy the proxy to queue
+     */
+    protected void doPut(ExpirableProxy expirableProxy) {
+        synchronized (ipQueue) {
+            // Strictly below the limit, so the queue never grows past maxHostNum entries.
+            if (ipQueue.size() < maxHostNum) {
+                ipQueue.put(expirableProxy);
+            }
+        }
+    }
+
+    /**
+     * Drops the current proxy so that the next {@link #getProxy(Task)} call fetches a new one,
+     * but only while {@code proxy} still is the current proxy; late reports about a proxy that
+     * has already been replaced are ignored.
+     *
+     * @param task  the crawl task
+     * @param proxy the proxy that failed; {@code null} is ignored
+     */
+    @Override
+    public void refreshProxy(Task task, Proxy proxy) {
+        if (proxy == null) {
+            return;
+        }
+        // Snapshot first: the CAS below fails if another thread replaced the cache meanwhile.
+        FutureTask<Proxy> snapshot = usedProxyCache.get();
+        if (proxy.equals(getCurrentProxy(task))) {
+            usedProxyCache.compareAndSet(snapshot, null);
+        }
+    }
+
+    /**
+     * Returns the proxy currently handed out, waiting up to five seconds for a pending lookup.
+     *
+     * @param task the crawl task
+     * @return the current proxy, or {@code null} if none is cached or the lookup failed or timed out
+     */
+    @Override
+    public Proxy getCurrentProxy(Task task) {
+        FutureTask<Proxy> cache = usedProxyCache.get();
+        if (cache == null) {
+            return null;
+        }
+        try {
+            return cache.get(5, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            log.error(e.getMessage(), e);
+        } catch (ExecutionException e) {
+            log.error(e.getCause().getMessage(), e);
+        } catch (TimeoutException e) {
+            log.error(e.getMessage(), e);
+        }
+        return null;
+    }
+
+    private FutureTask<Proxy> buildCacheTask() {
+        return new FutureTask<>(this::doGet);
+    }
+
+    /**
+     * Returns the current proxy, fetching the next unexpired one from the queue when needed.
+     * A failed lookup is logged and retried, so the call keeps going until a proxy is available.
+     *
+     * @param task the download task
+     * @return an unexpired proxy, or {@code null} once the calling thread has been interrupted
+     */
+    @Override
+    public Proxy getProxy(Task task) {
+        while (!Thread.currentThread().isInterrupted()) {
+            FutureTask<Proxy> cache = usedProxyCache.get();
+            if (cache == null) {
+                FutureTask<Proxy> futureTask = buildCacheTask();
+                if (usedProxyCache.compareAndSet(null, futureTask)) {
+                    cache = futureTask;
+                    futureTask.run();
+                } else {
+                    // Lost the race: use whatever the winning thread installed.
+                    cache = usedProxyCache.get();
+                }
+            }
+            try {
+                if (cache != null) {
+                    ExpirableProxy proxy = (ExpirableProxy) cache.get(5, TimeUnit.SECONDS);
+                    if (!proxy.isExpire()) {
+                        return proxy;
+                    }
+                }
+                usedProxyCache.compareAndSet(cache, null);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                log.error(e.getMessage(), e);
+                usedProxyCache.compareAndSet(cache, null);
+            } catch (ExecutionException e) {
+                log.error(e.getMessage(), e);
+                usedProxyCache.compareAndSet(cache, null);
+            } catch (TimeoutException e) {
+                log.error(e.getMessage(), e);
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Takes proxies from the queue until an unexpired one turns up, blocking while it is empty.
+     *
+     * @return the first unexpired proxy taken from the queue
+     * @throws InterruptedException if the thread is interrupted while waiting on the queue
+     */
+    private Proxy doGet() throws InterruptedException {
+        ExpirableProxy proxy;
+        try {
+            do {
+                proxy = ipQueue.take();
+                totalGet.increment();
+            } while (proxy.isExpire());
+        } catch (InterruptedException e) {
+            // FutureTask records this as a failure; restore the flag so getProxy's loop ends.
+            Thread.currentThread().interrupt();
+            throw e;
+        }
+        canUse.increment();
+        // totalGet is at least 1 here, so the ratio is never the 0/0 that made every lookup fail.
+        log.info("切换到proxy:ip:{},port:{},ip可用率:{}", proxy.getHost(), proxy.getPort(), BigDecimal.valueOf(canUse.sum()).divide(BigDecimal.valueOf(totalGet.sum()), 2, RoundingMode.HALF_DOWN).doubleValue());
+        return proxy;
+    }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java
new file mode 100644
index 000000000..f23caaf5f
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.proxy;
+
+import org.apache.http.annotation.Contract;
+import org.apache.http.annotation.ThreadingBehavior;
+
+import java.time.LocalDateTime;
+import java.time.temporal.ChronoUnit;
+
+/**
+ * A proxy that expires a fixed time after it was created.
+ *
+ * @author yaoqiang
+ */
+@Contract(threading = ThreadingBehavior.IMMUTABLE_CONDITIONAL)
+public class ExpirableProxy extends Proxy {
+    private final int ttl;
+    private final LocalDateTime expireTime;
+
+    /** Creates a proxy that expires {@code ttl} units of {@code chronoUnit} after the current local date-time. */
+    public ExpirableProxy(String host, int port, int ttl, ChronoUnit chronoUnit) {
+        super(host, port);
+        this.ttl = ttl;
+        this.expireTime = LocalDateTime.now().plus(ttl, chronoUnit);
+    }
+
+    /** Returns {@code true} once the current local date-time is after the expiry time. */
+    public boolean isExpire() {
+        return LocalDateTime.now().isAfter(expireTime);
+    }
+    /** Returns the local date-time, computed at construction, after which this proxy counts as expired. */
+    public LocalDateTime getExpireTime(){
+        return expireTime;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
index 6554fab51..ffae4be4b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
@@ -6,33 +6,30 @@
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 
+
 import org.apache.commons.lang3.StringUtils;
+import org.apache.http.annotation.Contract;
+import org.apache.http.annotation.ThreadingBehavior;
 
+@Contract(threading = ThreadingBehavior.IMMUTABLE)
 public class Proxy {
 
-    private String scheme;
+    private final String scheme;
 
-    private String host;
+    private final String host;
 
-    private int port;
+    private final int port;
 
-    private String username;
+    private final String username;
 
-    private String password;
+    private final String password;
 
-    public static Proxy create(final URI uri) {
-        Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme());
-        String userInfo = uri.getUserInfo();
-        if (userInfo != null) {
-            String[] up = userInfo.split(":");
-            if (up.length == 1) {
-                proxy.username = up[0].isEmpty() ? null : up[0];
-            } else {
-                proxy.username = up[0].isEmpty() ? null : up[0];
-                proxy.password = up[1].isEmpty() ? null : up[1];
-            }
-        }
-        return proxy;
+    public Proxy(String host, int port, String scheme, String username, String password) {
+        this.scheme = scheme;
+        this.host = host;
+        this.port = port;
+        this.username = username;
+        this.password = password;
     }
 
     public Proxy(String host, int port) {
@@ -40,27 +37,30 @@ public Proxy(String host, int port) {
     }
 
     public Proxy(String host, int port, String scheme) {
-        this.host = host;
-        this.port = port;
-        this.scheme = scheme;
+        this(host, port, scheme, null, null);
     }
 
     public Proxy(String host, int port, String username, String password) {
-        this.host = host;
-        this.port = port;
-        this.username = username;
-        this.password = password;
+        this(host, port, null, username, password);
     }
 
-    public String getScheme() {
-        return scheme;
-    }
-
-    public void setScheme(String scheme) {
-        this.scheme = scheme;
+    /**
+     * Builds a proxy from a URI such as {@code http://user:pass@host:8080}.
+     */
+    public static Proxy create(final URI uri) {
+        String username = null;
+        String password = null;
+        String userInfo = uri.getUserInfo();
+        if (userInfo != null) {
+            // limit 2: a bare ":" no longer yields an empty array, and a ':' inside the password is kept
+            String[] up = userInfo.split(":", 2);
+            username = up[0].isEmpty() ? null : up[0];
+            password = (up.length < 2 || up[1].isEmpty()) ? null : up[1];
+        }
+        return new Proxy(uri.getHost(), uri.getPort(), uri.getScheme(), username, password);
     }
 
-	public String getHost() {
+    public String getHost() {
         return host;
     }
 
@@ -68,6 +68,8 @@ public int getPort() {
         return port;
     }
 
+    public String getScheme(){return scheme;}
+
     public String getUsername() {
         return username;
     }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
index b4e7b484d..b567d582f 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
@@ -1,6 +1,5 @@
 package us.codecraft.webmagic.proxy;
 
-import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Task;
 
 /**
@@ -10,32 +9,6 @@
  */
 public interface ProxyProvider {
 
-    /**
-     *
-     * Return proxy to Provider when complete a download.
-     * @param proxy the proxy config contains host,port and identify info
-     * @param page the download result
-     * @param task the download task
-     */
-    void returnProxy(Proxy proxy, Page page, Task task);
-
-    /**
-     *  代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了
-     *
-     * @param task  下载任务
-     * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行
-     */
-    void refreshProxy(Task task,Proxy proxy);
-
-
-    /**
-     *
-     * 获取当前正在提供的代理
-     *
-     * @param task
-     * @return
-     */
-    Proxy getCurrentProxy(Task task);
 
     /**
      * Get a proxy for task by some strategy.
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java
new file mode 100644
index 000000000..77e1ce2c4
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java
@@ -0,0 +1,30 @@
+package us.codecraft.webmagic.proxy;
+
+import us.codecraft.webmagic.Task;
+
+/**
+ * A proxy provider whose proxy can be refreshed manually.
+ *
+ * @author yaoqiang
+ */
+public interface RefreshableProxyProvider extends ProxyProvider{
+
+    /**
+     * Tells the provider that a proxy should be refreshed. Proxy IPs are a scarce resource: a provider
+     * may keep handing out an IP that has not expired internally even though it no longer works.
+     *
+     * @param task  the crawl task
+     * @param proxy the proxy to check: the provider refreshes only if it is still holding this faulty proxy, otherwise it carries on
+     */
+    void refreshProxy(Task task,Proxy proxy);
+
+
+    /**
+     * Gets the proxy that is currently being provided.
+     *
+     * @param task the running crawl task
+     * @return the proxy currently in use
+     */
+    Proxy getCurrentProxy(Task task);
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java
new file mode 100644
index 000000000..43b49fc3e
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java
@@ -0,0 +1,22 @@
+package us.codecraft.webmagic.proxy;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Task;
+
+/**
+ * A proxy provider that accepts proxies back: a proxy taken from it can be returned once it has been used.
+ *
+ * @author yaoqiang
+ */
+public interface ReturnableProxyProvider {
+
+    /**
+     * Returns a proxy to the provider when a download has completed.
+     *
+     * @param proxy the proxy configuration, containing host, port and identity info
+     * @param page the download result
+     * @param task the download task
+     */
+    void returnProxy(Proxy proxy, Page page, Task task);
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
index 8ad9ce7b1..fda3e2384 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
@@ -1,6 +1,5 @@
 package us.codecraft.webmagic.proxy;
 
-import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Task;
 
 import java.util.ArrayList;
@@ -30,15 +29,6 @@ private SimpleProxyProvider(List proxies, AtomicInteger pointer) {
         this.pointer = pointer;
     }
 
-    @Override
-    public Proxy getCurrentProxy(Task task) {
-        return null;
-    }
-
-    @Override
-    public void refreshProxy(Task task,Proxy proxy) {
-
-    }
 
     public static SimpleProxyProvider from(Proxy... proxies) {
         List proxiesTemp = new ArrayList(proxies.length);
@@ -48,11 +38,6 @@ public static SimpleProxyProvider from(Proxy... proxies) {
         return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp));
     }
 
-    @Override
-    public void returnProxy(Proxy proxy, Page page, Task task) {
-        //Donothing
-    }
-
     @Override
     public Proxy getProxy(Task task) {
         return proxies.get(incrForLoop());

From f68795d7dd1ad3202a59ab9d49030065992001b1 Mon Sep 17 00:00:00 2001
From: yao 
Date: Tue, 29 Dec 2020 16:54:38 +0800
Subject: [PATCH 083/257] =?UTF-8?q?=20bug=E4=BF=AE=E6=94=B9=EF=BC=8C?=
 =?UTF-8?q?=E5=AF=B9=E7=BB=93=E6=9E=9C=E6=8F=90=E4=BE=9B=E7=BC=93=E5=AD=98?=
 =?UTF-8?q?=E8=83=BD=E5=8A=9B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../webmagic/pipeline/CachePipeline.java      | 18 ++++
 .../pipeline/CloseableCachePipeline.java      | 87 +++++++++++++++++++
 .../AbstractRefreshableProxyProvider.java     | 20 +----
 .../webmagic/proxy/ExpirableProxy.java        |  2 +
 4 files changed, 111 insertions(+), 16 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java
new file mode 100644
index 000000000..e0acd9095
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java
@@ -0,0 +1,18 @@
+package us.codecraft.webmagic.pipeline;
+
+import java.util.Collection;
+
+/**
+ * @author yaoqiang
+ *
+ * 为pipeline提供缓存能力
+ * 在某个时机执行批处理任务
+ */
+public interface CachePipeline extends Pipeline{
+
+    /**
+     * @param collection  缓存批处理
+     *
+     */
+    void process(Collection collection);
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java
new file mode 100644
index 000000000..4ba433e32
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java
@@ -0,0 +1,87 @@
+package us.codecraft.webmagic.pipeline;
+
+import lombok.extern.slf4j.Slf4j;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+
+/**
+ * @author yaoqiang
+ * 提供关闭时刷新能力
+ * 

+ *

+ * 不负责创建 {@link ExecutorService},如果需要异步执行,那么需要从外界传入,由外界自己管理 {@link ExecutorService}生命周期 + * @see ExecutorService + */ +@Slf4j +public abstract class CloseableCachePipeline implements CachePipeline, Closeable { + + + private final BlockingQueue cache; + + private final ExecutorService executorService; + + public CloseableCachePipeline(int max, ExecutorService executorService) { + this.cache = new ArrayBlockingQueue<>(max, false); + this.executorService = executorService; + } + + public CloseableCachePipeline(int max) { + this(max, null); + } + + /** + * @param resultItems 接收到的信息 + * @param task 执行的任务 + */ + @Override + public final void process(ResultItems resultItems, Task task) { + try { + cache.put(resultItems); + } catch (InterruptedException e) { + e.printStackTrace(); + Thread.currentThread().interrupt(); + log.error(e.getMessage(), e); + } + if (cache.remainingCapacity() == 0) { + // set 中的resultItem 使用权依然传递了出去,cache的使用全保留,考虑到后面也用不上 resultItem,所以发布出去问题也不大 + // temp的修改权限被发布,理由同上,想添加,删除都可以,反正以后也用不上了; + Set temp = new HashSet<>(cache); + if (executorService != null && !executorService.isShutdown()) { + executorService.execute(() -> process(temp, task)); + } else { + process(temp, task); + } + cache.clear(); + } + + } + + protected abstract void process(Collection resultItems, Task task); + + private synchronized void flush(Collection resultItems) { + process(resultItems, null); + + } + + /** + * 结合源码,实现关闭时处理剩余的缓存,直接交由主线程处理 + * + * @throws IOException 关闭可能出现异常,由上层处理 + */ + @Override + public final void close() throws IOException { + if (!cache.isEmpty()) { + flush(cache); + cache.clear(); + } + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java index 8e7cb08ac..781553c8c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java @@ -3,12 +3,9 @@ import lombok.extern.slf4j.Slf4j; import us.codecraft.webmagic.Task; -import java.math.BigDecimal; -import java.math.RoundingMode; import java.util.Comparator; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicReference; -import java.util.concurrent.atomic.LongAdder; /** * @author yaoqiang @@ -17,26 +14,18 @@ @Slf4j public abstract class AbstractRefreshableProxyProvider implements RefreshableProxyProvider { - private final LongAdder totalGet = new LongAdder(); - - private final LongAdder canUse = new LongAdder(); private final AtomicReference> usedProxyCache = new AtomicReference<>(); private final PriorityBlockingQueue ipQueue = new PriorityBlockingQueue<>(1000, Comparator.comparing(ExpirableProxy::getExpireTime)); - private final int maxHostNum; - public AbstractRefreshableProxyProvider(int maxHostNum) { - this.maxHostNum = maxHostNum; + protected void doPut(ExpirableProxy expirableProxy) { + ipQueue.put(expirableProxy); } - protected void doPut(ExpirableProxy expirableProxy) { - synchronized (ipQueue) { - if (ipQueue.size() <= maxHostNum) { - ipQueue.put(expirableProxy); - } - } + protected int hostSize() { + return ipQueue.size(); } @Override @@ -127,7 +116,6 @@ private Proxy doGet() throws InterruptedException { do { proxy = ipQueue.take(); } while (proxy.isExpire()); - log.info("切换到proxy:ip:{},port:{},ip可用率:{}", proxy.getHost(), proxy.getPort(), BigDecimal.valueOf(canUse.sum()).divide(BigDecimal.valueOf(totalGet.sum()), 2, RoundingMode.HALF_DOWN).doubleValue()); return proxy; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java index f23caaf5f..d22b7dc9a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.proxy; +import lombok.Getter; import org.apache.http.annotation.Contract; import org.apache.http.annotation.ThreadingBehavior; @@ -13,6 +14,7 @@ */ @Contract(threading = ThreadingBehavior.IMMUTABLE_CONDITIONAL) public class ExpirableProxy extends Proxy { + @Getter private final int ttl; private final LocalDateTime expireTime; From 328c3e0d7d9dfc1da411231745fcb4cf30ce92f5 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 2 Jan 2021 19:41:05 +0800 Subject: [PATCH 084/257] Remove useless imports to fix build. --- .../src/main/java/us/codecraft/webmagic/Site.java | 10 +++++++--- .../main/java/us/codecraft/webmagic/proxy/Proxy.java | 1 - 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index bf603b3ce..edbf934da 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,9 +1,13 @@ package us.codecraft.webmagic; -import com.sun.org.apache.regexp.internal.RE; -import us.codecraft.webmagic.utils.HttpConstant; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.UUID; -import java.util.*; +import us.codecraft.webmagic.utils.HttpConstant; /** * Object contains setting for crawler.
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index ffae4be4b..878f8ae33 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -6,7 +6,6 @@ import java.net.URLEncoder; import java.nio.charset.StandardCharsets; -import jdk.nashorn.internal.ir.annotations.Immutable; import org.apache.commons.lang3.StringUtils; import org.apache.http.annotation.Contract; import org.apache.http.annotation.ThreadingBehavior; From aabc5584b85f3231c8b3cd2f7e9bd944a2cb10f3 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 2 Jan 2021 20:13:53 +0800 Subject: [PATCH 085/257] =?UTF-8?q?Revert=20"=20bug=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=EF=BC=8C=E5=AF=B9=E7=BB=93=E6=9E=9C=E6=8F=90=E4=BE=9B=E7=BC=93?= =?UTF-8?q?=E5=AD=98=E8=83=BD=E5=8A=9B"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f68795d7dd1ad3202a59ab9d49030065992001b1. 
--- .../webmagic/pipeline/CachePipeline.java | 18 ---- .../pipeline/CloseableCachePipeline.java | 87 ------------------- .../AbstractRefreshableProxyProvider.java | 20 ++++- .../webmagic/proxy/ExpirableProxy.java | 2 - 4 files changed, 16 insertions(+), 111 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java deleted file mode 100644 index e0acd9095..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CachePipeline.java +++ /dev/null @@ -1,18 +0,0 @@ -package us.codecraft.webmagic.pipeline; - -import java.util.Collection; - -/** - * @author yaoqiang - * - * 为pipeline提供缓存能力 - * 在某个时机执行批处理任务 - */ -public interface CachePipeline extends Pipeline{ - - /** - * @param collection 缓存批处理 - * - */ - void process(Collection collection); -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java deleted file mode 100644 index 4ba433e32..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CloseableCachePipeline.java +++ /dev/null @@ -1,87 +0,0 @@ -package us.codecraft.webmagic.pipeline; - -import lombok.extern.slf4j.Slf4j; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; - -import java.io.Closeable; -import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; -import java.util.Set; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.ExecutorService; - -/** - * @author yaoqiang - * 提供关闭时刷新能力 - *

- *

- * 不负责创建 {@link ExecutorService},如果需要异步执行,那么需要从外界传入,由外界自己管理 {@link ExecutorService}生命周期 - * @see ExecutorService - */ -@Slf4j -public abstract class CloseableCachePipeline implements CachePipeline, Closeable { - - - private final BlockingQueue cache; - - private final ExecutorService executorService; - - public CloseableCachePipeline(int max, ExecutorService executorService) { - this.cache = new ArrayBlockingQueue<>(max, false); - this.executorService = executorService; - } - - public CloseableCachePipeline(int max) { - this(max, null); - } - - /** - * @param resultItems 接收到的信息 - * @param task 执行的任务 - */ - @Override - public final void process(ResultItems resultItems, Task task) { - try { - cache.put(resultItems); - } catch (InterruptedException e) { - e.printStackTrace(); - Thread.currentThread().interrupt(); - log.error(e.getMessage(), e); - } - if (cache.remainingCapacity() == 0) { - // set 中的resultItem 使用权依然传递了出去,cache的使用全保留,考虑到后面也用不上 resultItem,所以发布出去问题也不大 - // temp的修改权限被发布,理由同上,想添加,删除都可以,反正以后也用不上了; - Set temp = new HashSet<>(cache); - if (executorService != null && !executorService.isShutdown()) { - executorService.execute(() -> process(temp, task)); - } else { - process(temp, task); - } - cache.clear(); - } - - } - - protected abstract void process(Collection resultItems, Task task); - - private synchronized void flush(Collection resultItems) { - process(resultItems, null); - - } - - /** - * 结合源码,实现关闭时处理剩余的缓存,直接交由主线程处理 - * - * @throws IOException 关闭可能出现异常,由上层处理 - */ - @Override - public final void close() throws IOException { - if (!cache.isEmpty()) { - flush(cache); - cache.clear(); - } - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java index 781553c8c..8e7cb08ac 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java @@ -3,9 +3,12 @@ import lombok.extern.slf4j.Slf4j; import us.codecraft.webmagic.Task; +import java.math.BigDecimal; +import java.math.RoundingMode; import java.util.Comparator; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicReference; +import java.util.concurrent.atomic.LongAdder; /** * @author yaoqiang @@ -14,18 +17,26 @@ @Slf4j public abstract class AbstractRefreshableProxyProvider implements RefreshableProxyProvider { + private final LongAdder totalGet = new LongAdder(); + + private final LongAdder canUse = new LongAdder(); private final AtomicReference> usedProxyCache = new AtomicReference<>(); private final PriorityBlockingQueue ipQueue = new PriorityBlockingQueue<>(1000, Comparator.comparing(ExpirableProxy::getExpireTime)); + private final int maxHostNum; - protected void doPut(ExpirableProxy expirableProxy) { - ipQueue.put(expirableProxy); + public AbstractRefreshableProxyProvider(int maxHostNum) { + this.maxHostNum = maxHostNum; } - protected int hostSize() { - return ipQueue.size(); + protected void doPut(ExpirableProxy expirableProxy) { + synchronized (ipQueue) { + if (ipQueue.size() <= maxHostNum) { + ipQueue.put(expirableProxy); + } + } } @Override @@ -116,6 +127,7 @@ private Proxy doGet() throws InterruptedException { do { proxy = ipQueue.take(); } while (proxy.isExpire()); + log.info("切换到proxy:ip:{},port:{},ip可用率:{}", proxy.getHost(), proxy.getPort(), BigDecimal.valueOf(canUse.sum()).divide(BigDecimal.valueOf(totalGet.sum()), 2, RoundingMode.HALF_DOWN).doubleValue()); return proxy; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java index d22b7dc9a..f23caaf5f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.proxy; -import lombok.Getter; import org.apache.http.annotation.Contract; import org.apache.http.annotation.ThreadingBehavior; @@ -14,7 +13,6 @@ */ @Contract(threading = ThreadingBehavior.IMMUTABLE_CONDITIONAL) public class ExpirableProxy extends Proxy { - @Getter private final int ttl; private final LocalDateTime expireTime; From 3f756c93254c41b59bd6863f70e003dac95cc488 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 2 Jan 2021 20:14:01 +0800 Subject: [PATCH 086/257] =?UTF-8?q?Revert=20"=20=E4=BB=A3=E7=90=86?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E6=89=A9=E5=B1=95=EF=BC=8C=E5=AF=B9=E5=8E=9F?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E6=8F=90=E4=BE=9B=E5=95=86=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E6=8B=86=E5=88=86=EF=BC=8C=E5=8A=A0=E5=85=A5lombok"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 33906e36f48588f8d1a44331d1a21fbcd3a5f9d7. 
--- webmagic-core/pom.xml | 9 +- .../java/us/codecraft/webmagic/Spider.java | 2 +- .../downloader/HttpClientDownloader.java | 15 +- .../AbstractRefreshableProxyProvider.java | 135 ------------------ .../webmagic/proxy/ExpirableProxy.java | 34 ----- .../us/codecraft/webmagic/proxy/Proxy.java | 65 +++++---- .../webmagic/proxy/ProxyProvider.java | 27 ++++ .../proxy/RefreshableProxyProvider.java | 30 ---- .../proxy/ReturnableProxyProvider.java | 22 --- .../webmagic/proxy/SimpleProxyProvider.java | 15 ++ 10 files changed, 82 insertions(+), 272 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 0cea05fe8..4b89cac10 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -1,6 +1,5 @@ - + us.codecraft webmagic-parent @@ -25,12 +24,6 @@ org.apache.commons commons-lang3 - - org.projectlombok - lombok - 1.18.10 - provided - us.codecraft diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index aa8c5f3fc..1052f06e0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -426,7 +426,6 @@ private void onDownloadSuccess(Request request, Page page) { } } else if(site.getRefreshCode().contains(page.getStatusCode())) { logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); - downloader.refreshComponent(this); failHandler(request); }else { logger.info("page status code error, page {} , code: {}", 
request.getUrl(), page.getStatusCode()); @@ -440,6 +439,7 @@ private void onDownloaderFail(Request request) { } private void failHandler(Request request){ + downloader.refreshComponent(this); if (site.getCycleRetryTimes() == 0) { sleep(site.getSleepTime()); } else { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 8e8676d0f..ace817554 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -13,8 +13,6 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyProvider; -import us.codecraft.webmagic.proxy.RefreshableProxyProvider; -import us.codecraft.webmagic.proxy.ReturnableProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils; @@ -95,8 +93,8 @@ public Page download(Request request, Task task) { } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); onError(request, e, proxyProvider); - if (proxyProvider != null && proxy != null && proxyProvider instanceof RefreshableProxyProvider && refreshProxyOnError.test(e)) { - ((RefreshableProxyProvider)proxyProvider).refreshProxy(task,proxy); + if (proxyProvider != null && refreshProxyOnError.test(e)) { + proxyProvider.refreshProxy(task,proxy); } if(refreshClientOnError.test(e)) { httpClients.remove(task.getSite().getDomain()); @@ -107,9 +105,8 @@ public Page download(Request request, Task task) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } - if (proxyProvider != null && proxy != null && proxyProvider instanceof ReturnableProxyProvider) { - ((ReturnableProxyProvider) 
proxyProvider).returnProxy(proxy, page, task); - + if (proxyProvider != null && proxy != null) { + proxyProvider.returnProxy(proxy, page, task); } } } @@ -117,8 +114,8 @@ public Page download(Request request, Task task) { @Override public void refreshComponent(Task task) { - if (proxyProvider != null && proxyProvider instanceof RefreshableProxyProvider) { - ((RefreshableProxyProvider) proxyProvider).refreshProxy(task, ((RefreshableProxyProvider) proxyProvider).getCurrentProxy(task)); + if (proxyProvider != null ) { + proxyProvider.refreshProxy(task,proxyProvider.getCurrentProxy(task)); } httpClients.remove(task.getSite().getDomain()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java deleted file mode 100644 index 8e7cb08ac..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/AbstractRefreshableProxyProvider.java +++ /dev/null @@ -1,135 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import lombok.extern.slf4j.Slf4j; -import us.codecraft.webmagic.Task; - -import java.math.BigDecimal; -import java.math.RoundingMode; -import java.util.Comparator; -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicReference; -import java.util.concurrent.atomic.LongAdder; - -/** - * @author yaoqiang - * 可刷新的代理提供商抽象实现 - */ -@Slf4j -public abstract class AbstractRefreshableProxyProvider implements RefreshableProxyProvider { - - private final LongAdder totalGet = new LongAdder(); - - private final LongAdder canUse = new LongAdder(); - - private final AtomicReference> usedProxyCache = new AtomicReference<>(); - - private final PriorityBlockingQueue ipQueue = new PriorityBlockingQueue<>(1000, Comparator.comparing(ExpirableProxy::getExpireTime)); - - private final int maxHostNum; - - public AbstractRefreshableProxyProvider(int maxHostNum) { - this.maxHostNum = maxHostNum; - } - - protected void 
doPut(ExpirableProxy expirableProxy) { - synchronized (ipQueue) { - if (ipQueue.size() <= maxHostNum) { - ipQueue.put(expirableProxy); - } - } - } - - @Override - public void refreshProxy(Task task, Proxy proxy) { - if (proxy != null) { - FutureTask proxyFutureTask = usedProxyCache.get(); - Proxy currentProxy = getCurrentProxy(task); - // 如果在出错到这里的过程中,usedProxyCache被更新过,proxy 就不可能相等,如果依然相等,说明没有更新过 - // 可能没有使用代理的情况 - if (proxy.equals(currentProxy)) { - // 如果此时依然没有更新过,就设置为空 - usedProxyCache.compareAndSet(proxyFutureTask, null); - } - } - } - - @Override - public Proxy getCurrentProxy(Task task) { - FutureTask cache = usedProxyCache.get(); - Proxy currentProxy = null; - try { - if (cache != null) - currentProxy = cache.get(5, TimeUnit.SECONDS); - } catch (InterruptedException e) { - e.printStackTrace(); - log.error(e.getMessage(), e); - Thread.currentThread().interrupt(); - } catch (ExecutionException e) { - e.printStackTrace(); - log.error(e.getCause().getMessage(), e); - } catch (TimeoutException e) { - log.error(e.getMessage(), e); - e.printStackTrace(); - } - return currentProxy; - } - - - private FutureTask buildCacheTask() { - return new FutureTask<>(this::doGet); - } - - - /** - * 特别注意,防止活锁,集cache中总是抛出异常,那么将无限循环,无限报错 - * - * @param task 下载任务 - * @return 返回代理 - */ - @Override - public Proxy getProxy(Task task) { - while (!Thread.currentThread().isInterrupted()) { - FutureTask cache = usedProxyCache.get(); - if (cache == null) { - FutureTask futureTask = buildCacheTask(); - if (usedProxyCache.compareAndSet(null, futureTask)) { - cache = futureTask; - futureTask.run(); - } else { - // 交换失败,需要更新到最新数据 - cache = usedProxyCache.get(); - } - } - try { - if (cache != null) { - - ExpirableProxy proxy = (ExpirableProxy) cache.get(5, TimeUnit.SECONDS); - if (!proxy.isExpire()) - return proxy; - } - usedProxyCache.compareAndSet(cache, null); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - log.error(e.getMessage(), e); - 
usedProxyCache.compareAndSet(cache, null); - } catch (ExecutionException e) { - log.error(e.getMessage(), e); - usedProxyCache.compareAndSet(cache, null); - } catch (TimeoutException e) { - log.error(e.getMessage(), e); - } - } - return null; - } - - private Proxy doGet() throws InterruptedException { - ExpirableProxy proxy; - do { - proxy = ipQueue.take(); - } while (proxy.isExpire()); - log.info("切换到proxy:ip:{},port:{},ip可用率:{}", proxy.getHost(), proxy.getPort(), BigDecimal.valueOf(canUse.sum()).divide(BigDecimal.valueOf(totalGet.sum()), 2, RoundingMode.HALF_DOWN).doubleValue()); - return proxy; - } - - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java deleted file mode 100644 index f23caaf5f..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ExpirableProxy.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import org.apache.http.annotation.Contract; -import org.apache.http.annotation.ThreadingBehavior; - -import java.time.LocalDateTime; -import java.time.temporal.ChronoUnit; - -/** - * @author yaoqiang - * - * 可以过期的代理 - */ -@Contract(threading = ThreadingBehavior.IMMUTABLE_CONDITIONAL) -public class ExpirableProxy extends Proxy { - private final int ttl; - private final LocalDateTime expireTime; - - - public ExpirableProxy(String host, int port, int ttl, ChronoUnit chronoUnit) { - super(host, port); - this.ttl = ttl; - this.expireTime = LocalDateTime.now().plus(ttl, chronoUnit); - - } - - public boolean isExpire() { - return LocalDateTime.now().isAfter(expireTime); - } - public LocalDateTime getExpireTime(){ - return expireTime; - } - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 878f8ae33..6554fab51 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -7,28 +7,32 @@ import java.nio.charset.StandardCharsets; import org.apache.commons.lang3.StringUtils; -import org.apache.http.annotation.Contract; -import org.apache.http.annotation.ThreadingBehavior; -@Contract(threading = ThreadingBehavior.IMMUTABLE) public class Proxy { - private final String scheme; + private String scheme; - private final String host; + private String host; - private final int port; + private int port; - private final String username; + private String username; - private final String password; + private String password; - public Proxy(String host, int port, String scheme, String username, String password) { - this.scheme = scheme; - this.host = host; - this.port = port; - this.username = username; - this.password = password; + public static Proxy create(final URI uri) { + Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme()); + String userInfo = uri.getUserInfo(); + if (userInfo != null) { + String[] up = userInfo.split(":"); + if (up.length == 1) { + proxy.username = up[0].isEmpty() ? null : up[0]; + } else { + proxy.username = up[0].isEmpty() ? null : up[0]; + proxy.password = up[1].isEmpty() ? null : up[1]; + } + } + return proxy; } public Proxy(String host, int port) { @@ -36,30 +40,27 @@ public Proxy(String host, int port) { } public Proxy(String host, int port, String scheme) { - this(host, port, scheme, null, null); + this.host = host; + this.port = port; + this.scheme = scheme; } public Proxy(String host, int port, String username, String password) { - this(host, port, null, username, password); + this.host = host; + this.port = port; + this.username = username; + this.password = password; } - public static Proxy create(final URI uri) { - String userInfo = uri.getUserInfo(); - String username = null; - String password = null; - if (userInfo != null) { - String[] up = userInfo.split(":"); - if (up.length == 1) { - username = up[0].isEmpty() ? 
null : up[0]; - } else { - username = up[0].isEmpty() ? null : up[0]; - password = up[1].isEmpty() ? null : up[1]; - } - } - return new Proxy(uri.getHost(), uri.getPort(), uri.getScheme(), username, password); + public String getScheme() { + return scheme; } - public String getHost() { + public void setScheme(String scheme) { + this.scheme = scheme; + } + + public String getHost() { return host; } @@ -67,8 +68,6 @@ public int getPort() { return port; } - public String getScheme(){return scheme;} - public String getUsername() { return username; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index b567d582f..b4e7b484d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.proxy; +import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; /** @@ -9,6 +10,32 @@ */ public interface ProxyProvider { + /** + * + * Return proxy to Provider when complete a download. + * @param proxy the proxy config contains host,port and identify info + * @param page the download result + * @param task the download task + */ + void returnProxy(Proxy proxy, Page page, Task task); + + /** + * 代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了 + * + * @param task 下载任务 + * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行 + */ + void refreshProxy(Task task,Proxy proxy); + + + /** + * + * 获取当前正在提供的代理 + * + * @param task + * @return + */ + Proxy getCurrentProxy(Task task); /** * Get a proxy for task by some strategy. 
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java deleted file mode 100644 index 77e1ce2c4..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/RefreshableProxyProvider.java +++ /dev/null @@ -1,30 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import us.codecraft.webmagic.Task; - -/** - * @author yaoqiang - * - * 可以手动刷新的代理供应商 - */ -public interface RefreshableProxyProvider extends ProxyProvider{ - - /** - * 代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了 - * - * @param task 爬虫任务 - * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行 - */ - void refreshProxy(Task task,Proxy proxy); - - - /** - * - * 获取当前正在提供的代理 - * - * @param task 工作中的爬虫任务 - * @return 获取当前正在使用的代理 - */ - Proxy getCurrentProxy(Task task); - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java deleted file mode 100644 index 43b49fc3e..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ReturnableProxyProvider.java +++ /dev/null @@ -1,22 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Task; - -/** - * @author yaoqiang - * - * 可归还的代理提供商,代理被取出后,实用完成,可以归还给代理提供商 - */ -public interface ReturnableProxyProvider { - - /** - * - * Return proxy to Provider when complete a download. 
- * @param proxy the proxy config contains host,port and identify info - * @param page the download result - * @param task the download task - */ - void returnProxy(Proxy proxy, Page page, Task task); - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index fda3e2384..8ad9ce7b1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.proxy; +import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; import java.util.ArrayList; @@ -29,6 +30,15 @@ private SimpleProxyProvider(List proxies, AtomicInteger pointer) { this.pointer = pointer; } + @Override + public Proxy getCurrentProxy(Task task) { + return null; + } + + @Override + public void refreshProxy(Task task,Proxy proxy) { + + } public static SimpleProxyProvider from(Proxy... proxies) { List proxiesTemp = new ArrayList(proxies.length); @@ -38,6 +48,11 @@ public static SimpleProxyProvider from(Proxy... 
proxies) { return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp)); } + @Override + public void returnProxy(Proxy proxy, Page page, Task task) { + //Donothing + } + @Override public Proxy getProxy(Task task) { return proxies.get(incrForLoop()); From 4bedd97267568b4e45ee3e49fa5fd3c8439a38d1 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 2 Jan 2021 20:14:02 +0800 Subject: [PATCH 087/257] =?UTF-8?q?Revert=20"=20=E5=88=B7=E6=96=B0?= =?UTF-8?q?=E4=BB=A3=E7=90=86api=E9=87=8D=E6=9E=84=EF=BC=8C=E9=9C=80?= =?UTF-8?q?=E8=A6=81=E6=8F=90=E4=BE=9B=E6=97=A7=E4=BB=A3=E7=90=86=EF=BC=8C?= =?UTF-8?q?=E5=A6=82=E6=9E=9C=E4=BE=9D=E7=84=B6=E6=98=AF=E6=97=A7=E4=BB=A3?= =?UTF-8?q?=E7=90=86=EF=BC=8C=E6=89=8D=E8=BF=9B=E8=A1=8C=E5=88=B7=E6=96=B0?= =?UTF-8?q?=EF=BC=8C=E9=98=B2=E6=AD=A2=E5=BA=94=E5=BB=B6=E8=BF=9F=E5=93=8D?= =?UTF-8?q?=E5=BA=94=E9=80=A0=E6=88=90=E7=9A=84=E8=BF=87=E5=BA=A6=E5=88=B7?= =?UTF-8?q?=E6=96=B0"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 0aa2c3949d29e4c02c199eb30c7adae8f244e1ee. 
--- .../src/main/java/us/codecraft/webmagic/Spider.java | 7 +------ .../webmagic/downloader/HttpClientDownloader.java | 6 +++--- .../webmagic/downloader/HttpClientGenerator.java | 1 - .../us/codecraft/webmagic/proxy/ProxyProvider.java | 13 +------------ .../webmagic/proxy/SimpleProxyProvider.java | 7 +------ 5 files changed, 6 insertions(+), 28 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 1052f06e0..474b7433e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -426,7 +426,7 @@ private void onDownloadSuccess(Request request, Page page) { } } else if(site.getRefreshCode().contains(page.getStatusCode())) { logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); - failHandler(request); + downloader.refreshComponent(this); }else { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } @@ -435,11 +435,6 @@ private void onDownloadSuccess(Request request, Page page) { } private void onDownloaderFail(Request request) { - failHandler(request); - } - - private void failHandler(Request request){ - downloader.refreshComponent(this); if (site.getCycleRetryTimes() == 0) { sleep(site.getSleepTime()); } else { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index ace817554..eed49fb4a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -54,7 +54,7 @@ public void setRefreshClientOnError(Predicate clientOnError){ this.refreshClientOnError = clientOnError; } public void 
setRefreshProxyOnError(Predicate proxyOnError) { - this.refreshProxyOnError = proxyOnError; + this.refreshProxyOnError = refreshProxyOnError; } public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { @@ -94,7 +94,7 @@ public Page download(Request request, Task task) { logger.warn("download page {} error", request.getUrl(), e); onError(request, e, proxyProvider); if (proxyProvider != null && refreshProxyOnError.test(e)) { - proxyProvider.refreshProxy(task,proxy); + proxyProvider.refreshProxy(task); } if(refreshClientOnError.test(e)) { httpClients.remove(task.getSite().getDomain()); @@ -115,7 +115,7 @@ public Page download(Request request, Task task) { @Override public void refreshComponent(Task task) { if (proxyProvider != null ) { - proxyProvider.refreshProxy(task,proxyProvider.getCurrentProxy(task)); + proxyProvider.refreshProxy(task); } httpClients.remove(task.getSite().getDomain()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 2d27b79a2..1f20c5a58 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -143,7 +143,6 @@ public void process( SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); - SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index b4e7b484d..da3bec96a 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -23,19 +23,8 @@ public interface ProxyProvider { * 代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了 * * @param task 下载任务 - * @param proxy 需要对代理进行验证,如果确实持有的时错误代理,则刷新,否则,继续执行 */ - void refreshProxy(Task task,Proxy proxy); - - - /** - * - * 获取当前正在提供的代理 - * - * @param task - * @return - */ - Proxy getCurrentProxy(Task task); + void refreshProxy(Task task); /** * Get a proxy for task by some strategy. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index 8ad9ce7b1..fd80b3009 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -31,12 +31,7 @@ private SimpleProxyProvider(List proxies, AtomicInteger pointer) { } @Override - public Proxy getCurrentProxy(Task task) { - return null; - } - - @Override - public void refreshProxy(Task task,Proxy proxy) { + public void refreshProxy(Task task) { } From c489647c4b03bb922cb270e682df97bd9dbbc7ff Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 2 Jan 2021 20:15:10 +0800 Subject: [PATCH 088/257] =?UTF-8?q?Revert=20"=20Downloader=20=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E5=88=B7=E6=96=B0=E7=BB=84=E4=BB=B6=E7=9A=84api,?= =?UTF-8?q?=E6=96=B9=E4=BE=BF=E5=9C=A8spider=E4=B8=AD=E6=93=8D=E4=BD=9C"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 2e2a0fdf3e8e614d9a3af146dfa462d0e299ceb5. 
--- .../main/java/us/codecraft/webmagic/Site.java | 14 -------------- .../main/java/us/codecraft/webmagic/Spider.java | 5 +---- .../webmagic/downloader/Downloader.java | 10 +++------- .../downloader/HttpClientDownloader.java | 11 ----------- .../webmagic/downloader/HttpClientGenerator.java | 16 +++++----------- .../codecraft/webmagic/utils/HttpConstant.java | 1 - .../java/us/codecraft/webmagic/SpiderTest.java | 5 ----- .../downloader/MockGithubDownloader.java | 5 ----- .../webmagic/downloader/PhantomJSDownloader.java | 7 +------ .../downloader/MockGithubDownloader.java | 4 ---- .../downloader/selenium/SeleniumDownloader.java | 5 ----- 11 files changed, 10 insertions(+), 73 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index edbf934da..4879b2825 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -40,12 +40,8 @@ public class Site { private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); - private static final Set DEFAULT_REFRESH_CODE_SET = new HashSet<>(); - - private Set refreshCode = DEFAULT_REFRESH_CODE_SET; private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; - private Map headers = new HashMap(); private boolean useGzip = true; @@ -53,7 +49,6 @@ public class Site { private boolean disableCookieManagement = false; static { - DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN); DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); } @@ -202,15 +197,6 @@ public Site setAcceptStatCode(Set acceptStatCode) { return this; } - public Site setRefreshCode(Set refreshCode){ - this.refreshCode = refreshCode; - return this; - } - public Set getRefreshCode(){ - return refreshCode; - - } - /** * get acceptStatCode * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java 
index 474b7433e..a5ac8aa28 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -424,10 +424,7 @@ private void onDownloadSuccess(Request request, Page page) { pipeline.process(page.getResultItems(), this); } } - } else if(site.getRefreshCode().contains(page.getStatusCode())) { - logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); - downloader.refreshComponent(this); - }else { + } else { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index 50955012b..f7ced4932 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -18,18 +18,14 @@ public interface Downloader { * Downloads web pages and store in Page object. * * @param request request - * @param task task + * @param task task * @return page */ - Page download(Request request, Task task); + public Page download(Request request, Task task); /** * Tell the downloader how many threads the spider used. 
- * * @param threadNum number of threads */ - void setThread(int threadNum); - - - void refreshComponent(Task task); + public void setThread(int threadNum); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index eed49fb4a..f9f8c829f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -111,17 +111,6 @@ public Page download(Request request, Task task) { } } - - @Override - public void refreshComponent(Task task) { - if (proxyProvider != null ) { - proxyProvider.refreshProxy(task); - } - - httpClients.remove(task.getSite().getDomain()); - - } - @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 1f20c5a58..80e0f1085 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,17 +1,13 @@ package us.codecraft.webmagic.downloader; -import java.io.File; import java.io.IOException; import java.security.KeyManagementException; -import java.security.KeyStore; -import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map; import javax.net.ssl.SSLContext; -import javax.net.ssl.SSLContextSpi; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; @@ -28,7 +24,6 @@ import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; 
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.conn.ssl.TrustSelfSignedStrategy; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; @@ -37,7 +32,6 @@ import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; -import org.apache.http.ssl.SSLContexts; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -75,7 +69,7 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) { + } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); @@ -83,8 +77,8 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { return SSLConnectionSocketFactory.getSocketFactory(); } - private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException { -// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { + // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 X509TrustManager trustManager = new X509TrustManager() { @Override @@ -102,10 +96,10 @@ public X509Certificate[] getAcceptedIssuers() { }; - SSLContext sc = SSLContext.getInstance("SSLv3"); + SSLContext sc = SSLContext.getInstance("TLS"); sc.init(null, new TrustManager[] { trustManager }, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index bfacec351..2d6b8fe2a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -28,7 +28,6 @@ public static abstract class Method { public static abstract class StatusCode { public static final int CODE_200 = 200; - public static final int FORBIDDEN = 403; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 6b9c4232b..4f4a2806d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -57,11 +57,6 @@ public Site getSite() { return Site.me().setSleepTime(0); } }).setDownloader(new Downloader() { - @Override - public void refreshComponent(Task task) { - - } - @Override public Page download(Request request, Task task) { return new Page().setRawText(""); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 6d764a595..3aa742c10 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -28,11 +28,6 @@ public Page download(Request request, Task task) { return page; } - @Override - public void refreshComponent(Task task) { - - } - @Override public void setThread(int threadNum) { } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index f3751d650..6055bdb0f 100644 --- 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -42,12 +42,7 @@ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - - @Override - public void refreshComponent(Task task) { - - } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 774469292..91e3698cf 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -9,10 +9,6 @@
  * @author code4crafter@gmail.com
  */
 public class MockGithubDownloader implements Downloader{
-    @Override
-    public void refreshComponent(Task task) {
-
-    }
 
     private String html = "\n" +
             "\n" +
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index 11b235620..cce293fc9 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -59,11 +59,6 @@ public SeleniumDownloader() {
 		// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
 	}
 
-	@Override
-	public void refreshComponent(Task task) {
-
-	}
-
 	/**
 	 * set sleep time to wait until load success
 	 *

From 33e3fcdf221730a727c54daf8f18ac13bc861c90 Mon Sep 17 00:00:00 2001
From: Sutra Zhou 
Date: Sat, 2 Jan 2021 20:27:28 +0800
Subject: [PATCH 089/257] =?UTF-8?q?Revert=20"=E4=BB=A3=E7=90=86=E6=8E=A5?=
 =?UTF-8?q?=E5=8F=A3=E7=9A=84=E4=BF=AE=E6=94=B9=EF=BC=8C=E6=8F=90=E4=BE=9B?=
 =?UTF-8?q?=E5=88=B7=E6=98=9F=E4=BB=A3=E7=90=86API=E3=80=82downloader=20?=
 =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E9=94=99=E8=AF=AF=E6=97=B6=EF=BC=8C=E6=8F=90?=
 =?UTF-8?q?=E4=BE=9Brequest,exception,proxyProvider=E4=B8=89=E4=B8=AA?=
 =?UTF-8?q?=E5=8F=82=E6=95=B0=EF=BC=8C"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit ba69eba669d32fadbbe8b021b85b9b458d2db6aa.
---
 .../downloader/AbstractDownloader.java        |  3 +--
 .../downloader/HttpClientDownloader.java      | 22 +++++++------------
 .../webmagic/proxy/ProxyProvider.java         |  7 ------
 .../webmagic/proxy/SimpleProxyProvider.java   |  5 -----
 4 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
index 05f5686af..c27292d09 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
@@ -3,7 +3,6 @@
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.proxy.ProxyProvider;
 import us.codecraft.webmagic.selector.Html;
 
 /**
@@ -39,7 +38,7 @@ public Html download(String url, String charset) {
     protected void onSuccess(Request request) {
     }
 
-    protected void onError(Request request, Throwable throwable, ProxyProvider proxyProvider) {
+    protected void onError(Request request) {
     }
 
 }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index f9f8c829f..8cc52dbf1 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,5 +1,11 @@
 package us.codecraft.webmagic.downloader;
 
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.function.Predicate;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.http.HttpResponse;
 import org.apache.http.client.methods.CloseableHttpResponse;
@@ -7,6 +13,7 @@
 import org.apache.http.util.EntityUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Request;
 import us.codecraft.webmagic.Site;
@@ -17,13 +24,6 @@
 import us.codecraft.webmagic.utils.CharsetUtils;
 import us.codecraft.webmagic.utils.HttpClientUtils;
 
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.function.Predicate;
-
 
 /**
  * The http downloader based on HttpClient.
@@ -92,13 +92,7 @@ public Page download(Request request, Task task) {
             return page;
         } catch (IOException e) {
             logger.warn("download page {} error", request.getUrl(), e);
-            onError(request, e, proxyProvider);
-            if (proxyProvider != null  && refreshProxyOnError.test(e)) {
-                proxyProvider.refreshProxy(task);
-            }
-            if(refreshClientOnError.test(e)) {
-                httpClients.remove(task.getSite().getDomain());
-            }
+            onError(request);
             return page;
         } finally {
             if (httpResponse != null) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
index da3bec96a..0cef4ed42 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
@@ -19,13 +19,6 @@ public interface ProxyProvider {
      */
     void returnProxy(Proxy proxy, Page page, Task task);
 
-    /**
-     *  代理IP是珍贵资源,有可能代理提供者内部代理没有过期,就一直提供某个IP,但这个IP又不可以使用,所以提供一种方式通知提供者,这个代理该刷新了
-     *
-     * @param task  下载任务
-     */
-    void refreshProxy(Task task);
-
     /**
      * Get a proxy for task by some strategy.
      * @param task the download task
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
index fd80b3009..ddef6a88c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
@@ -30,11 +30,6 @@ private SimpleProxyProvider(List proxies, AtomicInteger pointer) {
         this.pointer = pointer;
     }
 
-    @Override
-    public void refreshProxy(Task task) {
-
-    }
-
     public static SimpleProxyProvider from(Proxy... proxies) {
         List proxiesTemp = new ArrayList(proxies.length);
         for (Proxy proxy : proxies) {

From 5ceccc62e076d54e46efb25bd3ddbd1357d278ba Mon Sep 17 00:00:00 2001
From: Sutra Zhou 
Date: Sat, 2 Jan 2021 20:31:53 +0800
Subject: [PATCH 090/257] =?UTF-8?q?Revert=20"=E6=8F=90=E4=BE=9B=E5=BC=82?=
 =?UTF-8?q?=E5=B8=B8=E5=88=B7=E6=96=B0httpClient=EF=BC=8C=E5=BC=82?=
 =?UTF-8?q?=E5=B8=B8=E5=8F=AF=E9=85=8D=E7=BD=AE=EF=BC=8C=E9=87=8D=E5=86=99?=
 =?UTF-8?q?getHttpClient=E4=BB=A3=E7=A0=81"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 19465089c3ad254e6f35b96cbe707bc6dd33ec62.
---
 .../downloader/HttpClientDownloader.java      | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 8cc52dbf1..1d308bcb5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -2,8 +2,8 @@
 
 import java.io.IOException;
 import java.nio.charset.Charset;
+import java.util.HashMap;
 import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
 import java.util.function.Predicate;
 
 import org.apache.commons.io.IOUtils;
@@ -24,7 +24,6 @@
 import us.codecraft.webmagic.utils.CharsetUtils;
 import us.codecraft.webmagic.utils.HttpClientUtils;
 
-
 /**
  * The http downloader based on HttpClient.
  *
@@ -33,7 +32,7 @@
  */
 public class HttpClientDownloader extends AbstractDownloader {
 
-    private final Map httpClients = new ConcurrentHashMap<>();
+    private final Map httpClients = new HashMap();
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private final HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
 
@@ -46,13 +45,6 @@ public class HttpClientDownloader extends AbstractDownloader {
 
     private Predicate refreshProxyOnError = t -> false;
 
-
-    private Predicate refreshClientOnError = t -> false;
-
-
-    public void setRefreshClientOnError(Predicate clientOnError){
-        this.refreshClientOnError = clientOnError;
-    }
     public void setRefreshProxyOnError(Predicate proxyOnError) {
         this.refreshProxyOnError = refreshProxyOnError;
     }
@@ -70,8 +62,17 @@ private CloseableHttpClient getHttpClient(Site site) {
             return httpClientGenerator.getClient(null);
         }
         String domain = site.getDomain();
-        return httpClients.computeIfAbsent(domain,k->httpClientGenerator.getClient(site));
-
+        CloseableHttpClient httpClient = httpClients.get(domain);
+        if (httpClient == null) {
+            synchronized (this) {
+                httpClient = httpClients.get(domain);
+                if (httpClient == null) {
+                    httpClient = httpClientGenerator.getClient(site);
+                    httpClients.put(domain, httpClient);
+                }
+            }
+        }
+        return httpClient;
     }
 
     @Override

From d0843bee0d1cce6a2e7f01f1182b493437f1a6fb Mon Sep 17 00:00:00 2001
From: Sutra Zhou 
Date: Sat, 2 Jan 2021 20:32:35 +0800
Subject: [PATCH 091/257] =?UTF-8?q?Revert=20"=E7=AE=80=E5=8C=96=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 9cc5287743de9715ec3ac10a20636377be41d060.
---
 .../downloader/HttpClientDownloader.java        | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 1d308bcb5..f204e3945 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -32,21 +32,24 @@
  */
 public class HttpClientDownloader extends AbstractDownloader {
 
+    private Logger logger = LoggerFactory.getLogger(getClass());
+
     private final Map httpClients = new HashMap();
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-    private final HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
+
+    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
 
     private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
 
     private ProxyProvider proxyProvider;
 
-    private final boolean responseHeader = true;
+    private boolean responseHeader = true;
 
+    private volatile boolean refreshProxyOnError = false;
 
-    private Predicate refreshProxyOnError = t -> false;
+    private Predicate throwablePredicate = t->false;
 
-    public void setRefreshProxyOnError(Predicate proxyOnError) {
-        this.refreshProxyOnError = refreshProxyOnError;
+    public void setThrowablePredicate(Predicate predicate){
+        this.throwablePredicate = predicate;
     }
 
     public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
@@ -116,7 +119,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
         String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
         Page page = new Page();
         page.setBytes(bytes);
-        if (!request.isBinaryContent()) {
+        if (!request.isBinaryContent()){
             if (charset == null) {
                 charset = getHtmlCharset(contentType, bytes);
             }

From 30daec480348e3b679dca5f07c8147f540134876 Mon Sep 17 00:00:00 2001
From: Sutra Zhou 
Date: Sat, 2 Jan 2021 20:33:17 +0800
Subject: [PATCH 092/257] =?UTF-8?q?Revert=20"=E6=8F=90=E4=BE=9B=E5=87=BA?=
 =?UTF-8?q?=E7=8E=B0=E6=9F=90=E7=A7=8D=E5=BC=82=E5=B8=B8=E5=88=B7=E6=96=B0?=
 =?UTF-8?q?=E4=BB=A3=E7=90=86=EF=BC=8C=E5=BC=82=E5=B8=B8=E5=8F=AF=E9=85=8D?=
 =?UTF-8?q?=E7=BD=AE"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 4a6441e7c5923c14d889c7f54af0ef15e5a05cb9.
---
 .../webmagic/downloader/HttpClientDownloader.java        | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index f204e3945..49217e111 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -4,7 +4,6 @@
 import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.function.Predicate;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.http.HttpResponse;
@@ -44,14 +43,6 @@ public class HttpClientDownloader extends AbstractDownloader {
 
     private boolean responseHeader = true;
 
-    private volatile boolean refreshProxyOnError = false;
-
-    private Predicate throwablePredicate = t->false;
-
-    public void setThrowablePredicate(Predicate predicate){
-        this.throwablePredicate = predicate;
-    }
-
     public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
         this.httpUriRequestConverter = httpUriRequestConverter;
     }

From ab6ff7f80939d89d1c35070052bd9923cf61de32 Mon Sep 17 00:00:00 2001
From: Sutra Zhou 
Date: Sat, 2 Jan 2021 20:33:32 +0800
Subject: [PATCH 093/257] =?UTF-8?q?Revert=20"pageCount=E4=BF=AE=E6=94=B9"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 9a71f0ac924615d21882e1faa4bbda0c2e5eb7d7.
---
 .../src/main/java/us/codecraft/webmagic/Spider.java      | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index a5ac8aa28..886e74a92 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -24,7 +24,6 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
-import java.util.concurrent.atomic.LongAdder;
 import java.util.concurrent.locks.Condition;
 import java.util.concurrent.locks.ReentrantLock;
 
@@ -103,7 +102,7 @@ public class Spider implements Runnable, Task {
 
     private List spiderListeners;
 
-    private final LongAdder pageCount = new LongAdder();
+    private final AtomicLong pageCount = new AtomicLong(0);
 
     private Date startTime;
 
@@ -324,7 +323,7 @@ public void run() {
                             onError(request);
                             logger.error("process request " + request + " error", e);
                         } finally {
-                            pageCount.increment();
+                            pageCount.incrementAndGet();
                             signalNewUrl();
                         }
                     }
@@ -336,7 +335,7 @@ public void run() {
         if (destroyWhenExit) {
             close();
         }
-        logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.sumThenReset());
+        logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
     }
 
     protected void onError(Request request) {
@@ -651,7 +650,7 @@ public boolean isSpawnUrl() {
      * @since 0.4.1
      */
     public long getPageCount() {
-        return pageCount.sum();
+        return pageCount.get();
     }
 
     /**

From e14a7626321d9324164b21ff23d4fc17be81d57d Mon Sep 17 00:00:00 2001
From: Sutra Zhou 
Date: Tue, 5 Jan 2021 23:14:24 +0800
Subject: [PATCH 094/257] Add gitflow-maven-plugin.

---
 pom.xml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/pom.xml b/pom.xml
index 6341bc0b5..0a24519cb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -317,7 +317,20 @@
                 maven-release-plugin
                 3.0.0-M1
             
+            
+                com.amashchenko.maven.plugin
+                gitflow-maven-plugin
+            
         
+        
+            
+                
+                    com.amashchenko.maven.plugin
+                    gitflow-maven-plugin
+                    1.15.0
+                
+            
+        
     
 
     

From 0d73f08ef6bdb3abb972b6ddcb3fd1737d93d8eb Mon Sep 17 00:00:00 2001
From: Sutra Zhou 
Date: Wed, 6 Jan 2021 02:29:34 +0800
Subject: [PATCH 095/257] Upgrade maven plugins.

---
 pom.xml | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/pom.xml b/pom.xml
index 0a24519cb..08250fd8c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -221,7 +221,6 @@
             
                 org.apache.maven.plugins
                 maven-surefire-plugin
-                3.0.0-M4
                 
                     0
                 
@@ -229,7 +228,6 @@
             
                 org.apache.maven.plugins
                 maven-compiler-plugin
-                3.8.1
                 
                     ${java.version}
                     ${java.version}
@@ -258,12 +256,10 @@
             
                 org.apache.maven.plugins
                 maven-resources-plugin
-                3.1.0
             
             
                 org.apache.maven.plugins
                 maven-jar-plugin
-                3.2.0
                 
                     
                         log4j.xml
@@ -324,6 +320,46 @@
         
         
             
+                
+                    org.apache.maven.plugins
+                    maven-clean-plugin
+                    3.1.0
+                
+                
+                    org.apache.maven.plugins
+                    maven-compiler-plugin
+                    3.8.1
+                
+                
+                    org.apache.maven.plugins
+                    maven-deploy-plugin
+                    3.0.0-M1
+                
+                
+                    org.apache.maven.plugins
+                    maven-install-plugin
+                    3.0.0-M1
+                
+                
+                    org.apache.maven.plugins
+                    maven-jar-plugin
+                    3.2.0
+                
+                
+                    org.apache.maven.plugins
+                    maven-resources-plugin
+                    3.1.0
+                
+                
+                    org.apache.maven.plugins
+                    maven-site-plugin
+                    3.9.0
+                
+                
+                    org.apache.maven.plugins
+                    maven-surefire-plugin
+                    3.0.0-M5
+                
                 
                     com.amashchenko.maven.plugin
                     gitflow-maven-plugin

From 0e01550a79883e7df6c0bd8d0b0ab31156a9412a Mon Sep 17 00:00:00 2001
From: Sutra Zhou 
Date: Wed, 6 Jan 2021 03:13:50 +0800
Subject: [PATCH 096/257] Upgrade dependencies, including the jedis from 2.9.3
 to 3.4.1.

---
 pom.xml                                       | 30 +++---
 .../webmagic/selector/LinksSelector.java      | 12 +--
 .../scheduler/RedisPriorityScheduler.java     | 95 +++++++------------
 .../webmagic/scheduler/RedisScheduler.java    | 33 ++-----
 4 files changed, 64 insertions(+), 106 deletions(-)

diff --git a/pom.xml b/pom.xml
index 08250fd8c..81d0d6cca 100644
--- a/pom.xml
+++ b/pom.xml
@@ -73,17 +73,17 @@
             
                 org.apache.httpcomponents
                 httpcore
-                4.4.13
+                4.4.14
             
             
                 com.google.guava
                 guava
-                30.0-android
+                30.1-jre
             
             
                 com.jayway.jsonpath
                 json-path
-                2.4.0
+                2.5.0
             
             
                 org.slf4j
@@ -103,7 +103,7 @@
             
                 com.alibaba
                 fastjson
-                1.2.69
+                1.2.75
             
             
                 com.github.dreamhead
@@ -125,13 +125,13 @@
             
                 org.assertj
                 assertj-core
-                3.16.1
+                3.18.1
                 test
             
             
                 org.apache.commons
                 commons-lang3
-                3.10
+                3.11
             
             
                 commons-collections
@@ -139,19 +139,19 @@
                 3.2.2
             
             
-			    commons-io
-			    commons-io
-			    2.7
-			
+                commons-io
+                commons-io
+                2.8.0
+            
             
                 org.codehaus.groovy
                 groovy-all
-                2.4.19
+                3.0.7
             
             
                 org.jruby
                 jruby
-                9.2.11.1
+                9.2.14.0
             
             
                 org.jsoup
@@ -171,12 +171,12 @@
             
                 net.sf.saxon
                 Saxon-HE
-                10.1
+                10.3
             
             
                 net.sourceforge.htmlcleaner
                 htmlcleaner
-                2.5
+                2.24
             
             
                 com.github.detro
@@ -191,7 +191,7 @@
             
                 redis.clients
                 jedis
-                2.9.3
+                3.4.1
             
         
     
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
index 5296a74bd..2dafe8ee9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
@@ -1,12 +1,12 @@
 package us.codecraft.webmagic.selector;
 
-import org.jsoup.helper.StringUtil;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
-
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
 /**
  * Links selector based on jsoup. Use absolute url. 
* @@ -23,9 +23,9 @@ public String select(Element element) { @Override public List selectList(Element element) { Elements elements = element.select("a"); - List links = new ArrayList(elements.size()); + List links = new ArrayList<>(elements.size()); for (Element element0 : elements) { - if (!StringUtil.isBlank(element0.baseUri())) { + if (StringUtils.isNotBlank(element0.baseUri())) { links.add(element0.attr("abs:href")); } else { links.add(element0.attr("href")); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java index 540574ad2..46d47e5a5 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java @@ -1,22 +1,23 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; +import java.util.Set; + import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import java.util.Set; - /** * the redis scheduler with priority * @author sai * Created by sai on 16-5-27. 
*/ -public class RedisPriorityScheduler extends RedisScheduler -{ +public class RedisPriorityScheduler extends RedisScheduler { private static final String ZSET_PREFIX = "zset_"; @@ -37,62 +38,44 @@ public RedisPriorityScheduler(JedisPool pool) { } @Override - protected void pushWhenNoDuplicate(Request request, Task task) - { - Jedis jedis = pool.getResource(); - try - { - if(request.getPriority() > 0) + protected void pushWhenNoDuplicate(Request request, Task task) { + try (Jedis jedis = pool.getResource()) { + if (request.getPriority() > 0) { jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl()); - else if(request.getPriority() < 0) + } else if (request.getPriority() < 0) { jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl()); - else + } else { jedis.lpush(getQueueNoPriorityKey(task), request.getUrl()); + } setExtrasInItem(jedis, request, task); } - finally - { - pool.returnResource(jedis); - } } @Override - public synchronized Request poll(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public synchronized Request poll(Task task) { + try (Jedis jedis = pool.getResource()) { String url = getRequest(jedis, task); - if(StringUtils.isBlank(url)) + if (StringUtils.isBlank(url)) { return null; + } return getExtrasInItem(jedis, url, task); } - finally - { - pool.returnResource(jedis); - } } - private String getRequest(Jedis jedis, Task task) - { + private String getRequest(Jedis jedis, Task task) { String url; Set urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0); - if(urls.isEmpty()) - { + if (urls.isEmpty()) { url = jedis.lpop(getQueueNoPriorityKey(task)); - if(StringUtils.isBlank(url)) - { + if (StringUtils.isBlank(url)) { urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0); - if(!urls.isEmpty()) - { + if (!urls.isEmpty()) { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetMinusPriorityKey(task), url); } } - } - else - { + } else { url = urls.toArray(new String[0])[0]; 
jedis.zrem(getZsetPlusPriorityKey(task), url); } @@ -100,51 +83,39 @@ private String getRequest(Jedis jedis, Task task) } @Override - public void resetDuplicateCheck(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public void resetDuplicateCheck(Task task) { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); } - finally - { - pool.returnResource(jedis); - } } - private String getZsetPlusPriorityKey(Task task) - { + private String getZsetPlusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX; } - private String getQueueNoPriorityKey(Task task) - { + private String getQueueNoPriorityKey(Task task) { return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX; } - private String getZsetMinusPriorityKey(Task task) - { + private String getZsetMinusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX; } - private void setExtrasInItem(Jedis jedis,Request request, Task task) - { - if(request.getExtras() != null) - { - String field = DigestUtils.shaHex(request.getUrl()); + private void setExtrasInItem(Jedis jedis,Request request, Task task) { + if (request.getExtras() != null) { + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset(getItemKey(task), field, value); } } - private Request getExtrasInItem(Jedis jedis, String url, Task task) - { + private Request getExtrasInItem(Jedis jedis, String url, Task task) { String key = getItemKey(task); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); - if(bytes != null) + if (bytes != null) { return JSON.parseObject(new String(bytes), Request.class); + } return new Request(url); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 
c70d88507..19e831321 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,8 +1,10 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -37,21 +39,15 @@ public RedisScheduler(JedisPool pool) { @Override public void resetDuplicateCheck(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); - } finally { - pool.returnResource(jedis); } } @Override public boolean isDuplicate(Request request, Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { return jedis.sadd(getSetKey(task), request.getUrl()) == 0; - } finally { - pool.returnResource(jedis); } } @@ -62,7 +58,7 @@ protected void pushWhenNoDuplicate(Request request, Task task) { try { jedis.rpush(getQueueKey(task), request.getUrl()); if (checkForAdditionalInfo(request)) { - String field = DigestUtils.shaHex(request.getUrl()); + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } @@ -100,14 +96,13 @@ private boolean checkForAdditionalInfo(Request request) { @Override public synchronized Request poll(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { String url = jedis.lpop(getQueueKey(task)); if (url == null) { return null; } String key = ITEM_PREFIX + task.getUUID(); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); if (bytes != null) { 
Request o = JSON.parseObject(new String(bytes), Request.class); @@ -115,8 +110,6 @@ public synchronized Request poll(Task task) { } Request request = new Request(url); return request; - } finally { - pool.returnResource(jedis); } } @@ -134,23 +127,17 @@ protected String getItemKey(Task task) { @Override public int getLeftRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.llen(getQueueKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } @Override public int getTotalRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.scard(getSetKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } } From d0e2776991b3aae0eb745f4e76562712584eb44e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 10 Jan 2021 14:10:32 +0800 Subject: [PATCH 097/257] Upgrade xsoup from 0.3.1 to 0.3.2. --- pom.xml | 7 +------ webmagic-core/pom.xml | 5 ----- webmagic-scripts/pom.xml | 4 ---- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/pom.xml b/pom.xml index 81d0d6cca..c5b7dfe0e 100644 --- a/pom.xml +++ b/pom.xml @@ -98,7 +98,7 @@ us.codecraft xsoup - 0.3.1 + 0.3.2 com.alibaba @@ -153,11 +153,6 @@ jruby 9.2.14.0 - - org.jsoup - jsoup - 1.10.3 - org.python jython diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4b89cac10..820651a44 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -61,11 +61,6 @@ assertj-core - - org.jsoup - jsoup - - commons-io commons-io diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 121aafaf6..85b735fe0 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -22,10 +22,6 @@ kotlin-stdlib ${kotlin.version} - - org.codehaus.groovy - groovy-all - org.python jython From 2f71f7912c1d104fbd42f7ca14fea4ac764efd8a Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 10 Jan 2021 14:31:40 
+0800 Subject: [PATCH 098/257] Fix scm tag. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index c5b7dfe0e..97897dbc5 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-parent-0.6.1 + WebMagic-${project.version} From 683db09133b16ada8f6ea6de12a9b62a1a0705d4 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 11 Jan 2021 00:35:22 +0800 Subject: [PATCH 099/257] Complete testXPath2 assertion. --- .../java/us/codecraft/webmagic/selector/XpathSelectorTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index aa3765a0c..38aac1544 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -8,6 +8,7 @@ import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; + import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -1368,7 +1369,7 @@ public void testXPath2() { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; XpathSelector xpathSelector = new XpathSelector("//h1/text()"); - System.out.println(xpathSelector.select(text)); + Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收 ", xpathSelector.select(text)); } @Test From 124c52b9884b1c855e47cfcdddbc1e7d9c613dbe Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 11 Jan 2021 01:25:41 +0800 Subject: [PATCH 100/257] Downgrade htmlcleaner from 2.24 back to 2.5, to make Xpath2Selector pass the test cases. --- pom.xml | 2 +- .../webmagic/selector/Xpath2Selector.java | 36 ++++++++++--------- .../webmagic/selector/XpathSelectorTest.java | 12 +++++-- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/pom.xml b/pom.xml index 97897dbc5..16e14cfb6 100644 --- a/pom.xml +++ b/pom.xml @@ -171,7 +171,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.24 + 2.5 com.github.detro diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index d8aab6cce..1f1f0a572 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,16 +1,11 @@ package us.codecraft.webmagic.selector; -import net.sf.saxon.lib.NamespaceConstant; -import net.sf.saxon.xpath.XPathEvaluator; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.DomSerializer; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; import javax.xml.transform.OutputKeys; @@ -21,12 +16,19 @@ import javax.xml.xpath.XPathConstants; import 
javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; + +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 38aac1544..32906b57a 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.selector; +import java.util.List; + import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; @@ -1368,15 +1370,19 @@ public void testOschina() { public void testXPath2() { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; - XpathSelector xpathSelector = new XpathSelector("//h1/text()"); - Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收 ", xpathSelector.select(text)); + Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()"); + Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text)); } @Test public void testXpath2Selector() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); String select = xpath2Selector.select(html); - Assert.assertNotNull(select); + Assert.assertEquals("http://www.oschina.net/", select); + + List selectList = xpath2Selector.selectList(html); + Assert.assertEquals(113, selectList.size()); + Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } @Ignore("take long time") From d92dc8397f336c2757ce4559ac92daf7bf82aa61 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 11 Jan 2021 01:46:32 +0800 Subject: [PATCH 101/257] Upgrade htmlcleaner from 2.5 to 2.9, this is the highest version to let Xpath2Selector pass the test cases. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 16e14cfb6..df7d6daf6 100644 --- a/pom.xml +++ b/pom.xml @@ -171,7 +171,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.5 + 2.9 com.github.detro From 54127318a4266fc53037e9f1b51a6eb3102e7aaf Mon Sep 17 00:00:00 2001 From: JustThink Date: Wed, 3 Feb 2021 02:43:53 +1300 Subject: [PATCH 102/257] =?UTF-8?q?SpiderStatus=E4=B8=ADgetPagePerSecond()?= =?UTF-8?q?=E6=96=B9=E6=B3=95=EF=BC=8C=E5=A2=9E=E5=8A=A0=E9=AA=8C=E8=AF=81?= =?UTF-8?q?=E9=80=BB=E8=BE=91=EF=BC=8C=E9=81=BF=E5=85=8D=E7=A9=BA=E6=8C=87?= =?UTF-8?q?=E9=92=88=EF=BC=8C=E9=81=BF=E5=85=8D=E9=99=A4=E6=95=B0=E4=B8=BA?= =?UTF-8?q?=E9=9B=B6=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/monitor/SpiderStatus.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java index a87c040bd..69afe042a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -84,8 +84,13 @@ public Date getStartTime() { @Override public int getPagePerSecond() { - int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; - return getSuccessPageCount() / runSeconds; + if (getStartTime() != null) { + int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; + if (runSeconds != 0) { + return getSuccessPageCount() / runSeconds; + } + } + return -1; } } From 528a8908afe92a858b4ea0bcb3f403137fa9847a Mon Sep 17 00:00:00 2001 From: wecandoitjustthink Date: Sat, 27 Feb 2021 19:59:05 +1300 Subject: [PATCH 103/257] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=86List=E5=B1=9E=E6=80=A7=E7=9A=84get=E6=96=B9=E6=B3=95,?= =?UTF-8?q?=E4=BE=9BSpiderMonitor=E7=9A=84=E5=AD=90=E7=B1=BB=E8=8E=B7?= =?UTF-8?q?=E5=8F=96.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/monitor/SpiderMonitor.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index cfb4a8200..b213dda94 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -68,6 +68,10 @@ protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderLi return new SpiderStatus(spider, monitorSpiderListener); } + protected List getSpiderStatuses() { + return this.spiderStatuses; + } + public static 
SpiderMonitor instance() { return INSTANCE; } From dcfd23841310face234c0355b824416c5c12046e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 1 Mar 2021 01:06:42 +0800 Subject: [PATCH 104/257] Polish java version setting. --- pom.xml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index df7d6daf6..12c3dbf80 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,8 @@ UTF-8 UTF-8 - 1.8 + 1.8 + 1.8 4.0.0.RELEASE webmagic-parent @@ -223,10 +224,6 @@ org.apache.maven.plugins maven-compiler-plugin - - ${java.version} - ${java.version} - From 4e8a086dae80e13a503f5a72e0e17d7b96c884fc Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 22 Mar 2021 18:18:10 +0800 Subject: [PATCH 105/257] Pass exception to onError. Fixes #1005. --- .../src/main/java/us/codecraft/webmagic/Spider.java | 13 +++++++++++-- .../java/us/codecraft/webmagic/SpiderListener.java | 9 +++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 886e74a92..54fc22054 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -320,7 +320,7 @@ public void run() { processRequest(request); onSuccess(request); } catch (Exception e) { - onError(request); + onError(request, e); logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); @@ -338,10 +338,19 @@ public void run() { logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get()); } + /** + * @deprecated Use {@link #onError(Request, Exception)} instead. 
+ */ + @Deprecated protected void onError(Request request) { + } + + protected void onError(Request request, Exception e) { + this.onError(request); + if (CollectionUtils.isNotEmpty(spiderListeners)) { for (SpiderListener spiderListener : spiderListeners) { - spiderListener.onError(request); + spiderListener.onError(request, e); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java index 067818038..8f10e0ef0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java @@ -10,5 +10,14 @@ public interface SpiderListener { public void onSuccess(Request request); + /** + * @deprecated Use {@link #onError(Request, Exception)} instead. + */ + @Deprecated public void onError(Request request); + + default void onError(Request request, Exception e) { + this.onError(request); + } + } From be6f5ff77114eed558f3af86781395dabe9ad8f6 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 22 Mar 2021 18:18:42 +0800 Subject: [PATCH 106/257] Add missing @Deprecated annotations. 
--- .../src/main/java/us/codecraft/webmagic/Spider.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 54fc22054..5940e738d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -208,7 +208,8 @@ public Spider setScheduler(Scheduler scheduler) { * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline) * @deprecated */ - public Spider pipeline(Pipeline pipeline) { + @Deprecated + public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); } @@ -258,7 +259,8 @@ public Spider clearPipeline() { * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) * @deprecated */ - public Spider downloader(Downloader downloader) { + @Deprecated + public Spider downloader(Downloader downloader) { return setDownloader(downloader); } From e2c143b52f7685df4a331d954a56556a11bc98a5 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Fri, 2 Apr 2021 02:58:48 +0000 Subject: [PATCH 107/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-1048302 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-1020439 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-1070799 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-1082234 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-1082235 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-1082236 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-564897 - https://snyk.io/vuln/SNYK-JAVA-ORGFREEMARKER-1076795 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6341bc0b5..1800c9790 100644 --- a/pom.xml +++ b/pom.xml @@ -108,7 +108,7 @@ com.github.dreamhead moco-core - 1.1.0 + 1.2.0 test From 76f625c02e552ccd2834cd38cb0f46e2f3037db7 Mon Sep 17 00:00:00 2001 From: linweisen Date: Fri, 9 Apr 2021 
17:00:00 +0800 Subject: [PATCH 108/257] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E5=8F=AF=E6=81=A2?= =?UTF-8?q?=E5=A4=8D=E7=88=AC=E5=8F=96=E5=86=85=E5=AE=B9=E4=BE=8B=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- webmagic-samples/pom.xml | 20 +++++ .../recover/DuplicateStorageRemover.java | 82 +++++++++++++++++ .../webmagic/recover/MmapQueueScheduler.java | 89 +++++++++++++++++++ .../webmagic/recover/RecoverSample.java | 22 +++++ 4 files changed, 213 insertions(+) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 3699fa66e..6c0e59b3f 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -24,6 +24,26 @@ junit junit + + org.mapdb + mapdb + 3.0.7 + + + com.fasterxml.jackson.core + jackson-core + 2.9.5 + + + com.fasterxml.jackson.core + jackson-annotations + 2.9.5 + + + com.fasterxml.jackson.core + jackson-databind + 2.9.5 + diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java new file mode 100644 index 000000000..5bf249e0f --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java @@ -0,0 +1,82 @@ +package us.codecraft.webmagic.recover; + +import com.google.common.base.Charsets; +import com.google.common.hash.BloomFilter; +import com.google.common.hash.Funnels; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.Serializer; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import 
us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.util.concurrent.atomic.AtomicInteger; + +/** + * @author :linweisen + * @date :Created in 2021/4/9 14:46 + * @description:${description} + * @modified By: + * @version: 1.0 + */ +public class DuplicateStorageRemover implements DuplicateRemover { + + private DB db; + + private static String DATABASE_NAME = "duplicate"; + + private IndexTreeList urlDuplicateQueue; + + private BloomFilter bloomFilter; + + private AtomicInteger counter; + + public DuplicateStorageRemover(String path) { + + String duplicatStoragePath = path; + + DB db = DBMaker.fileDB(duplicatStoragePath) + .fileMmapEnableIfSupported() + .fileMmapPreclearDisable() + .cleanerHackEnable() + .closeOnJvmShutdown() + .transactionEnable() + .concurrencyScale(128) + .make(); + this.db = db; + + this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen(); + + counter = new AtomicInteger(this.urlDuplicateQueue.size()); + this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); + for (String url : this.urlDuplicateQueue){ + bloomFilter.put(url); + } + + } + + @Override + public boolean isDuplicate(Request request, Task task) { + String url = request.getUrl(); + boolean isDuplicate = bloomFilter.mightContain(url); + if (!isDuplicate) { + bloomFilter.put(url); + urlDuplicateQueue.add(url); + this.db.commit(); + counter.incrementAndGet(); + } + return isDuplicate; + } + + @Override + public void resetDuplicateCheck(Task task) { + this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); + this.urlDuplicateQueue.clear(); + } + + @Override + public int getTotalRequestsCount(Task task) { + return counter.get(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java new file mode 100644 index 000000000..07cfa22d0 
--- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java @@ -0,0 +1,89 @@ +package us.codecraft.webmagic.recover; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.StringUtils; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.Serializer; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.io.IOException; + +/** + * @author :linweisen + * @date :Created in 2021/4/9 14:38 + * @description:${description} + * @modified By: + * @version: 1.0 + */ +public class MmapQueueScheduler extends DuplicateRemovedScheduler { + + private DB db; + + private static String DATABASE_NAME = "queue"; + + private IndexTreeList queue; + + private static ObjectMapper mapper; + + public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) { + super.setDuplicateRemover(duplicateRemover); + + String queuePath = path; + + DB db = DBMaker.fileDB(queuePath) + .fileMmapEnableIfSupported() + .fileMmapPreclearDisable() + .cleanerHackEnable() + .closeOnJvmShutdown() + .transactionEnable() + .concurrencyScale(128) + .make(); + this.db = db; + this.mapper = new ObjectMapper(); + this.queue = db.indexTreeList(MmapQueueScheduler.DATABASE_NAME, Serializer.STRING).createOrOpen(); + } + + @Override + public Request poll(Task task) { + if (this.queue.size() > 0){ + String s = queue.remove(0); + return fromJson(s, Request.class); + }else{ + return null; + } + + } + + @Override + public void pushWhenNoDuplicate(Request request, Task task) { + queue.add(toJson(request)); + this.db.commit(); + } + + public String toJson(Object object) { + try { + return mapper.writeValueAsString(object); + } catch (IOException e) { + logger.warn("write to json string error:" + object, e); + return null; + } + } 
+ + public T fromJson(String jsonString, Class clazz) { + if (StringUtils.isEmpty(jsonString)) { + return null; + } + try { + return mapper.readValue(jsonString, clazz); + } catch (IOException e) { + logger.warn("parse json string error:" + jsonString, e); + return null; + } + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java new file mode 100644 index 000000000..4fb91a0d2 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java @@ -0,0 +1,22 @@ +package us.codecraft.webmagic.recover; + + +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.samples.SinaBlogProcessor; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +/** + * @author code4crafter@gmail.com
+ */ +public class RecoverSample { + + public static void main(String[] args) { + String storage = "queue"; + String duplicate = "duplicate"; + Spider spider = new Spider(new SinaBlogProcessor()); + DuplicateRemover remover = new DuplicateStorageRemover(duplicate); + spider.setScheduler(new MmapQueueScheduler(remover, storage)); + spider.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") + .run(); + } +} From dba0ddb92cd9a80553a0d01eec94b0ac97e4f7de Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 28 Apr 2021 12:17:52 +0800 Subject: [PATCH 109/257] Remove unknown tag from javadoc. --- .../codecraft/webmagic/recover/DuplicateStorageRemover.java | 4 ---- .../us/codecraft/webmagic/recover/MmapQueueScheduler.java | 4 ---- 2 files changed, 8 deletions(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java index 5bf249e0f..bee80e775 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java @@ -15,10 +15,6 @@ /** * @author :linweisen - * @date :Created in 2021/4/9 14:46 - * @description:${description} - * @modified By: - * @version: 1.0 */ public class DuplicateStorageRemover implements DuplicateRemover { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java index 07cfa22d0..4cee18afd 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java @@ -15,10 +15,6 @@ /** * @author :linweisen - * @date :Created in 2021/4/9 14:38 - * @description:${description} - * @modified By: - * @version: 1.0 */ public class 
MmapQueueScheduler extends DuplicateRemovedScheduler { From 189c5962e6a6f68ed1c76c517d59353031cbb77a Mon Sep 17 00:00:00 2001 From: Guy Korland Date: Tue, 18 May 2021 16:55:29 +0300 Subject: [PATCH 110/257] Update to Jedis 3.6.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 12c3dbf80..c1f16b75b 100644 --- a/pom.xml +++ b/pom.xml @@ -187,7 +187,7 @@ redis.clients jedis - 3.4.1 + 3.6.0 From 129e9184bf3b08cdaeb10b89b43700725db00f42 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 3 Jun 2021 03:28:14 +0000 Subject: [PATCH 111/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-NETMINIDEV-1078499 - https://snyk.io/vuln/SNYK-JAVA-NETMINIDEV-1298655 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 1800c9790..f4a268a5b 100644 --- a/pom.xml +++ b/pom.xml @@ -83,7 +83,7 @@ com.jayway.jsonpath json-path - 2.4.0 + 2.6.0 org.slf4j From db70b6e095cf070bdea46816371be3f33848b9b9 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 22 Jun 2021 21:58:49 +0800 Subject: [PATCH 112/257] Add maven reports. 
--- pom.xml | 59 ++++++++++++++++++++++++++++++-- src/site/site.xml | 23 +++++++++++++ webmagic-coverage/pom.xml | 72 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 src/site/site.xml create mode 100644 webmagic-coverage/pom.xml diff --git a/pom.xml b/pom.xml index c1f16b75b..4468ba09a 100644 --- a/pom.xml +++ b/pom.xml @@ -50,6 +50,7 @@ webmagic-selenium webmagic-saxon webmagic-samples + webmagic-coverage @@ -217,9 +218,6 @@ org.apache.maven.plugins maven-surefire-plugin - - 0 - org.apache.maven.plugins @@ -305,6 +303,24 @@ maven-release-plugin 3.0.0-M1 + + org.jacoco + jacoco-maven-plugin + + + + prepare-agent + + + + report + verify + + report + + + + com.amashchenko.maven.plugin gitflow-maven-plugin @@ -352,6 +368,11 @@ maven-surefire-plugin 3.0.0-M5 + + org.jacoco + jacoco-maven-plugin + 0.8.7 + com.amashchenko.maven.plugin gitflow-maven-plugin @@ -361,6 +382,38 @@
+ + + + org.apache.maven.plugins + maven-javadoc-plugin + + none + + + + org.apache.maven.plugins + maven-jxr-plugin + + + org.apache.maven.plugins + maven-pmd-plugin + + + org.apache.maven.plugins + maven-surefire-report-plugin + + + org.codehaus.mojo + taglist-maven-plugin + + + com.github.spotbugs + spotbugs-maven-plugin + + + + release diff --git a/src/site/site.xml b/src/site/site.xml new file mode 100644 index 000000000..d2d5caacd --- /dev/null +++ b/src/site/site.xml @@ -0,0 +1,23 @@ + + + org.apache.maven.skins + maven-fluido-skin + 1.9 + + + + + + + + + true + true + true + pull-right + + + diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml new file mode 100644 index 000000000..b1998a316 --- /dev/null +++ b/webmagic-coverage/pom.xml @@ -0,0 +1,72 @@ + + + 4.0.0 + + + us.codecraft + webmagic-parent + 0.7.4 + + + webmagic-coverage + pom + webmagic-coverage + Compute aggregated test code coverage + + + true + + + + + ${project.groupId} + webmagic-core + ${project.version} + + + ${project.groupId} + webmagic-extension + ${project.version} + + + ${project.groupId} + webmagic-scripts + ${project.version} + + + ${project.groupId} + webmagic-selenium + ${project.version} + + + ${project.groupId} + webmagic-saxon + ${project.version} + + + ${project.groupId} + webmagic-samples + ${project.version} + + + + + + + org.jacoco + jacoco-maven-plugin + + + + report-aggregate + + + + + + + + From 31a00f5f8e73b90bc28833b06ec6aa649b07f245 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:09:02 +0800 Subject: [PATCH 113/257] Set gitflow-maven-plugin versionTagPrefix. 
--- pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pom.xml b/pom.xml index 4468ba09a..efa5b42cb 100644 --- a/pom.xml +++ b/pom.xml @@ -324,6 +324,11 @@ com.amashchenko.maven.plugin gitflow-maven-plugin + + + WebMagic- + + From 14b09a33852a022209437216af48f509f654b1c6 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:36:11 +0800 Subject: [PATCH 114/257] Update maven plugin versions. --- pom.xml | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index efa5b42cb..6a237e649 100644 --- a/pom.xml +++ b/pom.xml @@ -208,7 +208,7 @@ - 3.0.5 + 3.3.9 @@ -358,21 +358,41 @@ maven-jar-plugin 3.2.0 + + org.apache.maven.plugins + maven-jxr-plugin + 3.1.1 + + + org.apache.maven.plugins + maven-pmd-plugin + 3.14.0 + org.apache.maven.plugins maven-resources-plugin - 3.1.0 + 3.2.0 org.apache.maven.plugins maven-site-plugin - 3.9.0 + 3.9.1 org.apache.maven.plugins maven-surefire-plugin 3.0.0-M5 + + org.apache.maven.plugins + maven-surefire-report-plugin + 3.0.0-M5 + + + org.codehaus.mojo + taglist-maven-plugin + 2.4 + org.jacoco jacoco-maven-plugin @@ -383,6 +403,11 @@ gitflow-maven-plugin 1.15.0 + + com.github.spotbugs + spotbugs-maven-plugin + 4.2.3 + From e3e66fb270782813cc35e1189949dd4ac2465299 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:38:38 +0800 Subject: [PATCH 115/257] Upgrade webmagic-samples dependencies. 
--- webmagic-samples/pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 6c0e59b3f..bdca2b62c 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -27,22 +27,22 @@ org.mapdb mapdb - 3.0.7 + 3.0.8 com.fasterxml.jackson.core jackson-core - 2.9.5 + 2.13.0-rc1 com.fasterxml.jackson.core jackson-annotations - 2.9.5 + 2.13.0-rc1 com.fasterxml.jackson.core jackson-databind - 2.9.5 + 2.13.0-rc1 From 4e51a4f68bbba56a4dc4dc9cf2c3128027a2991b Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:42:55 +0800 Subject: [PATCH 116/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 6a237e649..990805e4d 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.4 + 0.7.5 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 820651a44..ec718a1e3 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index b1998a316..16ed1b456 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.4 + 0.7.5 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 87900efda..85d5c6394 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index bdca2b62c..dda182160 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ 
webmagic-parent us.codecraft - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index d3a57f256..119e50f15 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 85b735fe0..1aca5b3af 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.4 + 0.7.5 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index d0cb77c06..42a6da905 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.4 + 0.7.5 4.0.0 From 113eaa4baeb8dcb8d45a62d49aff5b75ead34c2e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 12:57:14 +0800 Subject: [PATCH 117/257] Bump version number to 0.7.5. --- README-zh.md | 7 ++++--- README.md | 6 ++++-- pom.xml | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README-zh.md b/README-zh.md index c5ebe15bf..62b3c9a5e 100644 --- a/README-zh.md +++ b/README-zh.md @@ -1,9 +1,10 @@ ![logo](http://webmagic.io/images/logo.jpeg) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) +[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) - 官方网站[http://webmagic.io/](http://webmagic.io/) >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 @@ -38,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.4 + 0.7.5 us.codecraft webmagic-extension - 0.7.4 + 0.7.5 ``` diff --git a/README.md 
b/README.md index e5cd511d2..14aeac7b1 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) +[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) >A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler. @@ -23,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.4 + 0.7.5 us.codecraft webmagic-extension - 0.7.4 + 0.7.5 ``` diff --git a/pom.xml b/pom.xml index 990805e4d..51e6fdb85 100644 --- a/pom.xml +++ b/pom.xml @@ -275,7 +275,7 @@ 3.2.0 UTF-8 - WebMagic 0.7.4 + WebMagic ${project.version} en_US From c5a037a8072575b0938bfc26b0e326931f7a6b16 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 13:02:46 +0800 Subject: [PATCH 118/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 51e6fdb85..cda7ad1ea 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ec718a1e3..049477cb4 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 
0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 16ed1b456..e6e606825 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 85d5c6394..741b081d8 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index dda182160..c5582c0b3 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 119e50f15..d4d3efa18 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 1aca5b3af..fe4ef6840 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 42a6da905..be3637692 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 From ab5d81a6b6ab215e3450cb2fde94df12c5e49544 Mon Sep 17 00:00:00 2001 From: "carl.don:tjr" Date: Wed, 4 Aug 2021 17:17:22 +0800 Subject: [PATCH 119/257] perfect Spider.run to avoid some rare concurrent issue, change the Spider.emptySleepTime to long type --- .../java/us/codecraft/webmagic/Spider.java | 89 +++++++++++++------ 1 file changed, 60 insertions(+), 29 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 
5940e738d..65c0ceea9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -106,7 +106,7 @@ public class Spider implements Runnable, Task { private Date startTime; - private int emptySleepTime = 30000; + private long emptySleepTime = 30000; /** * create a spider with pageProcessor. @@ -305,32 +305,52 @@ protected void initComponent() { public void run() { checkRunningStat(); initComponent(); - logger.info("Spider {} started!",getUUID()); + logger.info("Spider {} started!", getUUID()); + // interrupt won't be necessarily detected while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { - final Request request = scheduler.poll(this); - if (request == null) { - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - break; - } - // wait until new url added - waitNewUrl(); - } else { - threadPool.execute(new Runnable() { - @Override - public void run() { - try { - processRequest(request); - onSuccess(request); - } catch (Exception e) { - onError(request, e); - logger.error("process request " + request + " error", e); - } finally { - pageCount.incrementAndGet(); - signalNewUrl(); + Request poll = scheduler.poll(this); + if (poll == null) { + if (threadPool.getThreadAlive() == 0) { + //no alive thread anymore , try again + poll = scheduler.poll(this); + if(poll==null) { + if (exitWhenComplete) { + break; + }else{ + // wait + try { + Thread.sleep(emptySleepTime); + continue; + } catch (InterruptedException e) { + break; + } } } - }); + }else { + // wait until new url added, + if(waitNewUrl()) + //if interrupted + break; + continue; + } } + final Request request = poll; + //this may swallow the interruption + threadPool.execute(new Runnable() { + @Override + public void run() { + try { + processRequest(request); + onSuccess(request); + } catch (Exception e) { + onError(request,e); + logger.error("process request " + request + " error", e); + } finally { 
+ pageCount.incrementAndGet(); + signalNewUrl(); + } + } + }); } stat.set(STAT_STOPPED); // release some resources @@ -565,16 +585,24 @@ public Spider addRequest(Request... requests) { return this; } - private void waitNewUrl() { + /** + * + * @return isInterrupted + */ + private boolean waitNewUrl() { + // now there may not be any thread live newUrlLock.lock(); try { - //double check - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - return; + //double check,unnecessary, unless very fast concurrent + if (threadPool.getThreadAlive() == 0) { + return false; } + //wait for amount of time newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; } catch (InterruptedException e) { - logger.warn("waitNewUrl - interrupted, error {}", e); + // logger.warn("waitNewUrl - interrupted, error {}", e); + return true; } finally { newUrlLock.unlock(); } @@ -772,7 +800,10 @@ public Scheduler getScheduler() { * * @param emptySleepTime In MILLISECONDS. */ - public void setEmptySleepTime(int emptySleepTime) { + public void setEmptySleepTime(long emptySleepTime) { + if(emptySleepTime<=0){ + throw new IllegalArgumentException("emptySleepTime should be more than zero!"); + } this.emptySleepTime = emptySleepTime; } } From fcdb9074d69543b81fd350075d182ce1eeaf26ac Mon Sep 17 00:00:00 2001 From: "carl.don:tjr" Date: Wed, 4 Aug 2021 18:23:04 +0800 Subject: [PATCH 120/257] =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E5=8C=96=20Spider.ru?= =?UTF-8?q?n=20=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/Spider.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 65c0ceea9..bc8bb94c5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -313,10 +313,10 @@ public void run() { if (threadPool.getThreadAlive() == 0) { //no alive thread anymore , try again poll = scheduler.poll(this); - if(poll==null) { + if (poll == null) { if (exitWhenComplete) { break; - }else{ + } else { // wait try { Thread.sleep(emptySleepTime); @@ -326,9 +326,9 @@ public void run() { } } } - }else { + } else { // wait until new url added, - if(waitNewUrl()) + if (waitNewUrl()) //if interrupted break; continue; @@ -343,7 +343,7 @@ public void run() { processRequest(request); onSuccess(request); } catch (Exception e) { - onError(request,e); + onError(request, e); logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); From cdbba362bb4c80416afb23b3b013069b1a7ca7ee Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 23 Sep 2021 07:48:41 +0000 Subject: [PATCH 121/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMGITHUBJNR-1570422 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d721fa405..90d124d64 100644 --- a/pom.xml +++ b/pom.xml @@ -153,7 +153,7 @@ org.jruby jruby - 9.2.14.0 + 9.3.0.0 org.python From 34da2fb3a02708b562ec747679ef0cd8d171a042 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 24 Oct 2021 23:20:38 +0800 Subject: [PATCH 122/257] Make PageProcessor#getSite be default method. Closes #1040. 
--- .../webmagic/processor/PageProcessor.java | 26 +++++++----- .../webmagic/processor/PageProcessorTest.java | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 1fb125c72..3d79b96a9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,13 +4,16 @@ import us.codecraft.webmagic.Site; /** - * Interface to be implemented to customize a crawler.
- *
+ * Interface to be implemented to customize a crawler. + * + *

* In PageProcessor, you can customize: - *
- * start urls and other settings in {@link Site}
- * how the urls to fetch are detected
- * how the data are extracted and stored
+ *

+ *
    + *
  • start URLs and other settings in {@link Site}
  • + *
  • how the URLs to fetch are detected
  • + *
  • how the data are extracted and stored
  • + *
* * @author code4crafter@gmail.com
* @see Site @@ -20,17 +23,20 @@ public interface PageProcessor { /** - * process the page, extract urls to fetch, extract the data and store + * Processes the page, extract URLs to fetch, extract the data and store. * * @param page page */ - public void process(Page page); + void process(Page page); /** - * get the site settings + * Returns the site settings. * * @return site * @see Site */ - public Site getSite(); + default Site getSite() { + return Site.me(); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java new file mode 100644 index 000000000..ebb1225cc --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.processor; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; + +public class PageProcessorTest { + + @Test + public void testGetSite() { + Site actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + }.getSite(); + + assertEquals(Site.me(), actualSite); + + actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + @Override + public Site getSite() { + return Site.me().setTimeOut(123); + }; + + }.getSite(); + + assertEquals(Site.me().setTimeOut(123), actualSite); + } + +} From 0c2a7daf4b58ff1b308e0030e55a38092a50bcef Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 16 Dec 2021 13:32:43 +0000 Subject: [PATCH 123/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-NETMINIDEV-1078499 - https://snyk.io/vuln/SNYK-JAVA-NETMINIDEV-1298655 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 90d124d64..10940752c 100644 --- a/pom.xml +++ 
b/pom.xml @@ -85,7 +85,7 @@ com.jayway.jsonpath json-path - 2.5.0 + 2.6.0 org.slf4j From 88da899a897ec2b86f8c7b3227e0ac310f842200 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 1 Jan 2022 18:51:21 +0000 Subject: [PATCH 124/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-2326698 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-1584063 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-1584064 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-2314893 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 10940752c..a4c2db16c 100644 --- a/pom.xml +++ b/pom.xml @@ -110,7 +110,7 @@ com.github.dreamhead moco-core - 1.2.0 + 1.3.0 test From a4f2bead800061b27bd767ecf812426498d7536b Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 18 Jan 2022 16:51:10 +0000 Subject: [PATCH 125/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGJDOM-1309669 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a4c2db16c..db80066d9 100644 --- a/pom.xml +++ b/pom.xml @@ -173,7 +173,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.9 + 2.26 com.github.detro From 3776de6de2ef4490971cd7740d50fb1844225ad7 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Fri, 11 Mar 2022 16:26:38 +0000 Subject: [PATCH 126/257] fix: webmagic-samples/pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-2421244 --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index dda182160..ef9276d18 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -42,7 +42,7 @@ com.fasterxml.jackson.core jackson-databind - 2.13.0-rc1 + 2.13.0 From 
90e3d631fb070a6e55c3358deace71e94ce861e4 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Mon, 21 Mar 2022 17:09:14 +0000 Subject: [PATCH 127/257] fix: webmagic-samples/pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-2421244 --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index ef9276d18..0f6592d52 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -42,7 +42,7 @@ com.fasterxml.jackson.core jackson-databind - 2.13.0 + 2.13.2 From 9628739cebc098489999d894a627706f2199603d Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 26 Mar 2022 16:18:31 +0000 Subject: [PATCH 128/257] fix: webmagic-samples/pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-2421244 --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 0f6592d52..7f7ceb228 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -42,7 +42,7 @@ com.fasterxml.jackson.core jackson-databind - 2.13.2 + 2.13.2.1 From 137e7b56ca15ce1404b5badfcf48a977c9cc2c27 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 12 Apr 2022 16:18:38 +0000 Subject: [PATCH 129/257] fix: webmagic-scripts/pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGJETBRAINSKOTLIN-2628385 --- webmagic-scripts/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100755 => 100644 webmagic-scripts/pom.xml diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml old mode 100755 new mode 100644 index 1aca5b3af..f0c167955 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -9,7 +9,7 @@ webmagic-scripts - 1.1.2-2 + 1.6.0 From 
54da7af17eaffeb54360b4ed81639d84bd064281 Mon Sep 17 00:00:00 2001 From: David Hsing Date: Tue, 3 May 2022 17:42:42 +0800 Subject: [PATCH 130/257] change dependency versions into properties change dependency versions into properties update commons-collections from 3.x to 4.4 --- pom.xml | 78 ++++++++++++------- webmagic-core/pom.xml | 4 +- .../java/us/codecraft/webmagic/Spider.java | 26 ++++--- .../webmagic/selector/AbstractSelectable.java | 2 +- .../webmagic/selector/CssSelector.java | 8 +- .../webmagic/selector/JsonPathSelector.java | 20 +++-- .../webmagic/selector/XpathSelector.java | 6 +- .../downloader/HttpClientDownloaderTest.java | 31 +++++--- .../downloader/MockGithubDownloader.java | 8 +- .../codecraft/webmagic/model/PageMocker.java | 8 +- .../webmagic/samples/AngularJSProcessor.java | 6 +- .../samples/InfoQMiniBookProcessor.java | 2 +- .../webmagic/scripts/ScriptProcessor.java | 19 ++--- .../scripts/ScriptProcessorBuilder.java | 8 +- 14 files changed, 139 insertions(+), 87 deletions(-) diff --git a/pom.xml b/pom.xml index cda7ad1ea..3774b4b25 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,31 @@ UTF-8 1.8 1.8 + 3.18.1 + 1.4 + 4.4 + 2.11.0 + 3.12.0 + 1.2.75 + 3.0.10 + 31.1-jre + 2.26 + 4.5.13 + 4.4.14 + 3.7.1 + 9.2.14.0 + 2.6.0 + 4.13.2 + 2.7.2 + 1.2.17 + 1.10.19 + 1.1.0 + 1.2.0 + 10.3 + 3.141.59 + 1.7.36 4.0.0.RELEASE + 0.3.2
webmagic-parent webmagic-parent @@ -58,59 +82,59 @@ junit junit - 4.13.1 + ${junit.version} test org.mockito mockito-all - 1.10.19 + ${mockito-all.version} test org.apache.httpcomponents httpclient - 4.5.13 + ${httpclient.version} org.apache.httpcomponents httpcore - 4.4.14 + ${httpcore.version} com.google.guava guava - 30.1-jre + ${guava.version} com.jayway.jsonpath json-path - 2.5.0 + ${json-path.version} org.slf4j slf4j-api - 1.7.30 + ${slf4j.version} org.slf4j slf4j-log4j12 - 1.7.30 + ${slf4j.version} us.codecraft xsoup - 0.3.2 + ${xsoup.version} com.alibaba fastjson - 1.2.75 + ${fastjson.version} com.github.dreamhead moco-core - 1.1.0 + ${moco.version} test @@ -122,73 +146,73 @@ log4j log4j - 1.2.17 + ${log4j.version} org.assertj assertj-core - 3.18.1 + ${assertj.version} test org.apache.commons commons-lang3 - 3.11 + ${commons-lang3.version} - commons-collections - commons-collections - 3.2.2 + org.apache.commons + commons-collections4 + ${commons-collections4.version} commons-io commons-io - 2.8.0 + ${commons-io.version} org.codehaus.groovy groovy-all - 3.0.7 + ${groovy-all.version} org.jruby jruby - 9.2.14.0 + ${jruby.version} org.python jython - 2.7.2 + ${jython.version} org.seleniumhq.selenium selenium-java - 3.141.59 + ${selenium-java.version} net.sf.saxon Saxon-HE - 10.3 + ${saxon-he.version} net.sourceforge.htmlcleaner htmlcleaner - 2.9 + ${htmlcleaner.version} com.github.detro phantomjsdriver - 1.2.0 + ${phantomjsdriver.version} commons-cli commons-cli - 1.4 + ${commons-cli.version} redis.clients jedis - 3.6.0 + ${jedis.version} diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 049477cb4..64b8013f2 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -52,8 +52,8 @@ - commons-collections - commons-collections + org.apache.commons + commons-collections4 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index bc8bb94c5..00091c90a 100644 
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,6 +1,20 @@ package us.codecraft.webmagic; -import org.apache.commons.collections.CollectionUtils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,16 +31,6 @@ import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.WMCollections; -import java.io.Closeable; -import java.io.IOException; -import java.util.*; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; - /** * Entrance of a crawler.
* A spider contains four modules: Downloader, Scheduler, PageProcessor and diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index e2bb55215..8775af108 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; import java.util.ArrayList; import java.util.List; +import org.apache.commons.collections4.CollectionUtils; /** * @author code4crafer@gmail.com diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 6a638dbff..cfe55472a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; -import java.util.ArrayList; -import java.util.List; - /** * CSS selector. Based on Jsoup. 
* diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index f5c0baeb5..aa9a903f7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,11 +1,11 @@ package us.codecraft.webmagic.selector; -import com.alibaba.fastjson.JSON; -import com.jayway.jsonpath.JsonPath; import java.util.ArrayList; import java.util.List; import java.util.Map; +import com.alibaba.fastjson.JSON; +import com.jayway.jsonpath.JsonPath; /** * JsonPath selector.
@@ -16,15 +16,20 @@ */ public class JsonPathSelector implements Selector { - private String jsonPathStr; + private final String jsonPathStr; - private JsonPath jsonPath; + private final JsonPath jsonPath; public JsonPathSelector(String jsonPathStr) { this.jsonPathStr = jsonPathStr; this.jsonPath = JsonPath.compile(this.jsonPathStr); } + @SuppressWarnings("unused") + public String getJsonPathStr() { + return jsonPathStr; + } + @Override public String select(String text) { Object object = jsonPath.read(text); @@ -32,8 +37,8 @@ public String select(String text) { return null; } if (object instanceof List) { - List list = (List) object; - if (list != null && list.size() > 0) { + List list = (List) object; + if (list.size() > 0) { return toString(list.iterator().next()); } } @@ -49,8 +54,9 @@ private String toString(Object object) { } @Override + @SuppressWarnings("unchecked") public List selectList(String text) { - List list = new ArrayList(); + List list = new ArrayList<>(); Object object = jsonPath.read(text); if (object == null) { return list; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 8a980a50d..4fa14699e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; -import java.util.List; - /** * XPath selector based on Xsoup.
* diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ece060003..780ca7529 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,9 +1,10 @@ package us.codecraft.webmagic.downloader; -import com.github.dreamhead.moco.HttpServer; -import com.github.dreamhead.moco.Runnable; -import com.github.dreamhead.moco.Runner; -import org.apache.commons.collections.map.HashedMap; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Map; +import org.apache.commons.collections4.map.HashedMap; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; @@ -11,6 +12,9 @@ import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.junit.Test; +import com.github.dreamhead.moco.HttpServer; +import com.github.dreamhead.moco.Runnable; +import com.github.dreamhead.moco.Runner; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -21,12 +25,19 @@ import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.Map; - -import static com.github.dreamhead.moco.Moco.*; +import static com.github.dreamhead.moco.Moco.and; +import static com.github.dreamhead.moco.Moco.by; +import static com.github.dreamhead.moco.Moco.cookie; +import static com.github.dreamhead.moco.Moco.eq; +import static com.github.dreamhead.moco.Moco.form; +import static com.github.dreamhead.moco.Moco.header; +import static 
com.github.dreamhead.moco.Moco.httpServer; +import static com.github.dreamhead.moco.Moco.method; +import static com.github.dreamhead.moco.Moco.not; +import static com.github.dreamhead.moco.Moco.query; +import static com.github.dreamhead.moco.Moco.text; +import static com.github.dreamhead.moco.Moco.uri; +import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 3aa742c10..58dd3a6fa 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -1,13 +1,15 @@ package us.codecraft.webmagic.downloader; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; -import java.io.InputStream; /** * @author code4crafter@gmail.com @@ -19,7 +21,7 @@ public Page download(Request request, Task task) { Page page = new Page(); InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html"); try { - page.setRawText(IOUtils.toString(resourceAsStream)); + page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset())); } catch (IOException e) { e.printStackTrace(); } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java index 4b0c133cb..0451edcfe 100644 --- 
a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java @@ -1,11 +1,13 @@ package us.codecraft.webmagic.model; + +import java.io.IOException; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; /** * @author code4crafter@gmail.com @@ -16,7 +18,7 @@ public class PageMocker { public Page getMockJsonPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset())); page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); return page; @@ -24,7 +26,7 @@ public Page getMockJsonPage() throws IOException { public Page getMockPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset())); page.setRequest(new Request("http://webmagic.io/list/0")); page.setUrl(new PlainText("http://webmagic.io/list/0")); return page; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java index ab560e451..46476bbc8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.JsonPathSelector; -import java.util.List; - /** * @author code4crafter@gmail.com * @since 0.5.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 280f8f186..33dd6aa35 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java index 1822318c6..78c9d87c8 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,5 +1,14 @@ package us.codecraft.webmagic.scripts; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.Iterator; +import java.util.Map; +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptException; import 
org.apache.commons.io.IOUtils; import org.jruby.RubyHash; import org.python.core.PyDictionary; @@ -7,14 +16,6 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; -import javax.script.ScriptContext; -import javax.script.ScriptEngine; -import javax.script.ScriptException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Iterator; -import java.util.Map; - /** * @author code4crafter@gmail.com * @since 0.4.1 @@ -39,7 +40,7 @@ public ScriptProcessor(Language language, String script, int threadNum) { enginePool = new ScriptEnginePool(language, threadNum); InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile()); try { - defines = IOUtils.toString(resourceAsStream); + defines = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { throw new IllegalArgumentException(e); } diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java index 76b3e8640..4691528ad 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.scripts; -import org.apache.commons.io.IOUtils; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; +import org.apache.commons.io.IOUtils; + /** * @author code4crafter@gmail.com @@ -35,7 +37,7 @@ public ScriptProcessorBuilder language(Language language) { public ScriptProcessorBuilder scriptFromFile(String fileName) { try { InputStream resourceAsStream = new FileInputStream(fileName); - this.script = IOUtils.toString(resourceAsStream); + this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch 
(IOException e) { //wrap IOException because I prefer a runtime exception... throw new IllegalArgumentException(e); @@ -46,7 +48,7 @@ public ScriptProcessorBuilder scriptFromFile(String fileName) { public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) { try { InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName); - this.script = IOUtils.toString(resourceAsStream); + this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { //wrap IOException because I prefer a runtime exception... throw new IllegalArgumentException(e); From 1ece3ce344cd2b04c8d51a5954e4211f14dcb973 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 7 Jun 2022 19:18:20 +0000 Subject: [PATCH 131/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMALIBABA-2859222 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index db80066d9..bef91c3a0 100644 --- a/pom.xml +++ b/pom.xml @@ -105,7 +105,7 @@ com.alibaba fastjson - 1.2.75 + 1.2.83 com.github.dreamhead From 31cf237e7c7704bc2ba9dc53592e8a111fec0711 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Wed, 29 Jun 2022 19:41:24 +0000 Subject: [PATCH 132/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGJSOUP-1567345 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index bef91c3a0..dea6deac2 100644 --- a/pom.xml +++ b/pom.xml @@ -100,7 +100,7 @@ us.codecraft xsoup - 0.3.2 + 0.3.4 com.alibaba From 565300cbaa24f65277040d975d0a432c1a6f76b9 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 23 Jul 2022 05:43:39 +0000 Subject: [PATCH 133/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMSQUAREUPOKHTTP3-2958044 --- 
pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index dea6deac2..a2d39f4bc 100644 --- a/pom.xml +++ b/pom.xml @@ -163,7 +163,7 @@ org.seleniumhq.selenium selenium-java - 3.141.59 + 4.0.0 net.sf.saxon From 16221e391d58b624fb777ff9725f15d83eabbc6b Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 5 Aug 2022 01:03:00 +0800 Subject: [PATCH 134/257] Fix xhtml namespace. --- .../java/us/codecraft/webmagic/selector/Xpath2Selector.java | 1 + .../java/us/codecraft/webmagic/selector/XpathSelectorTest.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 1f1f0a572..9d5eef9b0 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -75,6 +75,7 @@ private void put(String prefix, String namespaceURI) { private XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); + put("xhtml", NamespaceConstant.XHTML); } @Override diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 32906b57a..166188361 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1376,7 +1376,7 @@ public void testXPath2() { @Test public void testXpath2Selector() { - Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); + Xpath2Selector xpath2Selector = new Xpath2Selector("//xhtml:a/@href"); String select = xpath2Selector.select(html); Assert.assertEquals("http://www.oschina.net/", select); From d01f26333bb75561e80596932397512f83b177d4 Mon Sep 17 
00:00:00 2001 From: "vio.ao" Date: Sat, 1 Oct 2022 00:21:17 +0800 Subject: [PATCH 135/257] Common the downloader status process and pass error information when onError --- .../downloader/AbstractDownloader.java | 4 +- .../downloader/HttpClientDownloader.java | 4 +- .../downloader/PhantomJSDownloader.java | 119 ++++------ .../selenium/SeleniumDownloader.java | 224 +++++++++--------- 4 files changed, 168 insertions(+), 183 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d09..2f9b11236 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public Html download(String url) { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public Html download(String url, String charset) { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable e) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e111..89b603894 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public Page download(Request request, Task task) { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request, e); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ protected Page handleResponse(Request request, String charset, 
HttpResponse http String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb0f..88b8237e2 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,73 +16,70 @@ * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default - private int retryNum; - private int threadNum; - public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + *

+ * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

      * crawl.js start --
-     * 
+     *
      *   var system = require('system');
      *   var url = system.args[1];
-     *   
+     *
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *   
+     *
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *   
+     *
      *       page.close();
      *       phantom.exit();
      *   });
-     *   
+     *
      * -- crawl.js end
      * 
* 具体项目时可以将以上js代码复制下来使用 - * + *

* example: - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); - * + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * * @param phantomJsCommand phantomJsCommand - * @param crawlJsPath crawlJsPath + * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; - PhantomJSDownloader.crawlJsPath = crawlJsPath; + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; } - + private void initPhantomjsCrawlPath() { - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + + System.getProperty("file.separator") + "crawl.js "; } @Override @@ -90,61 +87,41 @@ public Page download(Request request, Task task) { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } - String content = getPage(request); - if (content.contains("HTTP request failed")) { - for (int i = 1; i <= getRetryNum(); i++) { - content = getPage(request); - if (!content.contains("HTTP request failed")) { - break; - } - } - if (content.contains("HTTP request failed")) { - //when failed - Page page = new Page(); + + Page page = Page.fail(); + try { + String content = getPage(request); + if (!content.contains("HTTP request failed")) { + page.setDownloadSuccess(true); + page.setRawText(content); + page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - return page; + page.setStatusCode(200); } + onSuccess(request); + } catch (Exception e) { + onError(request, e); + logger.warn("download page {} error", request.getUrl(), e); } - - Page page = new Page(); - page.setRawText(content); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - 
page.setStatusCode(200); return page; } @Override public void setThread(int threadNum) { - this.threadNum = threadNum; + // ignore } - protected String getPage(Request request) { - try { - String url = request.getUrl(); - Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); - InputStream is = process.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - StringBuffer stringBuffer = new StringBuffer(); - String line; - while ((line = br.readLine()) != null) { - stringBuffer.append(line).append("\n"); - } - return stringBuffer.toString(); - } catch (IOException e) { - e.printStackTrace(); + protected String getPage(Request request) throws Exception { + String url = request.getUrl(); + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); + InputStream is = process.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + StringBuilder builder = new StringBuilder(); + String line; + while ((line = br.readLine()) != null) { + builder.append(line).append("\n"); } - - return null; - } - - public int getRetryNum() { - return retryNum; - } - - public PhantomJSDownloader setRetryNum(int retryNum) { - this.retryNum = retryNum; - return this; + return builder.toString(); } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index cce293fc9..df601b4fe 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import 
us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; @@ -24,112 +24,120 @@ * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ -public class SeleniumDownloader implements Downloader, Closeable { - - private volatile WebDriverPool webDriverPool; - - private Logger logger = LoggerFactory.getLogger(getClass()); - - private int sleepTime = 0; - - private int poolSize = 1; - - private static final String DRIVER_PHANTOMJS = "phantomjs"; - - /** - * 新建 - * - * @param chromeDriverPath chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", - chromeDriverPath); - } - - /** - * Constructor without any filed. Construct PhantomJS browser - * - * @author bob.li.0718@gmail.com - */ - public SeleniumDownloader() { - // System.setProperty("phantomjs.binary.path", - // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); - } - - /** - * set sleep time to wait until load success - * - * @param sleepTime sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } - - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver; - try { - webDriver = webDriverPool.get(); - } catch (InterruptedException e) { - logger.warn("interrupted", e); - return null; - } - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - Thread.sleep(sleepTime); - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies() - .entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), - cookieEntry.getValue()); - manage.addCookie(cookie); - } - } - - /* - * TODO You can add mouse event or other processes - * - * @author: bob.li.0718@gmail.com - */ - - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - Page page = new Page(); - 
page.setRawText(content); - page.setHtml(new Html(content, request.getUrl())); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - webDriverPool.returnToPool(webDriver); - return page; - } - - private void checkInit() { - if (webDriverPool == null) { - synchronized (this) { - webDriverPool = new WebDriverPool(poolSize); - } - } - } - - @Override - public void setThread(int thread) { - this.poolSize = thread; - } - - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } +public class SeleniumDownloader extends AbstractDownloader implements Closeable { + + private volatile WebDriverPool webDriverPool; + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } + + /** + * Constructor without any filed. 
Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } + + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver = null; + Page page = Page.fail(); + try { + webDriver = webDriverPool.get(); + + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + if (sleepTime > 0) { + Thread.sleep(sleepTime); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ + + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + page.setDownloadSuccess(true); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + onSuccess(request); + } catch (Exception e) { + logger.warn("download page {} error", request.getUrl(), e); + onError(request, e); + } finally { + if (webDriver != null) { + webDriverPool.returnToPool(webDriver); + } + } + return page; + } + + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + 
this.poolSize = thread; + } + + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } } From acfbd7b883436f2088ead0e5db95bcc1445769a5 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 1 Oct 2022 10:37:09 +0800 Subject: [PATCH 136/257] =?UTF-8?q?Revert=20"Common=20the=20downloader=20s?= =?UTF-8?q?tatus=20process=20and=20pass=20error=20information=20when=20?= =?UTF-8?q?=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../downloader/AbstractDownloader.java | 4 +- .../downloader/HttpClientDownloader.java | 4 +- .../downloader/PhantomJSDownloader.java | 119 ++++++---- .../selenium/SeleniumDownloader.java | 224 +++++++++--------- 4 files changed, 183 insertions(+), 168 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index 2f9b11236..c27292d09 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public Html download(String url) { /** * A simple method to download a url. 
* - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public Html download(String url, String charset) { protected void onSuccess(Request request) { } - protected void onError(Request request, Throwable e) { + protected void onError(Request request) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 89b603894..49217e111 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public Page download(Request request, Task task) { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); + onError(request); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()) { + if (!request.isBinaryContent()){ if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 88b8237e2..6055bdb0f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,70 +16,73 @@ * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + + private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default + private int retryNum; + private int threadNum; + public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - *

- * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + * + * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

      * crawl.js start --
-     *
+     * 
      *   var system = require('system');
      *   var url = system.args[1];
-     *
+     *   
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *
+     *   
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *
+     *   
      *       page.close();
      *       phantom.exit();
      *   });
-     *
+     *   
      * -- crawl.js end
      * 
* 具体项目时可以将以上js代码复制下来使用 - *

+ * * example: - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); - * + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * * @param phantomJsCommand phantomJsCommand - * @param crawlJsPath crawlJsPath + * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; - PhantomJSDownloader.crawlJsPath = crawlJsPath; + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; } - + private void initPhantomjsCrawlPath() { - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() - + System.getProperty("file.separator") + "crawl.js "; + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; } @Override @@ -87,41 +90,61 @@ public Page download(Request request, Task task) { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } - - Page page = Page.fail(); - try { - String content = getPage(request); - if (!content.contains("HTTP request failed")) { - page.setDownloadSuccess(true); - page.setRawText(content); - page.setUrl(new PlainText(request.getUrl())); + String content = getPage(request); + if (content.contains("HTTP request failed")) { + for (int i = 1; i <= getRetryNum(); i++) { + content = getPage(request); + if (!content.contains("HTTP request failed")) { + break; + } + } + if (content.contains("HTTP request failed")) { + //when failed + Page page = new Page(); page.setRequest(request); - page.setStatusCode(200); + return page; } - onSuccess(request); - } catch (Exception e) { - onError(request, e); - logger.warn("download page {} error", request.getUrl(), e); } + + Page page = new Page(); + page.setRawText(content); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + 
page.setStatusCode(200); return page; } @Override public void setThread(int threadNum) { - // ignore + this.threadNum = threadNum; } - protected String getPage(Request request) throws Exception { - String url = request.getUrl(); - Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); - InputStream is = process.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - StringBuilder builder = new StringBuilder(); - String line; - while ((line = br.readLine()) != null) { - builder.append(line).append("\n"); + protected String getPage(Request request) { + try { + String url = request.getUrl(); + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); + InputStream is = process.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + StringBuffer stringBuffer = new StringBuffer(); + String line; + while ((line = br.readLine()) != null) { + stringBuffer.append(line).append("\n"); + } + return stringBuffer.toString(); + } catch (IOException e) { + e.printStackTrace(); } - return builder.toString(); + + return null; + } + + public int getRetryNum() { + return retryNum; + } + + public PhantomJSDownloader setRetryNum(int retryNum) { + this.retryNum = retryNum; + return this; } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index df601b4fe..cce293fc9 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import 
us.codecraft.webmagic.downloader.AbstractDownloader; +import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; @@ -24,120 +24,112 @@ * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ -public class SeleniumDownloader extends AbstractDownloader implements Closeable { - - private volatile WebDriverPool webDriverPool; - - private Logger logger = LoggerFactory.getLogger(getClass()); - - private int sleepTime = 0; - - private int poolSize = 1; - - private static final String DRIVER_PHANTOMJS = "phantomjs"; - - /** - * 新建 - * - * @param chromeDriverPath chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", - chromeDriverPath); - } - - /** - * Constructor without any filed. Construct PhantomJS browser - * - * @author bob.li.0718@gmail.com - */ - public SeleniumDownloader() { - // System.setProperty("phantomjs.binary.path", - // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); - } - - /** - * set sleep time to wait until load success - * - * @param sleepTime sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } - - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver = null; - Page page = Page.fail(); - try { - webDriver = webDriverPool.get(); - - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - if (sleepTime > 0) { - Thread.sleep(sleepTime); - } - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies() - .entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), - cookieEntry.getValue()); - manage.addCookie(cookie); - } - } - - /* - * TODO You can add mouse event or other processes - * - * @author: bob.li.0718@gmail.com - */ - - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - page.setDownloadSuccess(true); - 
page.setRawText(content); - page.setHtml(new Html(content, request.getUrl())); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - onSuccess(request); - } catch (Exception e) { - logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); - } finally { - if (webDriver != null) { - webDriverPool.returnToPool(webDriver); - } - } - return page; - } - - private void checkInit() { - if (webDriverPool == null) { - synchronized (this) { - webDriverPool = new WebDriverPool(poolSize); - } - } - } - - @Override - public void setThread(int thread) { - this.poolSize = thread; - } - - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } +public class SeleniumDownloader implements Downloader, Closeable { + + private volatile WebDriverPool webDriverPool; + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } + + /** + * Constructor without any filed. 
Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } + + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver; + try { + webDriver = webDriverPool.get(); + } catch (InterruptedException e) { + logger.warn("interrupted", e); + return null; + } + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ + + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + Page page = new Page(); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + webDriverPool.returnToPool(webDriver); + return page; + } + + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } } From 7a62a6cb45b02466bc343ad7c7d1984e6f831594 
Mon Sep 17 00:00:00 2001 From: "vio.ao" Date: Sat, 1 Oct 2022 17:33:11 +0800 Subject: [PATCH 137/257] =?UTF-8?q?Revert=20"Revert=20"Common=20the=20down?= =?UTF-8?q?loader=20status=20process=20and=20pass=20error=20information=20?= =?UTF-8?q?when=20=E2=80=A6""?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit acfbd7b883436f2088ead0e5db95bcc1445769a5. --- .../downloader/AbstractDownloader.java | 4 +- .../downloader/HttpClientDownloader.java | 4 +- .../downloader/PhantomJSDownloader.java | 119 ++++------ .../selenium/SeleniumDownloader.java | 224 +++++++++--------- 4 files changed, 168 insertions(+), 183 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d09..2f9b11236 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public Html download(String url) { /** * A simple method to download a url. 
* - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public Html download(String url, String charset) { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable e) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e111..89b603894 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public Page download(Request request, Task task) { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request, e); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb0f..88b8237e2 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,73 +16,70 @@ * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default - private int retryNum; - private int threadNum; - public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + *

+ * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

      * crawl.js start --
-     * 
+     *
      *   var system = require('system');
      *   var url = system.args[1];
-     *   
+     *
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *   
+     *
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *   
+     *
      *       page.close();
      *       phantom.exit();
      *   });
-     *   
+     *
      * -- crawl.js end
      * 
* 具体项目时可以将以上js代码复制下来使用 - * + *

* example: - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); - * + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * * @param phantomJsCommand phantomJsCommand - * @param crawlJsPath crawlJsPath + * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; - PhantomJSDownloader.crawlJsPath = crawlJsPath; + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; } - + private void initPhantomjsCrawlPath() { - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + + System.getProperty("file.separator") + "crawl.js "; } @Override @@ -90,61 +87,41 @@ public Page download(Request request, Task task) { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } - String content = getPage(request); - if (content.contains("HTTP request failed")) { - for (int i = 1; i <= getRetryNum(); i++) { - content = getPage(request); - if (!content.contains("HTTP request failed")) { - break; - } - } - if (content.contains("HTTP request failed")) { - //when failed - Page page = new Page(); + + Page page = Page.fail(); + try { + String content = getPage(request); + if (!content.contains("HTTP request failed")) { + page.setDownloadSuccess(true); + page.setRawText(content); + page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - return page; + page.setStatusCode(200); } + onSuccess(request); + } catch (Exception e) { + onError(request, e); + logger.warn("download page {} error", request.getUrl(), e); } - - Page page = new Page(); - page.setRawText(content); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - 
page.setStatusCode(200); return page; } @Override public void setThread(int threadNum) { - this.threadNum = threadNum; + // ignore } - protected String getPage(Request request) { - try { - String url = request.getUrl(); - Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); - InputStream is = process.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - StringBuffer stringBuffer = new StringBuffer(); - String line; - while ((line = br.readLine()) != null) { - stringBuffer.append(line).append("\n"); - } - return stringBuffer.toString(); - } catch (IOException e) { - e.printStackTrace(); + protected String getPage(Request request) throws Exception { + String url = request.getUrl(); + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); + InputStream is = process.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + StringBuilder builder = new StringBuilder(); + String line; + while ((line = br.readLine()) != null) { + builder.append(line).append("\n"); } - - return null; - } - - public int getRetryNum() { - return retryNum; - } - - public PhantomJSDownloader setRetryNum(int retryNum) { - this.retryNum = retryNum; - return this; + return builder.toString(); } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index cce293fc9..df601b4fe 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import 
us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; @@ -24,112 +24,120 @@ * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ -public class SeleniumDownloader implements Downloader, Closeable { - - private volatile WebDriverPool webDriverPool; - - private Logger logger = LoggerFactory.getLogger(getClass()); - - private int sleepTime = 0; - - private int poolSize = 1; - - private static final String DRIVER_PHANTOMJS = "phantomjs"; - - /** - * 新建 - * - * @param chromeDriverPath chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", - chromeDriverPath); - } - - /** - * Constructor without any filed. Construct PhantomJS browser - * - * @author bob.li.0718@gmail.com - */ - public SeleniumDownloader() { - // System.setProperty("phantomjs.binary.path", - // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); - } - - /** - * set sleep time to wait until load success - * - * @param sleepTime sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } - - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver; - try { - webDriver = webDriverPool.get(); - } catch (InterruptedException e) { - logger.warn("interrupted", e); - return null; - } - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - Thread.sleep(sleepTime); - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies() - .entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), - cookieEntry.getValue()); - manage.addCookie(cookie); - } - } - - /* - * TODO You can add mouse event or other processes - * - * @author: bob.li.0718@gmail.com - */ - - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - Page page = new Page(); - 
page.setRawText(content); - page.setHtml(new Html(content, request.getUrl())); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - webDriverPool.returnToPool(webDriver); - return page; - } - - private void checkInit() { - if (webDriverPool == null) { - synchronized (this) { - webDriverPool = new WebDriverPool(poolSize); - } - } - } - - @Override - public void setThread(int thread) { - this.poolSize = thread; - } - - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } +public class SeleniumDownloader extends AbstractDownloader implements Closeable { + + private volatile WebDriverPool webDriverPool; + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } + + /** + * Constructor without any filed. 
Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } + + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver = null; + Page page = Page.fail(); + try { + webDriver = webDriverPool.get(); + + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + if (sleepTime > 0) { + Thread.sleep(sleepTime); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ + + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + page.setDownloadSuccess(true); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + onSuccess(request); + } catch (Exception e) { + logger.warn("download page {} error", request.getUrl(), e); + onError(request, e); + } finally { + if (webDriver != null) { + webDriverPool.returnToPool(webDriver); + } + } + return page; + } + + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + 
this.poolSize = thread; + } + + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } } From 5751681c9126e3c9ea1daeece2dc3eba10a281e7 Mon Sep 17 00:00:00 2001 From: "vio.ao" Date: Sat, 1 Oct 2022 17:34:02 +0800 Subject: [PATCH 138/257] Common the downloader status process and pass error information when onError --- .../us/codecraft/webmagic/samples/PhantomJSPageProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java index 99d5fa84e..ab5314073 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java @@ -36,7 +36,7 @@ public Site getSite() { } public static void main(String[] args) throws Exception { - PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3); + PhantomJSDownloader phantomDownloader = new PhantomJSDownloader(); CollectorPipeline collectorPipeline = new ResultItemsCollectorPipeline(); From e7a7fbeeeb6ebc1e8f2bc152d1b142f4e6590a10 Mon Sep 17 00:00:00 2001 From: "vio.ao" Date: Sat, 1 Oct 2022 18:23:15 +0800 Subject: [PATCH 139/257] Enhance Jsoup could parse tr td tag directly --- .../selector/BaseElementSelector.java | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index bbc7217ab..b267d5ba9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; 
+import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import java.util.ArrayList; @@ -11,11 +12,24 @@ * @since 0.3.0 */ public abstract class BaseElementSelector implements Selector, ElementSelector { + private Document parse(String text) { + if (text == null) { + return null; + } + + // Jsoup could not parse or tag directly + // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + if ((text.startsWith("") && text.endsWith("")) + || (text.startsWith("") && text.endsWith(""))) { + text = "" + text + "
"; + } + return Jsoup.parse(text); + } @Override public String select(String text) { if (text != null) { - return select(Jsoup.parse(text)); + return select(parse(text)); } return null; } @@ -23,7 +37,7 @@ public String select(String text) { @Override public List selectList(String text) { if (text != null) { - return selectList(Jsoup.parse(text)); + return selectList(parse(text)); } else { return new ArrayList(); } @@ -31,14 +45,14 @@ public List selectList(String text) { public Element selectElement(String text) { if (text != null) { - return selectElement(Jsoup.parse(text)); + return selectElement(parse(text)); } return null; } public List selectElements(String text) { if (text != null) { - return selectElements(Jsoup.parse(text)); + return selectElements(parse(text)); } else { return new ArrayList(); } From afc8309409ec495a9dab81fac0ba31ca094c5da7 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 1 Oct 2022 23:34:01 +0800 Subject: [PATCH 140/257] Upgrade maven plugins and dependencies. 
--- pom.xml | 70 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index 3774b4b25..1beac05e2 100644 --- a/pom.xml +++ b/pom.xml @@ -9,31 +9,31 @@ UTF-8 1.8 1.8 - 3.18.1 - 1.4 + 3.23.1 + 1.5.0 4.4 2.11.0 3.12.0 - 1.2.75 - 3.0.10 + 2.0.14.graal + 3.0.13 31.1-jre 2.26 4.5.13 - 4.4.14 + 4.4.15 3.7.1 - 9.2.14.0 - 2.6.0 + 9.3.8.0 + 2.7.0 4.13.2 - 2.7.2 + 2.7.3 1.2.17 - 1.10.19 - 1.1.0 + 2.0.2-beta + 1.3.0 1.2.0 - 10.3 + 11.4 3.141.59 - 1.7.36 + 2.0.3 4.0.0.RELEASE - 0.3.2 + 0.3.5 webmagic-parent webmagic-parent @@ -222,7 +222,7 @@ org.apache.maven.plugins maven-enforcer-plugin - 3.0.0-M3 + 3.1.0 enforce-maven @@ -296,7 +296,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.2.0 + 3.4.1 UTF-8 WebMagic ${project.version} @@ -325,7 +325,7 @@ org.apache.maven.plugins maven-release-plugin - 3.0.0-M1 + 3.0.0-M6 org.jacoco @@ -360,77 +360,77 @@ org.apache.maven.plugins maven-clean-plugin - 3.1.0 + 3.2.0 org.apache.maven.plugins maven-compiler-plugin - 3.8.1 + 3.10.1 org.apache.maven.plugins maven-deploy-plugin - 3.0.0-M1 + 3.0.0 org.apache.maven.plugins maven-install-plugin - 3.0.0-M1 + 3.0.1 org.apache.maven.plugins maven-jar-plugin - 3.2.0 + 3.3.0 org.apache.maven.plugins maven-jxr-plugin - 3.1.1 + 3.3.0 org.apache.maven.plugins maven-pmd-plugin - 3.14.0 + 3.19.0 org.apache.maven.plugins maven-resources-plugin - 3.2.0 + 3.3.0 org.apache.maven.plugins maven-site-plugin - 3.9.1 + 4.0.0-M3 org.apache.maven.plugins maven-surefire-plugin - 3.0.0-M5 + 3.0.0-M7 org.apache.maven.plugins maven-surefire-report-plugin - 3.0.0-M5 + 3.0.0-M7 org.codehaus.mojo taglist-maven-plugin - 2.4 + 3.0.0 org.jacoco jacoco-maven-plugin - 0.8.7 + 0.8.8 com.amashchenko.maven.plugin gitflow-maven-plugin - 1.15.0 + 1.18.0 com.github.spotbugs spotbugs-maven-plugin - 4.2.3 + 4.7.2.0 @@ -477,7 +477,7 @@ org.apache.maven.plugins maven-source-plugin - 2.2.1 + 3.2.1 package @@ -491,7 +491,7 @@ org.apache.maven.plugins 
maven-javadoc-plugin - 2.9.1 + 3.4.1 package @@ -505,7 +505,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.6 + 3.0.1 verify @@ -518,7 +518,7 @@ org.sonatype.plugins nexus-staging-maven-plugin - 1.6.8 + 1.6.13 true sonatype-nexus-staging From a255640c3d3d31815f39f0b0f9304ddbaa2a098e Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Mon, 3 Oct 2022 07:46:32 +0000 Subject: [PATCH 141/257] fix: webmagic-samples/pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-3038424 --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 7f7ceb228..f1ee73207 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -42,7 +42,7 @@ com.fasterxml.jackson.core jackson-databind - 2.13.2.1 + 2.13.4 From 223cfc609db9678dc3f3c98f2db02187ef2039f3 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 6 Oct 2022 04:58:32 +0000 Subject: [PATCH 142/257] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGJSOUP-2989728 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a2d39f4bc..aa695cfee 100644 --- a/pom.xml +++ b/pom.xml @@ -100,7 +100,7 @@ us.codecraft xsoup - 0.3.4 + 0.3.6 com.alibaba From 126c32ecd0ae68ec5a6eddf48486a210a18a4615 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 16 Oct 2022 07:20:22 +0800 Subject: [PATCH 143/257] Fix compatible issue. 
--- .../src/main/java/us/codecraft/webmagic/SpiderListener.java | 5 +++-- .../us/codecraft/webmagic/downloader/AbstractDownloader.java | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java index 8f10e0ef0..b55ef3d7f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java @@ -8,13 +8,14 @@ */ public interface SpiderListener { - public void onSuccess(Request request); + void onSuccess(Request request); /** * @deprecated Use {@link #onError(Request, Exception)} instead. */ @Deprecated - public void onError(Request request); + default void onError(Request request) { + } default void onError(Request request, Exception e) { this.onError(request); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index 2f9b11236..eb3a3a357 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -38,6 +38,11 @@ public Html download(String url, String charset) { protected void onSuccess(Request request) { } + @Deprecated + protected void onError(Request request) { + this.onError(request, null); + } + protected void onError(Request request, Throwable e) { } From a0ad7b808be7969f073f5d090ec2866d7fbd0a6e Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 18 Oct 2022 13:06:21 +0000 Subject: [PATCH 144/257] fix: webmagic-samples/pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-3038426 --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index f1ee73207..770314d6b 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -42,7 +42,7 @@ com.fasterxml.jackson.core jackson-databind - 2.13.4 + 2.13.4.2 From 5f80e02abd7093f66d798c44d46ff55cf75bb4c4 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 19 Oct 2022 22:08:38 +0800 Subject: [PATCH 145/257] Interrupt current thread. --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 00091c90a..fd35f7724 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -326,6 +326,7 @@ public void run() { Thread.sleep(emptySleepTime); continue; } catch (InterruptedException e) { + Thread.currentThread().interrupt(); break; } } @@ -493,6 +494,7 @@ protected void sleep(int time) { Thread.sleep(time); } catch (InterruptedException e) { logger.error("Thread interrupted when sleep",e); + Thread.currentThread().interrupt(); } } From d2b2eed9df619d85d9cb6d808f7ef4bdc0c50d5f Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 19 Oct 2022 22:10:04 +0800 Subject: [PATCH 146/257] Pass the task to onSuccess & onError. 
--- .../webmagic/downloader/AbstractDownloader.java | 16 ++++++++++++++-- .../downloader/HttpClientDownloader.java | 4 ++-- .../webmagic/downloader/PhantomJSDownloader.java | 4 ++-- .../downloader/selenium/SeleniumDownloader.java | 4 ++-- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index eb3a3a357..ea3bbc590 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -3,6 +3,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; /** @@ -35,15 +36,26 @@ public Html download(String url, String charset) { return (Html) page.getHtml(); } + @Deprecated protected void onSuccess(Request request) { } + /** + * @since 0.7.6 + */ + protected void onSuccess(Request request, Task task) { + this.onSuccess(request); + } + @Deprecated protected void onError(Request request) { - this.onError(request, null); } - protected void onError(Request request, Throwable e) { + /** + * @since 0.7.6 + */ + protected void onError(Request request, Task task, Throwable e) { + this.onError(request); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 89b603894..f138b2004 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -82,12 +82,12 @@ public Page download(Request request, Task task) { try { httpResponse = 
httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); - onSuccess(request); + onSuccess(request, task); logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); + onError(request, task, e); return page; } finally { if (httpResponse != null) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 88b8237e2..4f1eee8e6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -98,9 +98,9 @@ public Page download(Request request, Task task) { page.setRequest(request); page.setStatusCode(200); } - onSuccess(request); + onSuccess(request, task); } catch (Exception e) { - onError(request, e); + onError(request, task, e); logger.warn("download page {} error", request.getUrl(), e); } return page; diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index df601b4fe..39b3bc914 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -111,10 +111,10 @@ public Page download(Request request, Task task) { page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - onSuccess(request); + onSuccess(request, task); } 
catch (Exception e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); + onError(request, task, e); } finally { if (webDriver != null) { webDriverPool.returnToPool(webDriver); From 838c47f1f6a6274f0d18f432c00729957be0e90d Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 23 Oct 2022 23:58:21 +0800 Subject: [PATCH 147/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 1beac05e2..23944598b 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 64b8013f2..fe1ff12cf 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e6e606825..289d2759f 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.6-SNAPSHOT + 0.7.6 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 741b081d8..fc5d9b761 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c5582c0b3..6b3af8344 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index d4d3efa18..893fc0b7b 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 
0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index fe4ef6840..80b9eefe4 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index be3637692..36ded0005 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 From 5d5f3bf20e08e37b44b2807021b056cbc696a5f9 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 24 Oct 2022 00:17:21 +0800 Subject: [PATCH 148/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 226c851d7..25b0c3a88 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index fe1ff12cf..d2cf2cd3c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 289d2759f..cff8f74ea 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.7.7-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index fc5d9b761..335d47df4 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c726c07b1..c216ac6e5 100644 --- a/webmagic-samples/pom.xml +++ 
b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 893fc0b7b..f1951edf3 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 0022a43b3..b9e8e435b 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 36ded0005..ff193caf7 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 From ac912e8f1fade5be3b0d8df521819f4b01ec6fba Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 12 Nov 2022 10:17:36 +0800 Subject: [PATCH 149/257] Revise QueueScheduler to support capacity-restricted. --- .../webmagic/scheduler/QueueScheduler.java | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index f9ad0e98f..8ea3ab195 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -16,11 +16,30 @@ */ public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { - private BlockingQueue queue = new LinkedBlockingQueue(); + private final BlockingQueue queue; + + public QueueScheduler() { + this.queue = new LinkedBlockingQueue<>(); + } + + /** + * Creates a {@code QueueScheduler} with the given (fixed) capacity. 
+ * + * @param capacity the capacity of this queue, + * see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)} + * @since 0.8.0 + */ + public QueueScheduler(int capacity) { + this.queue = new LinkedBlockingQueue<>(capacity); + } @Override public void pushWhenNoDuplicate(Request request, Task task) { - queue.add(request); + try { + queue.put(request); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } } @Override From 075b98291bbc920fb3d49957778e633bb9a3d205 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 12 Nov 2022 11:06:08 +0800 Subject: [PATCH 150/257] Return spider in setEmptySleepTime itself for chainning. --- .../src/main/java/us/codecraft/webmagic/Spider.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index fd35f7724..9f9201ee3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -333,9 +333,10 @@ public void run() { } } else { // wait until new url added, - if (waitNewUrl()) - //if interrupted + if (waitNewUrl()) { + //if interrupted break; + } continue; } } @@ -805,11 +806,13 @@ public Scheduler getScheduler() { * Set wait time when no url is polled.

* * @param emptySleepTime In MILLISECONDS. + * @return this */ - public void setEmptySleepTime(long emptySleepTime) { + public Spider setEmptySleepTime(long emptySleepTime) { if(emptySleepTime<=0){ throw new IllegalArgumentException("emptySleepTime should be more than zero!"); } this.emptySleepTime = emptySleepTime; + return this; } } From 4915431845ac035bc5b9379c809edfb4a0f19603 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 15 Nov 2022 22:48:02 +0800 Subject: [PATCH 151/257] Revise logging level from warn to info, as we have passed the exception to onError. refs #1094 --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index f138b2004..72821f3c1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -82,12 +82,16 @@ public Page download(Request request, Task task) { try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); + onSuccess(request, task); logger.info("downloading page success {}", request.getUrl()); + return page; } catch (IOException e) { - logger.warn("download page {} error", request.getUrl(), e); + onError(request, task, e); + logger.info("download page {} error", request.getUrl(), e); + return page; } finally { if (httpResponse != null) { From e735e4e585f965ba3dabbd2faae3ad6665a4681b Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 20 Nov 2022 18:31:36 +0800 Subject: [PATCH 152/257] Log the remaining capacity. 
--- .../us/codecraft/webmagic/scheduler/QueueScheduler.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 8ea3ab195..04d5b36bc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,11 +1,10 @@ package us.codecraft.webmagic.scheduler; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; - import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; /** * Basic Scheduler implementation.
@@ -35,6 +34,8 @@ public QueueScheduler(int capacity) { @Override public void pushWhenNoDuplicate(Request request, Task task) { + logger.trace("Remaining capacity: {}", this.queue.remainingCapacity()); + try { queue.put(request); } catch (InterruptedException e) { From 64e6a9800a38ceb3e57f2f9f360b7212c2cc61c2 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:26:09 +0800 Subject: [PATCH 153/257] Add dead-lock note for QueueScheduler. --- .../java/us/codecraft/webmagic/scheduler/QueueScheduler.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 04d5b36bc..19d3bc732 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -4,12 +4,17 @@ import java.util.concurrent.LinkedBlockingQueue; import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** * Basic Scheduler implementation.
* Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap. * + * Note: if you use this {@link QueueScheduler} + * with {@link Site#getCycleRetryTimes()} enabled, you may encounter a deadlock + * when the queue is full. + * * @author code4crafter@gmail.com
* @since 0.1.0 */ From 888682863c6bfd6a33b0314dcd9b672a50c80e2f Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:30:07 +0800 Subject: [PATCH 154/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 25b0c3a88..215b483f8 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index d2cf2cd3c..997eb812c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index cff8f74ea..e2c0f741c 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.7-SNAPSHOT + 0.8.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 335d47df4..05d6100a6 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c216ac6e5..449fcf243 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index f1951edf3..b73f6fd27 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index b9e8e435b..3ec15f9af 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ 
-3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index ff193caf7..715d7731b 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 From a7a06936f07152469daeaa85fd67a0b737231aa3 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:33:30 +0800 Subject: [PATCH 155/257] Fix requireMavenVersion. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 215b483f8..8f928be6a 100644 --- a/pom.xml +++ b/pom.xml @@ -232,7 +232,7 @@ - 3.3.9 + 3.5.0 From 7d091def55709609c2894f619aaa8518a641769e Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:42:41 +0800 Subject: [PATCH 156/257] Upgrade fastjson, jruby, slf4j. --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 8f928be6a..68bf76d9c 100644 --- a/pom.xml +++ b/pom.xml @@ -14,14 +14,14 @@ 4.4 2.11.0 3.12.0 - 2.0.14.graal + 2.0.19.graal 3.0.13 31.1-jre 2.26 4.5.13 4.4.15 3.7.1 - 9.3.8.0 + 9.3.9.0 2.7.0 4.13.2 2.7.3 @@ -31,7 +31,7 @@ 1.2.0 11.4 3.141.59 - 2.0.3 + 2.0.4 4.0.0.RELEASE 0.3.5 From cda8467f9568ff5f532d155a37868c7ed0435654 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:49:43 +0800 Subject: [PATCH 157/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 68bf76d9c..cdf618211 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 997eb812c..7fe2ba6ff 100644 --- 
a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e2c0f741c..c6b70bce1 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 05d6100a6..daf0c7fdc 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 449fcf243..e015567c2 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index b73f6fd27..732c23bd0 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3ec15f9af..d1225dda2 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 715d7731b..a430772b6 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 From faf7e1559aa98a3bae6421fab1396257324e7273 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 20:31:43 +0800 Subject: [PATCH 158/257] Update README for the webmagic version. 
--- README-zh.md | 4 ++-- README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README-zh.md b/README-zh.md index 62b3c9a5e..c3c4b72ea 100644 --- a/README-zh.md +++ b/README-zh.md @@ -39,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.5 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.5 + ${webmagic.version} ``` diff --git a/README.md b/README.md index 14aeac7b1..750a76841 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.5 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.5 + ${webmagic.version} ``` From ef616c999e18bb9a7a351049749b3796d3abb977 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 27 Nov 2022 02:05:31 +0800 Subject: [PATCH 159/257] Fix warnings. --- .../webmagic/monitor/SpiderMonitor.java | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index b213dda94..50dbcaf1a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -1,21 +1,25 @@ package us.codecraft.webmagic.monitor; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.SpiderListener; -import us.codecraft.webmagic.utils.Experimental; -import us.codecraft.webmagic.utils.UrlUtils; - -import javax.management.*; import java.lang.management.ManagementFactory; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import javax.management.InstanceAlreadyExistsException; +import 
javax.management.JMException; +import javax.management.MBeanRegistrationException; +import javax.management.MBeanServer; +import javax.management.MalformedObjectNameException; +import javax.management.NotCompliantMBeanException; +import javax.management.ObjectName; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.SpiderListener; +import us.codecraft.webmagic.utils.Experimental; +import us.codecraft.webmagic.utils.UrlUtils; + /** * @author code4crafer@gmail.com * @since 0.5.0 @@ -23,17 +27,13 @@ @Experimental public class SpiderMonitor { - private static SpiderMonitor INSTANCE = new SpiderMonitor(); - - private AtomicBoolean started = new AtomicBoolean(false); - - private Logger logger = LoggerFactory.getLogger(getClass()); + private static final SpiderMonitor INSTANCE = new SpiderMonitor(); private MBeanServer mbeanServer; private String jmxServerName; - private List spiderStatuses = new ArrayList(); + private List spiderStatuses = new ArrayList<>(); protected SpiderMonitor() { jmxServerName = "WebMagic"; @@ -51,7 +51,7 @@ public synchronized SpiderMonitor register(Spider... 
spiders) throws JMException for (Spider spider : spiders) { MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); if (spider.getSpiderListeners() == null) { - List spiderListeners = new ArrayList(); + List spiderListeners = new ArrayList<>(); spiderListeners.add(monitorSpiderListener); spider.setSpiderListeners(spiderListeners); } else { @@ -90,7 +90,7 @@ public void onSuccess(Request request) { } @Override - public void onError(Request request) { + public void onError(Request request, Exception e) { errorUrls.add(request.getUrl()); errorCount.incrementAndGet(); } @@ -109,7 +109,6 @@ public List getErrorUrls() { } protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException { -// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName())); mbeanServer.registerMBean(spiderStatus, objName); } From 80424b0bd7242ae3f92055baabcedbf6e4a5913b Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 5 Dec 2022 23:26:01 +0800 Subject: [PATCH 160/257] Replace List with Iterable, fixed #1099. --- .../src/main/java/us/codecraft/webmagic/Page.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..6370171df 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -20,7 +20,7 @@ * {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
+ * {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch
* * @author code4crafter@gmail.com
* @see us.codecraft.webmagic.downloader.Downloader @@ -52,7 +52,7 @@ public class Page { private List targetRequests = new ArrayList(); private String charset; - + public Page() { } @@ -108,7 +108,8 @@ public Json getJson() { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ - public void setHtml(Html html) { + @Deprecated + public void setHtml(Html html) { this.html = html; } @@ -121,7 +122,7 @@ public List getTargetRequests() { * * @param requests requests */ - public void addTargetRequests(List requests) { + public void addTargetRequests(Iterable requests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; @@ -137,7 +138,7 @@ public void addTargetRequests(List requests) { * @param requests requests * @param priority priority */ - public void addTargetRequests(List requests, long priority) { + public void addTargetRequests(Iterable requests, long priority) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; From a266df406ff4641d751c0607d203930fd0e7d7a5 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 20 Dec 2022 23:41:31 +0800 Subject: [PATCH 161/257] Add Site.defaultCharset. closes #1101. 
--- .../main/java/us/codecraft/webmagic/Site.java | 26 +++++++++++++++++++ .../downloader/HttpClientDownloader.java | 9 ++++--- .../java/us/codecraft/webmagic/SiteTest.java | 17 ++++++++++++ 3 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4879b2825..230337756 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -28,6 +28,8 @@ public class Site { private String charset; + private String defaultCharset; + private int sleepTime = 5000; private int retryTimes = 0; @@ -168,6 +170,30 @@ public String getCharset() { return charset; } + /** + * Set default charset of page. + * + * When charset detect failed, use this default charset. + * + * @param defaultCharset the default charset + * @return this + * @since 0.9.0 + */ + public Site setDefaultCharset(String defaultCharset) { + this.defaultCharset = defaultCharset; + return this; + } + + /** + * The default charset if charset detected failed. 
+ * + * @return the defulat charset + * @since 0.9.0 + */ + public String getDefaultCharset() { + return defaultCharset; + } + public int getTimeOut() { return timeOut; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 72821f3c1..bfd24f01c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -4,6 +4,7 @@ import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; @@ -116,7 +117,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { - charset = getHtmlCharset(contentType, bytes); + charset = getHtmlCharset(contentType, bytes, task); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); @@ -131,11 +132,11 @@ protected Page handleResponse(Request request, String charset, HttpResponse http return page; } - private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { - charset = Charset.defaultCharset().name(); - logger.warn("Charset autodetect failed, use {} as charset. 
Please specify charset in Site.setCharset()", Charset.defaultCharset()); + charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); + logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset()); } return charset; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java new file mode 100644 index 000000000..783b82ddc --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; + +import org.junit.Test; + +public class SiteTest { + + @Test + public void test() { + Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); + } + +} From 12ce86425f4f5b09be06e49f0d19e84dfa10c54b Mon Sep 17 00:00:00 2001 From: hooyantsing Date: Fri, 3 Feb 2023 22:48:58 +0800 Subject: [PATCH 162/257] =?UTF-8?q?BugFix:=20Jsoup=20=E5=92=8C=20HtmlClean?= =?UTF-8?q?er=20=E6=9E=84=E5=BB=BA=20Dom=20=E6=97=B6=EF=BC=8C=E8=8B=A5?= =?UTF-8?q?=E7=BC=BA=E5=A4=B1=20table=20=E6=A0=87=E7=AD=BE=EF=BC=8C?= =?UTF-8?q?=E5=88=99=E6=97=A0=E6=B3=95=E6=AD=A3=E5=B8=B8=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=20tr=20=E5=92=8C=20td=20=E6=A0=87=E7=AD=BE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../selector/BaseElementSelector.java | 10 ++----- .../webmagic/utils/BaseSelectorUtils.java | 23 +++++++++++++++ .../webmagic/selector/Xpath2Selector.java | 28 +++++++++++-------- .../webmagic/selector/XpathSelectorTest.java | 19 +++++++++++++ 4 files changed, 60 insertions(+), 20 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index b267d5ba9..6001767d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -3,6 +3,7 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import us.codecraft.webmagic.utils.BaseSelectorUtils; import java.util.ArrayList; import java.util.List; @@ -13,16 +14,9 @@ */ public abstract class BaseElementSelector implements Selector, ElementSelector { private Document parse(String text) { - if (text == null) { - return null; - } - // Jsoup could not parse or tag directly // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag - if ((text.startsWith("") && text.endsWith("")) - || (text.startsWith("") && text.endsWith(""))) { - text = "" + text + "
"; - } + text = BaseSelectorUtils.preParse(text); return Jsoup.parse(text); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java new file mode 100644 index 000000000..04c0651c3 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.utils; + +/** + * @author hooy + */ +public class BaseSelectorUtils { + + /** + * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly + * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + * + * @param text - the html string + * @return text + */ + public static String preParse(String text) { + if (((text.startsWith("") || text.startsWith("")) + || ((text.startsWith("") || text.startsWith(""))) { + text = "" + text + "
"; + } + return text; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 9d5eef9b0..b63213b62 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -8,6 +8,7 @@ import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; @@ -29,13 +30,14 @@ import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; +import us.codecraft.webmagic.utils.BaseSelectorUtils; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * Date: 13-4-21 + * Time: 上午9:39 */ public class Xpath2Selector implements Selector { @@ -111,14 +113,11 @@ private void init() throws XPathExpressionException { @Override public String select(String text) { try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -147,14 +146,11 @@ public String select(String text) { public List selectList(String text) { List results = new ArrayList(); try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -179,4 +175,12 @@ public List selectList(String text) { } return results; } + + private Document parse(String text) throws ParserConfigurationException { + // HtmlCleaner could not parse or tag directly + text = BaseSelectorUtils.preParse(text); + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + return new DomSerializer(new CleanerProperties()).createDOM(tagNode); + 
} } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 166188361..c2025e7c6 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,6 +11,9 @@ import org.junit.Ignore; import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -1385,6 +1388,22 @@ public void testXpath2Selector() { Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } + @Ignore("test parse or tag directly text = BaseSelectorUtils.preParse(text); HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); return new DomSerializer(new CleanerProperties()).createDOM(tagNode); } + } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 8ac721934..4033fcfbd 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,12 +11,15 @@ import org.junit.Ignore; import org.junit.Test; +import org.w3c.dom.Node; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; +import javax.xml.transform.TransformerException; + /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 */ @@ -1388,23 +1391,6 @@ public void testXpath2Selector() { Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } - @Ignore("test parse
tag") + @Test + public void htmlCleanerParseTest() { + Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); + } + class RuoxiaPageProcessor implements PageProcessor { + @Override + public void process(Page page) { + List nodes = page.getHtml().xpath("//div[@class=\"bd\"]//tbody/tr").nodes(); + for (Selectable node:nodes) { + String name = node.xpath("//td[3]/div/a[1]/text()").get(); + System.out.println(name); + } + } + } + @Ignore("take long time") @Test public void performanceTest() { From 08f4a4046b4cb13a81684533534a7d51640c3e04 Mon Sep 17 00:00:00 2001 From: hooyantsing Date: Fri, 3 Feb 2023 22:59:56 +0800 Subject: [PATCH 163/257] =?UTF-8?q?Update:=20=E6=8F=90=E4=BE=9B=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E7=94=A8=E4=BE=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/selector/XpathSelectorTest.java | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index c2025e7c6..8ac721934 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1393,12 +1393,13 @@ public void testXpath2Selector() { public void htmlCleanerParseTest() { Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); } + class RuoxiaPageProcessor implements PageProcessor { @Override public void process(Page page) { - List nodes = page.getHtml().xpath("//div[@class=\"bd\"]//tbody/tr").nodes(); - for (Selectable node:nodes) { - String name = node.xpath("//td[3]/div/a[1]/text()").get(); + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(page.getRawText()); + for (String item : 
items) { + String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(item); System.out.println(name); } } @@ -1408,31 +1409,31 @@ public void process(Page page) { @Test public void performanceTest() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); XpathSelector xpathSelector = new XpathSelector("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis() - time); CssSelector cssSelector = new CssSelector("a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { cssSelector.selectList(html); } - System.out.println("css "+(System.currentTimeMillis()-time)); + System.out.println("css " + (System.currentTimeMillis() - time)); } @Ignore("take long time") @@ -1444,54 +1445,54 @@ public void parserPerformanceTest() throws XPatherException { TagNode tagNode = htmlCleaner.clean(html); Document document = Jsoup.parse(html); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + 
System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { Jsoup.parse(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { document.select("a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); XPathEvaluator compile = Xsoup.compile("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { compile.evaluate(document); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); } From 717931166a5ea6e0931f85cb3efc195982ca7b91 Mon Sep 17 00:00:00 2001 From: hooy <56918789+hooyantsing@users.noreply.github.com> Date: Sat, 11 Feb 2023 02:14:11 +0800 Subject: [PATCH 164/257] =?UTF-8?q?=E5=90=91=20webmagic-saxon=20=E7=BB=84?= =?UTF-8?q?=E4=BB=B6=E6=8F=90=E4=BE=9B=E8=8B=A5=E5=B9=B2=E6=96=B0=20API?= =?UTF-8?q?=EF=BC=8C=E6=9B=B4=E4=BC=98=E9=9B=85=E6=9B=B4=E7=81=B5=E6=B4=BB?= =?UTF-8?q?=E6=9B=B4=E5=BC=BA=E5=A4=A7=20(#1108)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Feature: * webmagic-saxon 
组件新增若干新 API; * Update: 更优雅的写代码。 * Update: JaxpSelectorUtils 工具类增加 final 关键字。 --- .../webmagic/selector/JaxpSelectorUtils.java | 61 +++++++ .../webmagic/selector/NodeSelector.java | 32 ++++ .../webmagic/selector/Xpath2Selector.java | 155 ++++++++++-------- .../webmagic/selector/XpathSelectorTest.java | 57 +++++-- 4 files changed, 216 insertions(+), 89 deletions(-) create mode 100644 webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java create mode 100644 webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java new file mode 100644 index 000000000..b03f3a2ab --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * @author hooy + */ +public final class JaxpSelectorUtils { + + private JaxpSelectorUtils() { + throw new RuntimeException("The util class cannot be instanced"); + } + + public static List NodeListToArrayList(NodeList nodes) { + List list = new ArrayList<>(nodes.getLength()); + for (int i = 0; i < nodes.getLength(); i++) { + list.add(nodes.item(i)); + } + return list; + } + + public static String nodeToString(Node node) throws TransformerException { + List before = Collections.singletonList(node); + List after = nodesToStrings(before); + if (after.size() > 0) { + return after.get(0); 
+ } else { + return null; + } + } + + public static List nodesToStrings(List nodes) throws TransformerException { + List results = new ArrayList<>(nodes.size()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (Node node : nodes) { + if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) { + results.add(node.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(node), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } + return results; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java new file mode 100644 index 000000000..3e6339dda --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; + +import java.util.List; + +/** + * Selector(extractor) for html node.
+ * + * @author hooy
+ * @since 0.8.0 + */ +public interface NodeSelector { + + /** + * Extract single result in text.
+ * If there are more than one result, only the first will be chosen. + * + * @param node node + * @return result + */ + String select(Node node); + + /** + * Extract all results in text.
+ * + * @param node node + * @return results + */ + List selectList(Node node); + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index b63213b62..6c5d7b332 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,19 +1,10 @@ package us.codecraft.webmagic.selector; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; @@ -32,20 +23,22 @@ import net.sf.saxon.xpath.XPathEvaluator; import us.codecraft.webmagic.utils.BaseSelectorUtils; +import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*; + /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* - * @author code4crafter@gmail.com
+ * @author code4crafter@gmail.com, hooy
* Date: 13-4-21 * Time: 上午9:39 */ -public class Xpath2Selector implements Selector { +public class Xpath2Selector implements Selector, NodeSelector { - private String xpathStr; + private final String xpathStr; private XPathExpression xPathExpression; - private Logger logger = LoggerFactory.getLogger(getClass()); + private final Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; @@ -56,25 +49,25 @@ public Xpath2Selector(String xpathStr) { } } + public static Xpath2Selector newInstance(String xpathStr) { + return new Xpath2Selector(xpathStr); + } + enum XPath2NamespaceContext implements NamespaceContext { INSTANCE; - private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + private final Map prefix2NamespaceMap = new ConcurrentHashMap<>(); - private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + private final Map> namespace2PrefixMap = new ConcurrentHashMap<>(); private void put(String prefix, String namespaceURI) { prefix2NamespaceMap.put(prefix, namespaceURI); - List prefixes = namespace2PrefixMap.get(namespaceURI); - if (prefixes == null) { - prefixes = new ArrayList(); - namespace2PrefixMap.put(namespaceURI, prefixes); - } + List prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>()); prefixes.add(prefix); } - private XPath2NamespaceContext() { + XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); put("xhtml", NamespaceConstant.XHTML); @@ -113,29 +106,18 @@ private void init() throws XPathExpressionException { @Override public String select(String text) { try { - Object result; - try { - result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - if (nodeList.getLength() == 0) { - return null; 
- } - Node item = nodeList.item(0); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - return item.getTextContent(); - } else { - StreamResult xmlOutput = new StreamResult(new StringWriter()); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - transformer.transform(new DOMSource(item), xmlOutput); - return xmlOutput.getWriter().toString(); - } - } - return result.toString(); + Document doc = parse(text); + return select(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public String select(Node node) { + try { + return (String) xPathExpression.evaluate(node, XPathConstants.STRING); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } @@ -144,43 +126,72 @@ public String select(String text) { @Override public List selectList(String text) { - List results = new ArrayList(); try { - Object result; - try { - result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - StreamResult xmlOutput = new StreamResult(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - for (int i = 0; i < nodeList.getLength(); i++) { - Node item = nodeList.item(i); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - results.add(item.getTextContent()); - } else { - xmlOutput.setWriter(new StringWriter()); - transformer.transform(new DOMSource(item), xmlOutput); - results.add(xmlOutput.getWriter().toString()); - } - } - } else { - results.add(result.toString()); - } + Document doc = parse(text); + return 
selectList(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public List selectList(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + List nodes = NodeListToArrayList(result); + return nodesToStrings(nodes); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } - return results; + return null; } - private Document parse(String text) throws ParserConfigurationException { + public Node selectNode(String text) { + try { + Document doc = parse(text); + return selectNode(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public Node selectNode(Node node) { + try { + return (Node) xPathExpression.evaluate(node, XPathConstants.NODE); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(String text) { + try { + Document doc = parse(text); + return selectNodes(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + return NodeListToArrayList(result); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + protected static Document parse(String text) throws ParserConfigurationException { // HtmlCleaner could not parse
tag") - @Test - public void htmlCleanerParseTest() { - Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); - } - - class RuoxiaPageProcessor implements PageProcessor { - @Override - public void process(Page page) { - List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(page.getRawText()); - for (String item : items) { - String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(item); - System.out.println(name); - } - } - } - @Ignore("take long time") @Test public void performanceTest() { @@ -1496,4 +1482,41 @@ public void parserPerformanceTest() throws XPatherException { } + /** + * New api test + * + * @author hooy + * @since 8.0 + */ + private String rank = "

点击榜

排名分类书名/最新章节作者推荐更新时间
1.现实
0
11-24 22:32
2.架空
1047
03-04 14:44
3.现实
0
07-20 09:06
4.豪门
0
12-03 09:12
5.现实
0
02-01 21:12
6.玄奇
3455
02-28 12:31
7.玄奇
20614
03-31 12:37
8.复仇
55
06-03 11:43
9.穿越
0
10-27 18:50
10.宫斗
320
10-31 13:58
11.宫斗
6268
07-12 20:23
12.现实
0
01-18 23:00
13.婚恋
0
12-14 20:50
14.修真
0
02-03 23:40
15.豪门
0
11-06 23:38
16.穿越
191
12-02 23:37
17.穿越
412
10-13 22:39
18.豪门
635
07-01 13:15
19.架空
144
06-18 09:35
20.宅斗
1032
08-15 19:03
21.宫斗
0
09-30 20:32
22.豪门
0
06-05 11:31
23.重生
80
11-25 19:56
24.异世
68
01-12 10:06
25.豪门
0
05-29 18:46
26.婚恋
2778
11-04 17:48
27.玄奇
207
12-06 16:57
28.穿越
260
01-04 23:26
29.豪门
0
12-07 21:39
30.架空
1127
06-06 17:28
31.穿越
113
09-13 09:06
32.架空
597
02-14 18:47
33.玄奇
528
06-04 22:04
34.穿越
328
06-06 22:09
35.架空
539
05-24 14:42
36.架空
0
03-05 23:27
37.穿越
3215
08-21 16:38
38.宫斗
905
08-04 20:24
39.玄奇
1328
07-25 10:58
40.穿越
203
01-27 20:53
41.宫斗
407
08-31 09:03
42.宅斗
16
05-03 17:38
43.豪门
0
11-10 08:00
44.婚恋
0
07-12 21:37
45.架空
0
06-23 21:02
46.玄奇
1382
05-31 20:36
47.重生
334
07-16 19:19
48.婚恋
505
11-01 16:42
49.婚恋
0
10-19 18:32
50.豪门
540
09-19 19:18
51.婚恋
226
03-18 13:09
52.穿越
1026
03-08 16:28
53.重生
304
02-19 10:25
54.玄奇
2617
02-15 20:57
55.穿越
199
09-04 19:43
56.同人
768
07-19 20:00
57.宅斗
0
02-13 18:13
58.豪门
0
11-12 22:23
59.架空
0
07-28 23:42
60.婚恋
0
02-03 23:09
61.豪门
285
01-07 19:21
62.重生
654
10-12 18:16
63.异能
617
06-18 20:23
64.宫斗
27
06-02 21:05
65.种田
206
08-31 19:23
66.宅斗
2444
08-19 15:51
67.宅斗
818
08-07 23:38
68.现代
0
12-23 17:02
69.玄奇
0
07-23 12:00
70.婚恋
0
11-01 16:43
71.豪门
0
09-12 00:01
72.架空
0
04-27 22:42
73.豪门
0
04-19 13:55
74.异能
62
07-30 00:00
75.穿越
1307
07-20 16:41
76.玄奇
12820
07-15 23:46
77.架空
828
06-06 17:54
78.宅斗
985
05-20 23:53
79.玄奇
4960
04-12 15:58
80.玄奇
245
03-02 23:11
81.宅斗
34
12-21 10:11
82.宅斗
1411
07-21 00:00
83.现代
0
07-31 10:10
84.玄奇
0
06-18 13:53
85.架空
0
12-03 23:41
86.玄奇
0
11-28 22:13
87.豪门
0
11-07 22:48
88.婚恋
0
08-29 23:15
89.种田
1831
08-21 16:38
90.豪门
0
07-11 21:25
91.豪门
0
06-13 15:37
92.豪门
0
05-07 22:10
93.豪门
0
02-28 00:01
94.豪门
304
12-16 07:30
95.婚恋
669
11-07 18:16
96.仙侠
54
09-25 19:51
97.豪门
655
08-31 13:02
98.现实
374
06-29 09:55
99.穿越
373
06-19 18:07
100.婚恋
159
06-04 21:05
"; + + @Test + public void testStringAPI() { + // testAPI: selectList(String) -> selectList(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(rank); + Assert.assertSame(100, items.size()); + // testAPI: select(String) -> select(Node) + String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(items.get(10)); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testNodeAPI() { + // testAPI: selectNodes(String) -> selectNodes(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectNodes(rank); + Assert.assertSame(100, items.size()); + // testAPI: selectNode(Node) + Node item = new Xpath2Selector("./td[3]/div/a[1]").selectNode(items.get(10)); + String name = new Xpath2Selector("./text()").select(item); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testUtilAPI() throws TransformerException { + Node item = Xpath2Selector.newInstance("//div[@class=\"bd\"]//tbody/tr[11]/td[3]/div/a[1]/text()").selectNode(rank); + // testAPI: nodeToString(Node) -> nodesToStrings(List) + String name = JaxpSelectorUtils.nodeToString(item); + Assert.assertEquals("深宫安容传", name); + } + } From 244ade7b4c88d21bd676a5ea128a8ac2a8f53456 Mon Sep 17 00:00:00 2001 From: Tanky-Zhang <48041180+Tanky-Zhang@users.noreply.github.com> Date: Wed, 22 Mar 2023 22:25:51 +0800 Subject: [PATCH 165/257] feat:update host verify (#1112) --- .../downloader/HttpClientGenerator.java | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 80e0f1085..167a5e1c6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,16 +1,5 @@ package us.codecraft.webmagic.downloader; -import 
java.io.IOException; -import java.security.KeyManagementException; -import java.security.NoSuchAlgorithmException; -import java.security.cert.CertificateException; -import java.security.cert.X509Certificate; -import java.util.Map; - -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - import org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; @@ -22,28 +11,32 @@ import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; -import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.impl.client.BasicCookieStore; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import us.codecraft.webmagic.Site; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.Map; + /** * @author code4crafter@gmail.com
* @since 0.4.0 */ public class HttpClientGenerator { - private transient Logger logger = LoggerFactory.getLogger(getClass()); + private transient Logger logger = LoggerFactory.getLogger(getClass()); private PoolingHttpClientConnectionManager connectionManager; @@ -61,21 +54,20 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { SSLContext sslContext = createIgnoreVerifySSL(); String[] supportedProtocols; if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) { - supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" }; + supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}; } else { - supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; + supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}; } logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols)); return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, - new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { - logger.error("ssl connection fail", e); - } catch (NoSuchAlgorithmException e) { + //不进行主机校验 + (host, sslSession) -> true); // 优先绕过安全证书 + } catch (KeyManagementException | NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } return SSLConnectionSocketFactory.getSocketFactory(); - } + } private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 @@ -97,9 +89,9 @@ public X509Certificate[] getAcceptedIssuers() { }; SSLContext sc = SSLContext.getInstance("TLS"); - sc.init(null, new TrustManager[] { trustManager }, null); + sc.init(null, new TrustManager[]{trustManager}, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); From aeb0e89f0250c3c8046dbdf9866d6a5b82f39ebf Mon Sep 17 00:00:00 2001 From: Snyk bot Date: Tue, 28 Mar 2023 00:30:03 
+0800 Subject: [PATCH 166/257] fix: pom.xml to reduce vulnerabilities (#1114) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-NETMINIDEV-3369748 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 68bf76d9c..52db5790e 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ 4.4.15 3.7.1 9.3.9.0 - 2.7.0 + 2.8.0 4.13.2 2.7.3 1.2.17 From 9e59b37834f9577e29ed8fc9cdde66ca386836e2 Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Tue, 20 Jun 2023 13:39:45 +0800 Subject: [PATCH 167/257] fix: pom.xml to reduce vulnerabilities (#1119) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMGOOGLEGUAVA-5710356 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 52db5790e..c4c52104e 100644 --- a/pom.xml +++ b/pom.xml @@ -16,7 +16,7 @@ 3.12.0 2.0.19.graal 3.0.13 - 31.1-jre + 32.0.0-jre 2.26 4.5.13 4.4.15 From 58fd08bcf83909fe713f9a5db24d30b8c30a5824 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 00:27:30 +0800 Subject: [PATCH 168/257] Expose Request to ProxyProvider. 
--- .../downloader/HttpClientDownloader.java | 2 +- .../webmagic/proxy/ProxyProvider.java | 19 ++++++++++++++++++- .../webmagic/proxy/SimpleProxyProvider.java | 3 ++- .../proxy/SimpleProxyProviderTest.java | 2 +- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index bfd24f01c..2f3ef58ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -77,7 +77,7 @@ public Page download(Request request, Task task) { } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); - Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; + Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = Page.fail(); try { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 0cef4ed42..8eab4d6de 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** @@ -23,7 +24,23 @@ public interface ProxyProvider { * Get a proxy for task by some strategy. * @param task the download task * @return proxy + * @deprecated Use {@link #getProxy(Request, Task)} instead. 
*/ - Proxy getProxy(Task task); + @Deprecated + default Proxy getProxy(Task task) { + throw new UnsupportedOperationException(); + } + + /** + * Returns a proxy for the request. + * + * @param request the request + * @param task the download task + * @return proxy + * @since 0.9.0 + */ + default Proxy getProxy(Request request, Task task) { + return this.getProxy(task); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index ddef6a88c..f4c3f73bb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import java.util.ArrayList; @@ -44,7 +45,7 @@ public void returnProxy(Proxy proxy, Page page, Task task) { } @Override - public Proxy getProxy(Task task) { + public Proxy getProxy(Request request, Task task) { return proxies.get(incrForLoop()); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java index 6495b16bf..8fda56ea9 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java @@ -20,7 +20,7 @@ public void test_get_proxy() throws Exception { Proxy originProxy1 = new Proxy("127.0.0.1", 1087); Proxy originProxy2 = new Proxy("127.0.0.1", 1088); SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2); - Proxy proxy = proxyProvider.getProxy(TASK); + Proxy proxy = proxyProvider.getProxy(null, TASK); assertThat(proxy).isEqualTo(originProxy1); proxy = 
proxyProvider.getProxy(TASK); assertThat(proxy).isEqualTo(originProxy2); From a5fb4e041476ecd93346b3bc41354d5b29c6ae13 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 10:40:37 +0800 Subject: [PATCH 169/257] Upgrade dependencies. --- pom.xml | 2 +- webmagic-extension/pom.xml | 5 +++++ webmagic-samples/pom.xml | 8 ++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index cdf618211..c81f4c557 100644 --- a/pom.xml +++ b/pom.xml @@ -124,7 +124,7 @@ us.codecraft xsoup - 0.3.6 + 0.3.7 com.alibaba diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index daf0c7fdc..8c7fdb3d2 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -14,6 +14,11 @@ redis.clients jedis + + org.assertj + assertj-core + test + com.google.guava guava diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index e015567c2..9e1623018 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -27,22 +27,22 @@ org.mapdb mapdb - 3.0.8 + 3.0.9 com.fasterxml.jackson.core jackson-core - 2.13.0-rc1 + 2.15.2 com.fasterxml.jackson.core jackson-annotations - 2.13.0-rc1 + 2.15.2 com.fasterxml.jackson.core jackson-databind - 2.13.4.2 + 2.15.2 From ad010927f6acb5c605e4befe269076629d8d4357 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 10:40:46 +0800 Subject: [PATCH 170/257] Fix test. 
--- .../webmagic/proxy/SimpleProxyProviderTest.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java index 8fda56ea9..e9325a7a7 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java @@ -1,6 +1,9 @@ package us.codecraft.webmagic.proxy; import org.junit.Test; +import org.mockito.Mockito; + +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; @@ -20,11 +23,12 @@ public void test_get_proxy() throws Exception { Proxy originProxy1 = new Proxy("127.0.0.1", 1087); Proxy originProxy2 = new Proxy("127.0.0.1", 1088); SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2); - Proxy proxy = proxyProvider.getProxy(null, TASK); + Request request = Mockito.mock(Request.class); + Proxy proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy1); - proxy = proxyProvider.getProxy(TASK); + proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy2); - proxy = proxyProvider.getProxy(TASK); + proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy1); } } From 3688226e327266cb3cfd1a1e3777ad94ad68d6f5 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 11:16:41 +0800 Subject: [PATCH 171/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index c81f4c557..35c0c9bd0 100644 --- 
a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 7fe2ba6ff..b4feb1671 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index c6b70bce1..a0a5ffb48 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.8.1-SNAPSHOT + 0.9.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 8c7fdb3d2..7cf0aa617 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 9e1623018..e42e1fcd8 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 732c23bd0..c5238760b 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index d1225dda2..0019ea3c8 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index a430772b6..63682001f 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 From 6e8d1301132bf258a1dd1317355b9ef0f8316802 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 11:23:44 +0800 Subject: [PATCH 172/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- 
webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 9e4c45077..e35e2a486 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.9.0 + 0.9.1-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index b4feb1671..7f873cc74 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.0 + 0.9.1-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index a0a5ffb48..f8b1ab1d7 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.9.0 + 0.9.1-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 7cf0aa617..d9428a258 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.0 + 0.9.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index e42e1fcd8..10d8d8f6b 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.0 + 0.9.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index c5238760b..0e2fda439 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.0 + 0.9.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 0019ea3c8..3947493ef 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.0 + 0.9.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 63682001f..6577d094f 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 
+3,7 @@ webmagic-parent us.codecraft - 0.9.0 + 0.9.1-SNAPSHOT 4.0.0 From 028f1ed4a45c29a517b30656324f06815ad23654 Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Tue, 22 Aug 2023 13:02:24 +0800 Subject: [PATCH 173/257] fix: pom.xml to reduce vulnerabilities (#1126) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-NETSOURCEFORGEHTMLCLEANER-5710357 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 9e4c45077..12413b85e 100644 --- a/pom.xml +++ b/pom.xml @@ -17,7 +17,7 @@ 2.0.19.graal 3.0.13 32.0.0-jre - 2.26 + 2.29 4.5.13 4.4.15 3.7.1 From 238814acb424619f1b6032d762e83b062c665a15 Mon Sep 17 00:00:00 2001 From: dack-su <144129789+dack-su@users.noreply.github.com> Date: Thu, 7 Sep 2023 07:20:11 +0800 Subject: [PATCH 174/257] update net.sourceforge.htmlcleaner:htmlcleaner 2.26 to 2.29 (#1127) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e35e2a486..8c0678d67 100644 --- a/pom.xml +++ b/pom.xml @@ -17,7 +17,7 @@ 2.0.19.graal 3.0.13 32.0.0-jre - 2.26 + 2.29 4.5.13 4.4.15 3.7.1 From 0b62461b7a8671caa01070cc422768eb959e4440 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 10 Sep 2023 16:25:10 +0800 Subject: [PATCH 175/257] Upgrade mapdb from 3.0.9 to 3.0.10. 
--- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 10d8d8f6b..a0db091f3 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -27,7 +27,7 @@ org.mapdb mapdb - 3.0.9 + 3.0.10 com.fasterxml.jackson.core From 07e83a421b939bd711a09d0a9918a968c592eb8f Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 10 Sep 2023 16:35:32 +0800 Subject: [PATCH 176/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 8c0678d67..479959a7d 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.9.1-SNAPSHOT + 0.9.1 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 7f873cc74..983d309b1 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.1-SNAPSHOT + 0.9.1 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index f8b1ab1d7..21fa00128 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.9.1-SNAPSHOT + 0.9.1 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index d9428a258..008d00443 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.1-SNAPSHOT + 0.9.1 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a0db091f3..783a5e9ea 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1-SNAPSHOT + 0.9.1 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 0e2fda439..6982bc22e 
100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1-SNAPSHOT + 0.9.1 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3947493ef..30984e39d 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1-SNAPSHOT + 0.9.1 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 6577d094f..489bbbc95 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1-SNAPSHOT + 0.9.1 4.0.0 From 8c008563ff1d89b3f327afa12deaf2f8abbe7202 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 10 Sep 2023 17:06:55 +0800 Subject: [PATCH 177/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 479959a7d..6fea73494 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 983d309b1..9838e1f5f 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 21fa00128..2c5732c6a 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.9.2-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 008d00443..94178bf8f 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git 
a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 783a5e9ea..57b9cbac0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 6982bc22e..138c050cb 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 30984e39d..14e495504 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 489bbbc95..c37cbe3de 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 From c0d38a6f1a8406bd0723838a8742db118e4f6463 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 23 Sep 2023 12:04:56 +0800 Subject: [PATCH 178/257] Upgrade maven-fluido-skin from 1.9 to 1.11.1. --- src/site/site.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/site/site.xml b/src/site/site.xml index d2d5caacd..b78651960 100644 --- a/src/site/site.xml +++ b/src/site/site.xml @@ -5,7 +5,7 @@ org.apache.maven.skins maven-fluido-skin - 1.9 + 1.11.1 From 73f60f809e30d56dec130811407814595a09a103 Mon Sep 17 00:00:00 2001 From: Maciej Walkowiak Date: Tue, 24 Oct 2023 01:50:14 +0200 Subject: [PATCH 179/257] Fix typos (#1131) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 750a76841..89536c927 100644 --- a/README.md +++ b/README.md @@ -118,9 +118,9 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) There are more examples in `webmagic-samples` package. 
-### Lisence: +### License: -Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) +Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0) ### Thanks: From eda3be9432663951f42a96bf790987bb7dd6c530 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 6 Nov 2023 22:44:53 +0800 Subject: [PATCH 180/257] Fix log format. --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 2f3ef58ed..92c770236 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -85,13 +85,13 @@ public Page download(Request request, Task task) { page = handleResponse(request, request.getCharset() != null ? 
request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(request, task); - logger.info("downloading page success {}", request.getUrl()); + logger.info("download page success {}", request.getUrl()); return page; } catch (IOException e) { onError(request, task, e); - logger.info("download page {} error", request.getUrl(), e); + logger.info("download page error {}", request.getUrl(), e); return page; } finally { From 19288e9c11551e7b0e3a2533183942405f0fa521 Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Fri, 17 Nov 2023 22:12:43 +0800 Subject: [PATCH 181/257] fix: pom.xml to reduce vulnerabilities (#1134) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGSELENIUMHQSELENIUM-6062318 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 479959a7d..cf7d81612 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,7 @@ 1.3.0 1.2.0 11.4 - 3.141.59 + 4.14.1 2.0.4 4.0.0.RELEASE 0.3.5 From 67644de3d9540611ef494f4bb595688a47a541a6 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 20 Nov 2023 18:26:45 +0800 Subject: [PATCH 182/257] Expose Page to onSuccess & onError. 
--- .../main/java/us/codecraft/webmagic/Page.java | 21 ++++++++++- .../downloader/AbstractDownloader.java | 36 +++++++++++++++++++ .../downloader/HttpClientDownloader.java | 10 +++--- .../downloader/PhantomJSDownloader.java | 6 ++-- .../selenium/SeleniumDownloader.java | 6 ++-- 5 files changed, 67 insertions(+), 12 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 6370171df..e48d4cb00 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -56,8 +56,27 @@ public class Page { public Page() { } - public static Page fail(){ + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}. + * + * @return the page. + * @deprecated Use {@link #fail(Request)} instead. + */ + @Deprecated + public static Page fail() { + return fail(null); + } + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, + * and {@link #request} is specified. + * + * @return the page. + * @since 0.10.0 + */ + public static Page fail(Request request){ Page page = new Page(); + page.setRequest(request); page.setDownloadSuccess(false); return page; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index ea3bbc590..6a400e321 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -36,26 +36,62 @@ public Html download(String url, String charset) { return (Html) page.getHtml(); } + /** + * @param request the {@link Request}. + * @deprecated Use {@link #onSuccess(Page, Task)} instead. + */ @Deprecated protected void onSuccess(Request request) { } /** + * @param request the {@link Request}. 
+ * @param task the {@link Task}. * @since 0.7.6 + * @deprecated Use {@link #onSuccess(Page, Task)} instead. */ + @Deprecated protected void onSuccess(Request request, Task task) { this.onSuccess(request); } + /** + * @param page the {@link Page}. + * @param task the {@link Task}. + * @since 0.10.0 + */ + protected void onSuccess(Page page, Task task) { + this.onSuccess(page.getRequest(), task); + } + + /** + * @param request the {@link Request}. + * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. + */ @Deprecated protected void onError(Request request) { } /** + * @param request the {@link Request}. + * @param task the {@link Task}. + * @param e the exception. * @since 0.7.6 + * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. */ + @Deprecated protected void onError(Request request, Task task, Throwable e) { this.onError(request); } + /** + * @param page the {@link Page}. + * @param task the {@link Task}. + * @param e the exception. + * @since 0.10.0 + */ + protected void onError(Page page, Task task, Throwable e) { + this.onError(page.getRequest(), task, e); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 92c770236..80e7b72c9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -79,19 +79,19 @@ public Page download(Request request, Task task) { CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? 
proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.fail(); + Page page = Page.fail(request); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); - onSuccess(request, task); - logger.info("download page success {}", request.getUrl()); + onSuccess(page, task); + logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { - onError(request, task, e); - logger.info("download page error {}", request.getUrl(), e); + onError(page, task, e); + logger.info("download page {} error", request.getUrl(), e); return page; } finally { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 4f1eee8e6..31dfca75a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -88,7 +88,7 @@ public Page download(Request request, Task task) { logger.info("downloading page: " + request.getUrl()); } - Page page = Page.fail(); + Page page = Page.fail(request); try { String content = getPage(request); if (!content.contains("HTTP request failed")) { @@ -98,9 +98,9 @@ public Page download(Request request, Task task) { page.setRequest(request); page.setStatusCode(200); } - onSuccess(request, task); + onSuccess(page, task); } catch (Exception e) { - onError(request, task, e); + onError(page, task, e); logger.warn("download page {} error", request.getUrl(), e); } return page; diff --git 
a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 39b3bc914..874f8aef7 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -74,7 +74,7 @@ public SeleniumDownloader setSleepTime(int sleepTime) { public Page download(Request request, Task task) { checkInit(); WebDriver webDriver = null; - Page page = Page.fail(); + Page page = Page.fail(request); try { webDriver = webDriverPool.get(); @@ -111,10 +111,10 @@ public Page download(Request request, Task task) { page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - onSuccess(request, task); + onSuccess(page, task); } catch (Exception e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, task, e); + onError(page, task, e); } finally { if (webDriver != null) { webDriverPool.returnToPool(webDriver); From 622ed5a17f98ee1625222452096741ebe16dfe85 Mon Sep 17 00:00:00 2001 From: Harikrishna Date: Fri, 24 Nov 2023 10:07:04 +0530 Subject: [PATCH 183/257] Refactor compareLong method using Long.compare, corrected the local variable name (#1136) --- .../java/us/codecraft/webmagic/utils/NumberUtils.java | 8 +------- .../java/us/codecraft/webmagic/utils/WMCollections.java | 6 +++--- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java index 55e185105..fbeb8ed3b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java @@ -6,12 +6,6 @@ public abstract class 
NumberUtils { public static int compareLong(long o1, long o2) { - if (o1 < o2) { - return -1; - } else if (o1 == o2) { - return 0; - } else { - return 1; - } + return Long.compare(o1, o2); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java index 23e1644ce..a2ca5afd0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java @@ -21,10 +21,10 @@ public static Set newHashSet(T... t){ } public static List newArrayList(T... t){ - List set = new ArrayList(t.length); + List list = new ArrayList(t.length); for (T t1 : t) { - set.add(t1); + list.add(t1); } - return set; + return list; } } From a9111040763f1c078e67e5d4d2434fce9992ed5a Mon Sep 17 00:00:00 2001 From: Harikrishna Date: Fri, 24 Nov 2023 17:39:32 +0530 Subject: [PATCH 184/257] Refactored to remove multiple calls of getSourceTexts() api (#1137) --- .../webmagic/selector/AbstractSelectable.java | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index 8775af108..1fb35f1a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.List; + import org.apache.commons.collections4.CollectionUtils; /** @@ -55,11 +56,12 @@ public Selectable jsonPath(String jsonPath) { @Override public String get() { - if (CollectionUtils.isNotEmpty(all())) { - return all().get(0); - } else { - return null; - } + List sourceTexts = all(); + if (CollectionUtils.isNotEmpty(sourceTexts)) { + return sourceTexts.get(0); + } + return null; 
+ } @Override @@ -91,8 +93,9 @@ public Selectable replace(String regex, String replacement) { } public String getFirstSourceText() { - if (getSourceTexts() != null && getSourceTexts().size() > 0) { - return getSourceTexts().get(0); + List sourceTexts = getSourceTexts(); + if (CollectionUtils.isNotEmpty(sourceTexts)) { + return sourceTexts.get(0); } return null; } @@ -104,6 +107,6 @@ public String toString() { @Override public boolean match() { - return getSourceTexts() != null && getSourceTexts().size() > 0; + return CollectionUtils.isNotEmpty(getSourceTexts()); } } From 7c20290ce4be0c642e9bd02edb82d235e39b761c Mon Sep 17 00:00:00 2001 From: Harikrishna Date: Sun, 26 Nov 2023 08:26:06 +0530 Subject: [PATCH 185/257] Refactor addTargetRequests method to eliminate redundant code. (#1138) --- .../main/java/us/codecraft/webmagic/Page.java | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index e48d4cb00..17f8b03dd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -49,7 +49,7 @@ public class Page { private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; @@ -142,13 +142,7 @@ public List getTargetRequests() { * @param requests requests */ public void addTargetRequests(Iterable requests) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s)); - } + addTargetRequests(requests, 0); // Default priority is 0 } /** @@ -158,13 +152,32 @@ public void addTargetRequests(Iterable requests) { * @param priority priority */ public void addTargetRequests(Iterable requests, long priority) { - 
for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s).setPriority(priority)); + if(requests == null) { + return; + } + + for (String req : requests) { + addRequestIfValid(req, priority); + } + } + + /** + * Helper method to add a request if it's valid. + * + * @param url URL to add + * @param priority Priority for the URL + */ + private void addRequestIfValid(String url, long priority) { + if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { + return; + } + + String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); + Request req = new Request(canonicalizedUrl); + if(priority > 0) { + req.setPriority(priority); } + targetRequests.add(req); } /** From 73dd2ebbac6f7712c59155027906f7441d693935 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 5 Dec 2023 12:28:05 +0800 Subject: [PATCH 186/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 6fea73494..700d5c426 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 9838e1f5f..021a83f3e 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 2c5732c6a..4109c49fc 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.9.2-SNAPSHOT + 0.10.0 webmagic-coverage diff --git 
a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 94178bf8f..b47ae3614 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 57b9cbac0..08e70c161 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 138c050cb..4a2b358d0 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 14e495504..92914655a 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index c37cbe3de..5c2e50b2a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 From 1e5c6488ff792456c9abd17f048e30d14cc5ae5b Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 5 Dec 2023 12:48:14 +0800 Subject: [PATCH 187/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 5f1bdf901..c90394a30 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 021a83f3e..a6eff4063 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft 
webmagic-parent - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 4109c49fc..f9a2f50c8 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.10.0 + 0.10.1-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index b47ae3614..e68385967 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 08e70c161..7361216f2 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 4a2b358d0..bff1de3f6 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 92914655a..c81c5613b 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 5c2e50b2a..8381c0275 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1-SNAPSHOT 4.0.0 From 7ededbea1a3b040c4429293e10a30996ccf9caf0 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 5 Dec 2023 12:56:14 +0800 Subject: [PATCH 188/257] Fix javadoc. 
--- webmagic-core/src/main/java/us/codecraft/webmagic/Page.java | 1 + 1 file changed, 1 insertion(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 17f8b03dd..b4c161a9a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -71,6 +71,7 @@ public static Page fail() { * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, * and {@link #request} is specified. * + * @param request the {@link Request}. * @return the page. * @since 0.10.0 */ From 4281f82352505021a36fab064a1bfff9bbb55c0d Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Thu, 11 Jan 2024 08:53:43 +0800 Subject: [PATCH 189/257] fix: webmagic-samples/pom.xml to reduce vulnerabilities (#1142) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGJETBRAINSKOTLIN-2393744 - https://snyk.io/vuln/SNYK-JAVA-ORGJETBRAINSKOTLIN-2628385 Co-authored-by: snyk-bot --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 08e70c161..906606fba 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -27,7 +27,7 @@ org.mapdb mapdb - 3.0.10 + 3.1.0 com.fasterxml.jackson.core From 65fd8f3779580f925e0ec54e1d8037859a6ce303 Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Mon, 22 Jan 2024 12:50:01 +0800 Subject: [PATCH 190/257] fix: pom.xml to reduce vulnerabilities (#1146) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMJAYWAYJSONPATH-6140361 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 5f1bdf901..2fb56e7be 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ 4.4.15 3.7.1 9.3.9.0 - 2.8.0 + 2.9.0 4.13.2 2.7.3 1.2.17 From 95d1f4415039942d8c6799d172d710afb2102dd2 
Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 3 Mar 2024 18:33:00 +0800 Subject: [PATCH 191/257] Optimize Request#extras, fix #1148. --- .../java/us/codecraft/webmagic/Request.java | 18 +++++------ .../us/codecraft/webmagic/RequestTest.java | 32 +++++++++++++++++-- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9fc286192..a59b20637 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,13 +1,14 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.downloader.Downloader; -import us.codecraft.webmagic.model.HttpRequestBody; -import us.codecraft.webmagic.utils.Experimental; - import java.io.Serializable; +import java.util.Collections; import java.util.HashMap; import java.util.Map; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.model.HttpRequestBody; +import us.codecraft.webmagic.utils.Experimental; + /** * Object contains url to crawl.
* It contains some additional information.
@@ -35,7 +36,7 @@ public class Request implements Serializable { /** * Store additional information in extras. */ - private Map extras; + private Map extras = new HashMap<>(); /** * cookies for current url, if not set use Site's cookies @@ -93,9 +94,6 @@ public T getExtra(String key) { } public Request putExtra(String key, T value) { - if (extras == null) { - extras = new HashMap(); - } extras.put(key, value); return this; } @@ -105,11 +103,11 @@ public String getUrl() { } public Map getExtras() { - return extras; + return Collections.unmodifiableMap(extras); } public Request setExtras(Map extras) { - this.extras = extras; + this.extras.putAll(extras); return this; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java index c7e4943d9..b8f699a6f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java @@ -1,9 +1,13 @@ package us.codecraft.webmagic; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Map; + import org.junit.Test; -import us.codecraft.webmagic.utils.HttpConstant; -import static org.assertj.core.api.Assertions.assertThat; +import us.codecraft.webmagic.utils.HttpConstant; /** * @author code4crafter@gmail.com @@ -22,4 +26,28 @@ public void testEqualsAndHashCode() throws Exception { assertThat(requestA).isNotEqualTo(requestB); assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode()); } + + @Test + public void testSetExtras() { + Request request = new Request(); + Map extras = Collections.singletonMap("a", "1"); + request.setExtras(extras); + request.putExtra("b", "2"); + assertThat(request.getExtra("a")).isEqualTo("1"); + assertThat(request.getExtra("b")).isEqualTo("2"); + } + + @Test + public void testGetExtras() { + Request request = new Request(); + request.putExtra("a", "1"); + 
assertThat(request.getExtras()).containsEntry("a", "1"); + } + + @Test(expected = UnsupportedOperationException.class) + public void testGetExtrasShouldBeUnmodifiable() { + Request request = new Request(); + request.getExtras().put("a", "1"); + } + } From e4ab6e27e4fd127d1feeea862a7ff10eb91c2ae7 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 3 Mar 2024 18:35:25 +0800 Subject: [PATCH 192/257] Optimize Request#extras, refs #1148. --- .../us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java | 2 +- .../java/us/codecraft/webmagic/scheduler/RedisScheduler.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java index 46d47e5a5..7abe5bfad 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java @@ -102,7 +102,7 @@ private String getZsetMinusPriorityKey(Task task) { } private void setExtrasInItem(Jedis jedis,Request request, Task task) { - if (request.getExtras() != null) { + if (!request.getExtras().isEmpty()) { String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset(getItemKey(task), field, value); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 19e831321..8d61bea3b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -84,7 +84,7 @@ private boolean checkForAdditionalInfo(Request request) { return true; } - if (request.getExtras() != null && !request.getExtras().isEmpty()) { + if 
(!request.getExtras().isEmpty()) { return true; } if (request.getPriority() != 0L) { From 22a60df6aa06d8c73642b2d4c9f839d74bbb7f0f Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 3 Mar 2024 20:02:09 +0800 Subject: [PATCH 193/257] Fix build for selenium upgrading from 3.141.59 to 4.14.1, refs #1134. --- .../downloader/selenium/WebDriverPool.java | 33 ++++++++++--------- .../webmagic/downloader/SeleniumTest.java | 13 ++++---- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index e1d9dd039..b96d2894b 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,15 +1,5 @@ package us.codecraft.webmagic.downloader.selenium; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.firefox.FirefoxDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriverService; -import org.openqa.selenium.remote.DesiredCapabilities; -import org.openqa.selenium.remote.RemoteWebDriver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.FileReader; import java.io.IOException; import java.net.MalformedURLException; @@ -22,6 +12,18 @@ import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.firefox.FirefoxOptions; +import org.openqa.selenium.phantomjs.PhantomJSDriver; +import org.openqa.selenium.phantomjs.PhantomJSDriverService; +import 
org.openqa.selenium.remote.DesiredCapabilities; +import org.openqa.selenium.remote.RemoteWebDriver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * @author code4crafter@gmail.com
* Date: 13-7-26
@@ -58,7 +60,7 @@ class WebDriverPool { * Configure the GhostDriver, and initialize a WebDriver instance. This part * of code comes from GhostDriver. * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver - * + * * @author bob.li.0718@gmail.com * @throws IOException */ @@ -73,7 +75,6 @@ public void configure() throws IOException { // Prepare capabilities sCaps = new DesiredCapabilities(); - sCaps.setJavascriptEnabled(true); sCaps.setCapability("takesScreenshot", false); String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); @@ -134,9 +135,9 @@ public void configure() throws IOException { sCaps.setBrowserName("phantomjs"); mDriver = new RemoteWebDriver(new URL(driver), sCaps); } else if (driver.equals(DRIVER_FIREFOX)) { - mDriver = new FirefoxDriver(sCaps); + mDriver = new FirefoxDriver(new FirefoxOptions(sCaps)); } else if (driver.equals(DRIVER_CHROME)) { - mDriver = new ChromeDriver(sCaps); + mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps)); } else if (driver.equals(DRIVER_PHANTOMJS)) { mDriver = new PhantomJSDriver(sCaps); } @@ -144,7 +145,7 @@ public void configure() throws IOException { /** * check whether input is a valid URL - * + * * @author bob.li.0718@gmail.com * @param urlString urlString * @return true means yes, otherwise no. 
@@ -178,7 +179,7 @@ public WebDriverPool() { } /** - * + * * @return * @throws InterruptedException */ diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java index b7bcd80b3..43ac84b5a 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java @@ -1,17 +1,18 @@ package us.codecraft.webmagic.downloader; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.remote.DesiredCapabilities; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - /** * @author code4crafter@gmail.com
* Date: 13-7-26
@@ -29,10 +30,10 @@ public void testSelenium() { Map preferences = new HashMap(); preferences.put("profile.default_content_settings", contentSettings); - DesiredCapabilities caps = DesiredCapabilities.chrome(); + DesiredCapabilities caps = new DesiredCapabilities(); caps.setCapability("chrome.prefs", preferences); caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); - WebDriver webDriver = new ChromeDriver(caps); + WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps)); webDriver.get("http://huaban.com/"); WebElement webElement = webDriver.findElement(By.xpath("/html")); System.out.println(webElement.getAttribute("outerHTML")); From 7f8607b88130daf1814abf8a6792d6433d23bf57 Mon Sep 17 00:00:00 2001 From: Ch3n4y Date: Thu, 7 Mar 2024 08:41:26 +0800 Subject: [PATCH 194/257] update com.fasterxml.jackson.core:jackson-databind 2.15.2 to 2.16.0 (#1149) --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 7361216f2..ad7fae4ce 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -42,7 +42,7 @@ com.fasterxml.jackson.core jackson-databind - 2.15.2 + 2.16.0 From 80842d72db35b22d93e34d7773251c9bec9a9de9 Mon Sep 17 00:00:00 2001 From: ayushi250317 <157420261+ayushi250317@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:22:30 -0300 Subject: [PATCH 195/257] Added test cases for improving line and branch coverage (#1150) * Initial Commit * Assignment 1 Submission --- .../java/us/codecraft/webmagic/SiteTest.java | 23 ++++++++ .../downloader/HttpClientDownloaderTest.java | 9 +++ .../webmagic/selector/AndSelectorTest.java | 59 +++++++++++++++++++ .../webmagic/selector/CssSelectorTest.java | 39 ++++++++++++ .../webmagic/selector/OrSelectorTest.java | 44 ++++++++++++++ 5 files changed, 174 insertions(+) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java 
create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java index 783b82ddc..47c4fcc14 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -1,8 +1,12 @@ package us.codecraft.webmagic; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import org.junit.Test; @@ -14,4 +18,23 @@ public void test() { assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); } + @Test + public void addCookieTest(){ + Site site=Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + site.addCookie("cookieDefault","cookie-webmagicDefault"); + String firstDomain="example.com"; + String secondDomain="exampleCopy.com"; + site.addCookie(firstDomain, "cookie", "cookie-webmagic"); + site.addCookie(firstDomain, "cookieCopy", "cookie-webmagicCopy"); + site.addCookie(secondDomain, "cookie", "cookie-webmagic"); + Map> allCookies = site.getAllCookies(); + List domains=new ArrayList<>(); + for(String key : allCookies.keySet()){ + domains.add(key); + } + assertEquals("cookie-webmagic", allCookies.get(firstDomain).get("cookie")); + assertEquals("cookie-webmagicCopy", allCookies.get(firstDomain).get("cookieCopy")); + assertEquals("cookie-webmagic", allCookies.get(secondDomain).get("cookie")); + assertEquals(2, domains.size()); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 780ca7529..1ff7b4dd7 100644 
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -40,6 +40,7 @@ import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; /** @@ -333,5 +334,13 @@ public void run() throws Exception { }); } + @Test + public void test_no_task_download(){ + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423/"); + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + assertThrows(NullPointerException.class, () -> httpClientDownloader.download(request,null)); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java new file mode 100644 index 000000000..59885ebd1 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic.selector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +public class AndSelectorTest { + + @Test + public void testSelectList() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + List selectors = new ArrayList(); + selectors.add(new CssSelector("div")); + selectors.add(new XpathSelector("//div[@class='item1']")); + AndSelector andSelector = new AndSelector(selectors); + List result = andSelector.selectList(htmlContent); + assertEquals("
\n Item 1\n
", result.get(0)); + } + + @Test + public void testSelectList_NoResults() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + List selectors = new ArrayList(); + selectors.add(new CssSelector("div")); + selectors.add(new XpathSelector("//div[@class='item']")); + AndSelector andSelector = new AndSelector(selectors); + List result = andSelector.selectList(htmlContent); + assertEquals(0, result.size()); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java new file mode 100644 index 000000000..8b1ace903 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.runners.MockitoJUnitRunner; + +import java.util.List; +import static org.junit.Assert.*; + +public class CssSelectorTest { + + @Test + public void testSelectElement() { + CssSelector cssSelector = new CssSelector("div"); + String htmlContent = "Dummy Page
Hello World!
"; + Document doc = Jsoup.parse(htmlContent); + Element dummyElement = doc.getElementById("dummyDiv"); + Element resultElement = cssSelector.selectElement(dummyElement); + assertNotNull(resultElement); + } + + @Test + public void testSelectList() { + CssSelector cssSelector = new CssSelector("div"); + String htmlContent = "Dummy Page
Hello World!
"; + Document doc = Jsoup.parse(htmlContent); + Element dummyElement = doc.getElementById("dummyDiv"); + List result = cssSelector.selectList(dummyElement); + assertEquals(1, result.size()); + assertEquals("[
\n Hello World!\n
]", result.toString()); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java new file mode 100644 index 000000000..24d87647c --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.selector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +public class OrSelectorTest { + @Test + public void testSelectList() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + String expectedResult = "[\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + ",
\n" + + " Item 1\n" + + "
,
\n" + + " Item 2\n" + + "
]"; + List selectors = new ArrayList(); + selectors.add(new CssSelector("head")); + selectors.add(new XpathSelector("//div[@class='item1']")); + selectors.add(new XpathSelector("//div[@class='item2']")); + OrSelector orSelector = new OrSelector(selectors); + List result = orSelector.selectList(htmlContent); + assertEquals(expectedResult, result.toString()); + } +} From 28ac8bf9c433b492fca5f241fa205674285ad87d Mon Sep 17 00:00:00 2001 From: ayushi250317 <157420261+ayushi250317@users.noreply.github.com> Date: Thu, 28 Mar 2024 13:45:12 -0300 Subject: [PATCH 196/257] Refactored Code to Resolve Implementation Code Smells (#1151) * Initial Commit * Assignment 1 Submission * Resolving Implementation Smells --- .../downloader/HttpUriRequestConverter.java | 4 +- .../webmagic/model/PageModelExtractor.java | 104 ++++++++++-------- 2 files changed, 60 insertions(+), 48 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 4baaf4a4a..168467866 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -42,7 +42,9 @@ private HttpClientContext convertHttpClientContext(Request request, Site site, P HttpClientContext httpContext = new HttpClientContext(); if (proxy != null && proxy.getUsername() != null) { AuthState authState = new AuthState(); - authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); + BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY); + UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()); + authState.update(proxyAuthScheme, proxyCredentials); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, 
authState); } if (request.getCookies() != null && !request.getCookies().isEmpty()) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 1e25a46c0..d8947ded6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -234,63 +234,23 @@ private Object processSingle(Page page, String html, boolean isRaw) { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { if (fieldExtractor.isMulti()) { - List value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().selectList(html); - } - break; - case Url: - value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().selectList(html); - } + List value=getMultiValueFromSource(page, fieldExtractor, html, isRaw); if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { return null; } if (fieldExtractor.getObjectFormatter() != null) { - List converted = convert(value, fieldExtractor.getObjectFormatter()); + List converted = convertMultiValue(value, fieldExtractor.getObjectFormatter()); setField(o, fieldExtractor, converted); } else { setField(o, fieldExtractor, value); } } else { - String value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = 
page.getHtml().selectDocument(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().select(html); - } - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().select(html); - } + String value=getSingleValueFromSource(page, fieldExtractor, html, isRaw); if (value == null && fieldExtractor.isNotNull()) { return null; } if (fieldExtractor.getObjectFormatter() != null) { - Object converted = convert(value, fieldExtractor.getObjectFormatter()); + Object converted = convertSingleValue(value, fieldExtractor.getObjectFormatter()); if (converted == null && fieldExtractor.isNotNull()) { return null; } @@ -313,7 +273,57 @@ private Object processSingle(Page page, String html, boolean isRaw) { return o; } - private Object convert(String value, ObjectFormatter objectFormatter) { + private List getMultiValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { + List value; + switch (fieldExtractor.getSource()) { + case RawHtml: + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + break; + case Html: + if (isRaw) { + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().selectList(html); + } + break; + case Url: + value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); + break; + case RawText: + value = fieldExtractor.getSelector().selectList(page.getRawText()); + break; + default: + value = fieldExtractor.getSelector().selectList(html); + } + return value; + } + + private String getSingleValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { + String value; + switch (fieldExtractor.getSource()) { + case RawHtml: + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); + break; + 
case Html: + if (isRaw) { + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().select(html); + } + break; + case Url: + value = fieldExtractor.getSelector().select(page.getUrl().toString()); + break; + case RawText: + value = fieldExtractor.getSelector().select(page.getRawText()); + break; + default: + value = fieldExtractor.getSelector().select(html); + } + return value; + } + + private Object convertSingleValue(String value, ObjectFormatter objectFormatter) { try { Object format = objectFormatter.format(value); logger.debug("String {} is converted to {}", value, format); @@ -324,10 +334,10 @@ private Object convert(String value, ObjectFormatter objectFormatter) { return null; } - private List convert(List values, ObjectFormatter objectFormatter) { + private List convertMultiValue(List values, ObjectFormatter objectFormatter) { List objects = new ArrayList(); for (String value : values) { - Object converted = convert(value, objectFormatter); + Object converted = convertSingleValue(value, objectFormatter); if (converted != null) { objects.add(converted); } From 9b9f173c1c356d2f2c9ca1c33339e459f37501c5 Mon Sep 17 00:00:00 2001 From: ayushi250317 <157420261+ayushi250317@users.noreply.github.com> Date: Sat, 30 Mar 2024 03:26:41 -0300 Subject: [PATCH 197/257] Refactored Code to increase maintainability (#1152) * Initial Commit * Assignment 1 Submission * Resolving Implementation Smells * Refactoring Code to increase maintainability --- .../java/us/codecraft/webmagic/Spider.java | 72 ++++------------ .../codecraft/webmagic/SpiderScheduler.java | 59 +++++++++++++ .../codecraft/webmagic/selector/HtmlNode.java | 1 - .../webmagic/selector/PlainText.java | 5 -- .../webmagic/selector/Selectable.java | 8 -- .../model/formatter/BasicClassDetector.java | 85 +++++++++++++++++++ .../model/formatter/BasicTypeFormatter.java | 31 +++---- 7 files changed, 174 insertions(+), 87 deletions(-) create mode 100644 
webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 9f9201ee3..11a671f7a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,11 +9,8 @@ import java.util.List; import java.util.UUID; import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; @@ -75,9 +72,9 @@ public class Spider implements Runnable, Task { protected Site site; protected String uuid; - - protected Scheduler scheduler = new QueueScheduler(); - + + protected SpiderScheduler scheduler; + protected Logger logger = LoggerFactory.getLogger(getClass()); protected CountableThreadPool threadPool; @@ -100,10 +97,6 @@ public class Spider implements Runnable, Task { protected boolean destroyWhenExit = true; - private ReentrantLock newUrlLock = new ReentrantLock(); - - private Condition newUrlCondition = newUrlLock.newCondition(); - private List spiderListeners; private final AtomicLong pageCount = new AtomicLong(0); @@ -131,6 +124,7 @@ public static Spider create(PageProcessor pageProcessor) { public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); + this.scheduler = new SpiderScheduler(new QueueScheduler()); } /** @@ -186,15 +180,15 @@ public Spider scheduler(Scheduler scheduler) { /** * set scheduler for Spider * - * @param scheduler 
scheduler + * @param updateScheduler scheduler * @return this * @see Scheduler * @since 0.2.1 */ - public Spider setScheduler(Scheduler scheduler) { + public Spider setScheduler(Scheduler updateScheduler) { checkIfRunning(); - Scheduler oldScheduler = this.scheduler; - this.scheduler = scheduler; + SpiderScheduler oldScheduler = this.scheduler; + scheduler.setScheduler(updateScheduler); if (oldScheduler != null) { Request request; while ((request = oldScheduler.poll(this)) != null) { @@ -213,7 +207,7 @@ public Spider setScheduler(Scheduler scheduler) { * @deprecated */ @Deprecated - public Spider pipeline(Pipeline pipeline) { + public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); } @@ -264,7 +258,7 @@ public Spider clearPipeline() { * @deprecated */ @Deprecated - public Spider downloader(Downloader downloader) { + public Spider downloader(Downloader downloader) { return setDownloader(downloader); } @@ -333,10 +327,10 @@ public void run() { } } else { // wait until new url added, - if (waitNewUrl()) { - //if interrupted + if (scheduler.waitNewUrl(threadPool, emptySleepTime)) { + // if interrupted break; - } + } continue; } } @@ -353,7 +347,7 @@ public void run() { logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); - signalNewUrl(); + scheduler.signalNewUrl(); } } }); @@ -536,7 +530,7 @@ public Spider addUrl(String... urls) { for (String url : urls) { addRequest(new Request(url)); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } @@ -588,42 +582,10 @@ public Spider addRequest(Request... 
requests) { for (Request request : requests) { addRequest(request); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } - /** - * - * @return isInterrupted - */ - private boolean waitNewUrl() { - // now there may not be any thread live - newUrlLock.lock(); - try { - //double check,unnecessary, unless very fast concurrent - if (threadPool.getThreadAlive() == 0) { - return false; - } - //wait for amount of time - newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); - return false; - } catch (InterruptedException e) { - // logger.warn("waitNewUrl - interrupted, error {}", e); - return true; - } finally { - newUrlLock.unlock(); - } - } - - private void signalNewUrl() { - try { - newUrlLock.lock(); - newUrlCondition.signalAll(); - } finally { - newUrlLock.unlock(); - } - } - public void start() { runAsync(); } @@ -799,7 +761,7 @@ public Date getStartTime() { } public Scheduler getScheduler() { - return scheduler; + return scheduler.getScheduler(); } /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java new file mode 100644 index 000000000..1005bac88 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + +import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.thread.CountableThreadPool; + +public class SpiderScheduler { + private Scheduler scheduler; + private final ReentrantLock newUrlLock = new ReentrantLock(); + private final Condition newUrlCondition = newUrlLock.newCondition(); + + public SpiderScheduler(Scheduler scheduler) { + this.scheduler = scheduler; + } + + public Scheduler getScheduler() { + return scheduler; + } + + public void setScheduler(Scheduler scheduler) { + this.scheduler = 
scheduler; + } + + public Request poll(Spider spider) { + return scheduler.poll(spider); + } + + public void push(Request request, Spider spider) { + scheduler.push(request, spider); + } + + public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) { + newUrlLock.lock(); + try { + if (threadPool.getThreadAlive() == 0) { + return false; + } + newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; + } catch (InterruptedException e) { + return true; + } finally { + newUrlLock.unlock(); + } + } + + public void signalNewUrl() { + try { + newUrlLock.lock(); + newUrlCondition.signalAll(); + } finally { + newUrlLock.unlock(); + } + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index c063b4825..85ff5fa69 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -26,7 +26,6 @@ protected List getElements() { return elements; } - @Override public Selectable smartContent() { SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, getSourceTexts()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index c78f6791b..18258e9a7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -42,11 +42,6 @@ public Selectable xpath(String xpath) { throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } - @Override - public Selectable smartContent() { - throw new UnsupportedOperationException("Smart content can not apply to plain text. 
Please check whether you use a previous xpath with attribute select (/@href etc)."); - } - @Override public Selectable links() { throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 9412cfce4..a4d5fdb94 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -51,14 +51,6 @@ public interface Selectable { * @return new Selectable after extract */ public Selectable css(String selector, String attrName); - - /** - * select smart content with ReadAbility algorithm - * - * @return content - */ - public Selectable smartContent(); - /** * select all links * diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java new file mode 100644 index 000000000..f03b8864a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java @@ -0,0 +1,85 @@ +package us.codecraft.webmagic.model.formatter; + +public interface BasicClassDetector { + Class detectBasicClass(Class type); +} + +class IntegerClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { + return Integer.class; + } + return null; + } +} + +class LongClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Long.TYPE) || type.equals(Long.class)) { + return Long.class; + } + return null; + } +} + +class DoubleClassDetector implements BasicClassDetector { + @Override + 
public Class detectBasicClass(Class type) { + if (type.equals(Double.TYPE) || type.equals(Double.class)) { + return Double.class; + } + return null; + } +} + +class FloatClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Float.TYPE) || type.equals(Float.class)) { + return Float.class; + } + return null; + } +} + +class ShortClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Short.TYPE) || type.equals(Short.class)) { + return Short.class; + } + return null; + } +} + +class CharacterClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Character.TYPE) || type.equals(Character.class)) { + return Character.class; + } + return null; + } +} + +class ByteClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { + return Byte.class; + } + return null; + } +} + +class BooleanClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { + return Boolean.class; + } + return null; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java index f9d76a845..2d4d85b0a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -24,28 +24,24 @@ public T format(String raw) throws Exception { } protected abstract T formatTrimmed(String raw) throws Exception; - public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, LongFormatter.class, 
DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class); + public static final List basicClassDetector= Arrays.asList(new IntegerClassDetector(), + new LongClassDetector(), + new FloatClassDetector(), + new DoubleClassDetector(), + new ShortClassDetector(), + new ByteClassDetector(), + new BooleanClassDetector(), + new CharacterClassDetector()); public static Class detectBasicClass(Class type) { - if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { - return Integer.class; - } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { - return Long.class; - } else if (type.equals(Double.TYPE) || type.equals(Double.class)) { - return Double.class; - } else if (type.equals(Float.TYPE) || type.equals(Float.class)) { - return Float.class; - } else if (type.equals(Short.TYPE) || type.equals(Short.class)) { - return Short.class; - } else if (type.equals(Character.TYPE) || type.equals(Character.class)) { - return Character.class; - } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { - return Byte.class; - } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { - return Boolean.class; + for (BasicClassDetector detector : basicClassDetector) { + Class detectedClass = detector.detectBasicClass(type); + if (detectedClass != null) { + return detectedClass; + } } return type; } @@ -146,5 +142,4 @@ public Class clazz() { } } - } From f051d978e2f329de8f30455e6ab658789e328f1c Mon Sep 17 00:00:00 2001 From: Parthgajera056 <149322319+Parthgajera056@users.noreply.github.com> Date: Sat, 30 Mar 2024 03:28:02 -0300 Subject: [PATCH 198/257] Refactored code for increased optimization. 
(#1139) * refactoring by decompose conditional technique * refactoring by introduction explaining variable technique * refactoring by rename method/variable technique * refactoring by introducing explaining variable technique * Added Extract class refactoring to increase maintainablilty * Refactoring using replace conditional with polymorphism --- .../main/java/us/codecraft/webmagic/Page.java | 19 ++++-- .../downloader/HttpClientGenerator.java | 3 +- .../webmagic/model/HttpRequestBody.java | 2 +- .../webmagic/selector/ElementsUtil.java | 53 ++++++++++++++++ .../codecraft/webmagic/selector/HtmlNode.java | 63 +++---------------- .../webmagic/configurable/ExtractRule.java | 27 ++++---- .../configurable/SelectorFactory.java | 57 +++++++++++++++++ 7 files changed, 150 insertions(+), 74 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index b4c161a9a..dc87ece87 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -169,18 +169,25 @@ public void addTargetRequests(Iterable requests, long priority) { * @param priority Priority for the URL */ private void addRequestIfValid(String url, long priority) { - if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { - return; + boolean isBlankUrl = StringUtils.isBlank(url); + boolean isHashSymbol = url.equals("#"); + boolean isJavaScript = url.startsWith("javascript:"); + + if (isBlankUrl || isHashSymbol || isJavaScript) { + return; // Invalid URL, so no further processing is needed. 
} String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); - Request req = new Request(canonicalizedUrl); - if(priority > 0) { - req.setPriority(priority); + Request request = new Request(canonicalizedUrl); + + if (priority > 0) { + request.setPriority(priority); } - targetRequests.add(req); + + targetRequests.add(request); } + /** * add url to fetch * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 167a5e1c6..f32a4eba8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -40,13 +40,14 @@ public class HttpClientGenerator { private PoolingHttpClientConnectionManager connectionManager; + private static final int DEFAULT_MAX_PER_ROUTE = 100; public HttpClientGenerator() { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", buildSSLConnectionSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); - connectionManager.setDefaultMaxPerRoute(100); + connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE); } private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 7d3b30785..23606d86a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -64,7 +64,7 @@ public void setEncoding(String encoding) { this.encoding = encoding; } - public static HttpRequestBody json(String json, String encoding) { + public static HttpRequestBody createJsonRequestBody(String json, String 
encoding) { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java new file mode 100644 index 000000000..10873c710 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.List; +import java.util.ListIterator; + +public class ElementsUtil { + HtmlNode htmlNode = new HtmlNode(); + public Selectable selectElements(BaseElementSelector elementSelector) { + ListIterator elementIterator = htmlNode.getElements().listIterator(); + if (!elementSelector.hasAttribute()) { + List resultElements = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectElements = elementSelector.selectElements(element); + resultElements.addAll(selectElements); + } + return new HtmlNode(resultElements); + } else { + // has attribute, consider as plaintext + List resultStrings = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectList = elementSelector.selectList(element); + resultStrings.addAll(selectList); + } + return new PlainText(resultStrings); + + } + } + + /** + * Only document can be select + * See: https://github.com/code4craft/webmagic/issues/113 + * + * @param elementIterator elementIterator + * @return element element + */ + public Element checkElementAndConvert(ListIterator elementIterator) { + Element element = elementIterator.next(); + if (!(element instanceof Document)) { + Document root = new Document(element.ownerDocument().baseUri()); + Element clone = element.clone(); + 
root.appendChild(clone); + elementIterator.set(root); + return root; + } + return element; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 85ff5fa69..32a8b976e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -33,19 +33,22 @@ public Selectable smartContent() { @Override public Selectable links() { - return selectElements(new LinksSelector()); + ElementsUtil elementsUtil = new ElementsUtil(); + return elementsUtil.selectElements(new LinksSelector()); } @Override public Selectable xpath(String xpath) { + ElementsUtil elementsUtil = new ElementsUtil(); XpathSelector xpathSelector = Selectors.xpath(xpath); - return selectElements(xpathSelector); + return elementsUtil.selectElements(xpathSelector); } @Override public Selectable selectList(Selector selector) { if (selector instanceof BaseElementSelector) { - return selectElements((BaseElementSelector) selector); + ElementsUtil elementsUtil = new ElementsUtil(); + return elementsUtil.selectElements((BaseElementSelector) selector); } return selectList(selector, getSourceTexts()); } @@ -55,64 +58,18 @@ public Selectable select(Selector selector) { return selectList(selector); } - /** - * select elements - * - * @param elementSelector elementSelector - * @return result - */ - protected Selectable selectElements(BaseElementSelector elementSelector) { - ListIterator elementIterator = getElements().listIterator(); - if (!elementSelector.hasAttribute()) { - List resultElements = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectElements = elementSelector.selectElements(element); - resultElements.addAll(selectElements); - } - return new HtmlNode(resultElements); - } else { - // has attribute, consider as 
plaintext - List resultStrings = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectList = elementSelector.selectList(element); - resultStrings.addAll(selectList); - } - return new PlainText(resultStrings); - - } - } - - /** - * Only document can be select - * See: https://github.com/code4craft/webmagic/issues/113 - * - * @param elementIterator elementIterator - * @return element element - */ - private Element checkElementAndConvert(ListIterator elementIterator) { - Element element = elementIterator.next(); - if (!(element instanceof Document)) { - Document root = new Document(element.ownerDocument().baseUri()); - Element clone = element.clone(); - root.appendChild(clone); - elementIterator.set(root); - return root; - } - return element; - } - @Override public Selectable $(String selector) { + ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector); - return selectElements(cssSelector); + return elementsUtil.selectElements(cssSelector); } @Override public Selectable $(String selector, String attrName) { + ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector, attrName); - return selectElements(cssSelector); + return elementsUtil.selectElements(cssSelector); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java index bbc48ddae..5596cfc7f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -76,26 +76,27 @@ public Selector getSelector() { } private Selector compileSelector() { + SelectorFactory factory; switch (expressionType) { case Css: - if (expressionParams.length >= 1) { - return $(expressionValue, expressionParams[0]); - } else 
{ - return $(expressionValue); - } + factory = new CssSelectorFactory(); + break; case XPath: - return xpath(expressionValue); + factory = new XPathSelectorFactory(); + break; case Regex: - if (expressionParams.length >= 1) { - return regex(expressionValue, Integer.parseInt(expressionParams[0])); - } else { - return regex(expressionValue); - } + factory = new RegexSelectorFactory(); + break; case JsonPath: - return new JsonPathSelector(expressionValue); + factory = new JsonPathSelectorFactory(); + break; default: - return xpath(expressionValue); + factory = new XPathSelectorFactory(); // Default to XPath } + + SelectorCompiler selectorCompiler = new SelectorCompiler(factory); + Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams); + return compiledSelector; } public void setSelector(Selector selector) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java new file mode 100644 index 000000000..7bca4ba7a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java @@ -0,0 +1,57 @@ +package us.codecraft.webmagic.configurable; + +import us.codecraft.webmagic.selector.JsonPathSelector; +import us.codecraft.webmagic.selector.Selector; + +import static us.codecraft.webmagic.selector.Selectors.*; +public interface SelectorFactory { + Selector compileSelector(String expressionValue, String[] expressionParams); +} + +class CssSelectorFactory implements SelectorFactory { + @Override + public Selector compileSelector(String expressionValue, String[] expressionParams) { + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } + } +} + +class XPathSelectorFactory implements SelectorFactory { + @Override + public Selector compileSelector(String expressionValue, String[] 
expressionParams) { + return xpath(expressionValue); + } +} + +class RegexSelectorFactory implements SelectorFactory { + @Override + public Selector compileSelector(String expressionValue, String[] expressionParams) { + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } + } +} + +class JsonPathSelectorFactory implements SelectorFactory { + @Override + public Selector compileSelector(String expressionValue, String[] expressionParams) { + return new JsonPathSelector(expressionValue); + } +} + +class SelectorCompiler { + private final SelectorFactory selectorFactory; + + public SelectorCompiler(SelectorFactory selectorFactory) { + this.selectorFactory = selectorFactory; + } + + public Selector compileSelector(String expressionValue, String[] expressionParams) { + return selectorFactory.compileSelector(expressionValue, expressionParams); + } +} \ No newline at end of file From 31548deb93b91b9550a3bfe31aad85d2747a78b8 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 30 Mar 2024 14:37:55 +0800 Subject: [PATCH 199/257] Revert "Refactored code for increased optimization. (#1139)" (#1153) This reverts commit f051d978e2f329de8f30455e6ab658789e328f1c. 
--- .../main/java/us/codecraft/webmagic/Page.java | 19 ++---- .../downloader/HttpClientGenerator.java | 3 +- .../webmagic/model/HttpRequestBody.java | 2 +- .../webmagic/selector/ElementsUtil.java | 53 ---------------- .../codecraft/webmagic/selector/HtmlNode.java | 63 ++++++++++++++++--- .../webmagic/configurable/ExtractRule.java | 27 ++++---- .../configurable/SelectorFactory.java | 57 ----------------- 7 files changed, 74 insertions(+), 150 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index dc87ece87..b4c161a9a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -169,25 +169,18 @@ public void addTargetRequests(Iterable requests, long priority) { * @param priority Priority for the URL */ private void addRequestIfValid(String url, long priority) { - boolean isBlankUrl = StringUtils.isBlank(url); - boolean isHashSymbol = url.equals("#"); - boolean isJavaScript = url.startsWith("javascript:"); - - if (isBlankUrl || isHashSymbol || isJavaScript) { - return; // Invalid URL, so no further processing is needed. 
+ if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { + return; } String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); - Request request = new Request(canonicalizedUrl); - - if (priority > 0) { - request.setPriority(priority); + Request req = new Request(canonicalizedUrl); + if(priority > 0) { + req.setPriority(priority); } - - targetRequests.add(request); + targetRequests.add(req); } - /** * add url to fetch * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index f32a4eba8..167a5e1c6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -40,14 +40,13 @@ public class HttpClientGenerator { private PoolingHttpClientConnectionManager connectionManager; - private static final int DEFAULT_MAX_PER_ROUTE = 100; public HttpClientGenerator() { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", buildSSLConnectionSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); - connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE); + connectionManager.setDefaultMaxPerRoute(100); } private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 23606d86a..7d3b30785 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -64,7 +64,7 @@ public void setEncoding(String encoding) { this.encoding = encoding; } - public static HttpRequestBody 
createJsonRequestBody(String json, String encoding) { + public static HttpRequestBody json(String json, String encoding) { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java deleted file mode 100644 index 10873c710..000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java +++ /dev/null @@ -1,53 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.util.ArrayList; -import java.util.List; -import java.util.ListIterator; - -public class ElementsUtil { - HtmlNode htmlNode = new HtmlNode(); - public Selectable selectElements(BaseElementSelector elementSelector) { - ListIterator elementIterator = htmlNode.getElements().listIterator(); - if (!elementSelector.hasAttribute()) { - List resultElements = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectElements = elementSelector.selectElements(element); - resultElements.addAll(selectElements); - } - return new HtmlNode(resultElements); - } else { - // has attribute, consider as plaintext - List resultStrings = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectList = elementSelector.selectList(element); - resultStrings.addAll(selectList); - } - return new PlainText(resultStrings); - - } - } - - /** - * Only document can be select - * See: https://github.com/code4craft/webmagic/issues/113 - * - * @param elementIterator elementIterator - * @return element element - */ - public Element checkElementAndConvert(ListIterator elementIterator) { - Element element = elementIterator.next(); - if (!(element instanceof 
Document)) { - Document root = new Document(element.ownerDocument().baseUri()); - Element clone = element.clone(); - root.appendChild(clone); - elementIterator.set(root); - return root; - } - return element; - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 32a8b976e..85ff5fa69 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -33,22 +33,19 @@ public Selectable smartContent() { @Override public Selectable links() { - ElementsUtil elementsUtil = new ElementsUtil(); - return elementsUtil.selectElements(new LinksSelector()); + return selectElements(new LinksSelector()); } @Override public Selectable xpath(String xpath) { - ElementsUtil elementsUtil = new ElementsUtil(); XpathSelector xpathSelector = Selectors.xpath(xpath); - return elementsUtil.selectElements(xpathSelector); + return selectElements(xpathSelector); } @Override public Selectable selectList(Selector selector) { if (selector instanceof BaseElementSelector) { - ElementsUtil elementsUtil = new ElementsUtil(); - return elementsUtil.selectElements((BaseElementSelector) selector); + return selectElements((BaseElementSelector) selector); } return selectList(selector, getSourceTexts()); } @@ -58,18 +55,64 @@ public Selectable select(Selector selector) { return selectList(selector); } + /** + * select elements + * + * @param elementSelector elementSelector + * @return result + */ + protected Selectable selectElements(BaseElementSelector elementSelector) { + ListIterator elementIterator = getElements().listIterator(); + if (!elementSelector.hasAttribute()) { + List resultElements = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectElements = elementSelector.selectElements(element); + 
resultElements.addAll(selectElements); + } + return new HtmlNode(resultElements); + } else { + // has attribute, consider as plaintext + List resultStrings = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectList = elementSelector.selectList(element); + resultStrings.addAll(selectList); + } + return new PlainText(resultStrings); + + } + } + + /** + * Only document can be select + * See: https://github.com/code4craft/webmagic/issues/113 + * + * @param elementIterator elementIterator + * @return element element + */ + private Element checkElementAndConvert(ListIterator elementIterator) { + Element element = elementIterator.next(); + if (!(element instanceof Document)) { + Document root = new Document(element.ownerDocument().baseUri()); + Element clone = element.clone(); + root.appendChild(clone); + elementIterator.set(root); + return root; + } + return element; + } + @Override public Selectable $(String selector) { - ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector); - return elementsUtil.selectElements(cssSelector); + return selectElements(cssSelector); } @Override public Selectable $(String selector, String attrName) { - ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector, attrName); - return elementsUtil.selectElements(cssSelector); + return selectElements(cssSelector); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java index 5596cfc7f..bbc48ddae 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -76,27 +76,26 @@ public Selector getSelector() { } private Selector compileSelector() { - SelectorFactory factory; switch 
(expressionType) { case Css: - factory = new CssSelectorFactory(); - break; + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } case XPath: - factory = new XPathSelectorFactory(); - break; + return xpath(expressionValue); case Regex: - factory = new RegexSelectorFactory(); - break; + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } case JsonPath: - factory = new JsonPathSelectorFactory(); - break; + return new JsonPathSelector(expressionValue); default: - factory = new XPathSelectorFactory(); // Default to XPath + return xpath(expressionValue); } - - SelectorCompiler selectorCompiler = new SelectorCompiler(factory); - Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams); - return compiledSelector; } public void setSelector(Selector selector) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java deleted file mode 100644 index 7bca4ba7a..000000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java +++ /dev/null @@ -1,57 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import us.codecraft.webmagic.selector.JsonPathSelector; -import us.codecraft.webmagic.selector.Selector; - -import static us.codecraft.webmagic.selector.Selectors.*; -public interface SelectorFactory { - Selector compileSelector(String expressionValue, String[] expressionParams); -} - -class CssSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - if (expressionParams.length >= 1) { - return $(expressionValue, expressionParams[0]); - } else { - return $(expressionValue); - } - } -} - -class 
XPathSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return xpath(expressionValue); - } -} - -class RegexSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - if (expressionParams.length >= 1) { - return regex(expressionValue, Integer.parseInt(expressionParams[0])); - } else { - return regex(expressionValue); - } - } -} - -class JsonPathSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return new JsonPathSelector(expressionValue); - } -} - -class SelectorCompiler { - private final SelectorFactory selectorFactory; - - public SelectorCompiler(SelectorFactory selectorFactory) { - this.selectorFactory = selectorFactory; - } - - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return selectorFactory.compileSelector(expressionValue, expressionParams); - } -} \ No newline at end of file From 0ceaf14882b87fe8606386e16df3ba701e2ad547 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 3 Apr 2024 01:00:47 +0800 Subject: [PATCH 200/257] Bump version number from 0.10.1-SNAPSHOT to 1.0.0-SNAPSHOT for Java version updating from 1.8 to 11, refs #1134. 
--- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index c90394a30..46993a962 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index a6eff4063..98e513c01 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index f9a2f50c8..e1e650276 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index e68385967..2ffedf291 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index ad7fae4ce..8b50671aa 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index bff1de3f6..7530e0a91 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index c81c5613b..02d440017 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml 
b/webmagic-selenium/pom.xml index 8381c0275..92a11795a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1-SNAPSHOT + 1.0.0-SNAPSHOT 4.0.0 From 2c730eb978191befca63f5a805e88317f13e4470 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 3 Apr 2024 01:14:36 +0800 Subject: [PATCH 201/257] Update Java version from 1.8 to 11, refs #1134. --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 46993a962..fa7ad3f78 100644 --- a/pom.xml +++ b/pom.xml @@ -7,8 +7,8 @@ UTF-8 UTF-8 - 1.8 - 1.8 + 11 + 11 3.23.1 1.5.0 4.4 From 4ebf48f6e3bf0a7057650d0f6c7045a699e9be25 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 3 Apr 2024 18:26:01 +0800 Subject: [PATCH 202/257] Replace log4j 1.x with log4j 2.x, refs #534. --- pom.xml | 42 +++++++++++-------- webmagic-core/pom.xml | 6 --- webmagic-core/src/main/resources/log4j.xml | 21 ---------- webmagic-core/src/test/resources/log4j.xml | 21 ---------- .../src/test/resources/log4j2-test.xml | 16 +++++++ .../src/main/resources/log4j.xml | 21 ---------- .../src/test/resources/log4j.xml | 21 ---------- .../src/test/resources/log4j2-test.xml | 16 +++++++ webmagic-samples/src/main/resources/log4j.xml | 26 ------------ .../src/main/resources/log4j2.xml | 19 +++++++++ webmagic-scripts/pom.xml | 12 ++++-- .../webmagic/scripts/ScriptConsole.java | 8 ++-- webmagic-scripts/src/main/resources/log4j.xml | 21 ---------- webmagic-scripts/src/test/resouces/log4j.xml | 21 ---------- .../src/test/resources/log4j2-test.xml | 16 +++++++ 15 files changed, 105 insertions(+), 182 deletions(-) delete mode 100644 webmagic-core/src/main/resources/log4j.xml delete mode 100644 webmagic-core/src/test/resources/log4j.xml create mode 100644 webmagic-core/src/test/resources/log4j2-test.xml delete mode 100644 webmagic-extension/src/main/resources/log4j.xml delete mode 100644 webmagic-extension/src/test/resources/log4j.xml create mode 100644 
webmagic-extension/src/test/resources/log4j2-test.xml delete mode 100644 webmagic-samples/src/main/resources/log4j.xml create mode 100644 webmagic-samples/src/main/resources/log4j2.xml delete mode 100755 webmagic-scripts/src/main/resources/log4j.xml delete mode 100755 webmagic-scripts/src/test/resouces/log4j.xml create mode 100644 webmagic-scripts/src/test/resources/log4j2-test.xml diff --git a/pom.xml b/pom.xml index fa7ad3f78..36d060577 100644 --- a/pom.xml +++ b/pom.xml @@ -25,11 +25,11 @@ 2.8.0 4.13.2 2.7.3 - 1.2.17 + 2.23.1 2.0.2-beta 1.3.0 1.2.0 - 11.4 + 12.4 4.14.1 2.0.4 4.0.0.RELEASE @@ -77,6 +77,19 @@ webmagic-coverage + + + org.apache.logging.log4j + log4j-core + test + + + org.apache.logging.log4j + log4j-slf4j2-impl + test + + + @@ -101,6 +114,16 @@ httpcore ${httpcore.version} + + org.apache.logging.log4j + log4j-core + ${log4j2.version} + + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j2.version} + com.google.guava guava @@ -116,11 +139,6 @@ slf4j-api ${slf4j.version} - - org.slf4j - slf4j-log4j12 - ${slf4j.version} - us.codecraft xsoup @@ -143,11 +161,6 @@ - - log4j - log4j - ${log4j.version} - org.assertj assertj-core @@ -274,11 +287,6 @@ org.apache.maven.plugins maven-jar-plugin - - - log4j.xml - - org.apache.maven.plugins diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 98e513c01..37f1d0071 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -45,12 +45,6 @@ mockito-all - - org.slf4j - slf4j-log4j12 - true - - org.apache.commons commons-collections4 diff --git a/webmagic-core/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-core/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- 
a/webmagic-core/src/test/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-core/src/test/resources/log4j2-test.xml b/webmagic-core/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..86aee5f59 --- /dev/null +++ b/webmagic-core/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-extension/src/main/resources/log4j.xml b/webmagic-extension/src/main/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-extension/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-extension/src/test/resources/log4j.xml b/webmagic-extension/src/test/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-extension/src/test/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-extension/src/test/resources/log4j2-test.xml b/webmagic-extension/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..86aee5f59 --- /dev/null +++ b/webmagic-extension/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-samples/src/main/resources/log4j.xml b/webmagic-samples/src/main/resources/log4j.xml deleted file mode 100644 index a6630f813..000000000 --- a/webmagic-samples/src/main/resources/log4j.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-samples/src/main/resources/log4j2.xml b/webmagic-samples/src/main/resources/log4j2.xml new file mode 100644 index 000000000..f3bad53d8 --- /dev/null +++ b/webmagic-samples/src/main/resources/log4j2.xml @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 02d440017..243eb829f 100644 --- a/webmagic-scripts/pom.xml +++ 
b/webmagic-scripts/pom.xml @@ -13,6 +13,14 @@ + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-slf4j2-impl + org.jruby jruby @@ -40,10 +48,6 @@ webmagic-core ${project.version} - - org.slf4j - slf4j-log4j12 - ${project.groupId} webmagic-extension diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 0423e58e1..2ccfe7f4e 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -1,8 +1,10 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.cli.*; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.core.Logger; +import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; @@ -166,7 +168,7 @@ private static Params readOptions(CommandLine commandLine) { } private static void configLogger(String value) { - Logger rootLogger = Logger.getRootLogger(); + Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); if ("debug".equalsIgnoreCase(value)) { rootLogger.setLevel(Level.DEBUG); } else if ("info".equalsIgnoreCase(value)) { diff --git a/webmagic-scripts/src/main/resources/log4j.xml b/webmagic-scripts/src/main/resources/log4j.xml deleted file mode 100755 index 474269cb1..000000000 --- a/webmagic-scripts/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml deleted file mode 100755 index 1f64d8dad..000000000 --- a/webmagic-scripts/src/test/resouces/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - 
- diff --git a/webmagic-scripts/src/test/resources/log4j2-test.xml b/webmagic-scripts/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..e2fab6602 --- /dev/null +++ b/webmagic-scripts/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + From ed7429c29322a2755299e801a978f259bf69495c Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 3 Apr 2024 19:37:56 +0800 Subject: [PATCH 203/257] Rename webmagic-parent to webmagic. --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 4 ++-- webmagic-selenium/pom.xml | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pom.xml b/pom.xml index 36d060577..3b8169798 100644 --- a/pom.xml +++ b/pom.xml @@ -35,8 +35,8 @@ 4.0.0.RELEASE 0.3.5 - webmagic-parent - webmagic-parent + webmagic + webmagic A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler. 
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 37f1d0071..877124fc3 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -2,7 +2,7 @@ us.codecraft - webmagic-parent + webmagic 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e1e650276..c17309c87 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -7,7 +7,7 @@ us.codecraft - webmagic-parent + webmagic 1.0.0-SNAPSHOT diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 2ffedf291..a234a4f7a 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -2,7 +2,7 @@ us.codecraft - webmagic-parent + webmagic 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 8b50671aa..9de8bcb4c 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -1,8 +1,8 @@ - webmagic-parent us.codecraft + webmagic 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 7530e0a91..28b921093 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -1,8 +1,8 @@ - webmagic-parent us.codecraft + webmagic 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 243eb829f..7a294e18c 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -1,8 +1,8 @@ - webmagic-parent us.codecraft + webmagic 1.0.0-SNAPSHOT 4.0.0 @@ -94,4 +94,4 @@ - \ No newline at end of file + diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 92a11795a..87de28eee 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -1,8 +1,8 @@ - webmagic-parent us.codecraft + webmagic 1.0.0-SNAPSHOT 4.0.0 From 383bea32f6ba5338c65a244d49293d3a34038318 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:44:15 +0800 Subject: [PATCH 204/257] Bump com.jayway.jsonpath:json-path from 2.8.0 to 2.9.0 (#1154) Bumps 
[com.jayway.jsonpath:json-path](https://github.com/jayway/JsonPath) from 2.8.0 to 2.9.0. - [Release notes](https://github.com/jayway/JsonPath/releases) - [Changelog](https://github.com/json-path/JsonPath/blob/master/changelog.md) - [Commits](https://github.com/jayway/JsonPath/compare/json-path-2.8.0...json-path-2.9.0) --- updated-dependencies: - dependency-name: com.jayway.jsonpath:json-path dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3b8169798..96bf09ae2 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ 4.4.15 3.7.1 9.3.9.0 - 2.8.0 + 2.9.0 4.13.2 2.7.3 2.23.1 From f10fabcb5830c305cb53bd886fde5393a23a224c Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 3 Apr 2024 20:21:02 +0800 Subject: [PATCH 205/257] Update .gitignore, with merging Maven.gitignore & Global/Eclipse.gitignore in github/gitignore. 
--- .gitignore | 82 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 0175dbaad..3a839a5f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,77 @@ -target -*.iml -out/ -.idea -.classpath +target/ +pom.xml.tag +pom.xml.releaseBackup +pom.xml.versionsBackup +pom.xml.next +release.properties +dependency-reduced-pom.xml +buildNumber.properties +.mvn/timing.properties +# https://github.com/takari/maven-wrapper#usage-without-binary-jar +.mvn/wrapper/maven-wrapper.jar + +# Eclipse m2e generated files +# Eclipse Core .project -.settings/ +# JDT-specific (Eclipse Java Development Tools) +.classpath +.metadata bin/ -.myeclipse +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ +.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Uncomment this line if you wish to ignore the project description file. 
+# Typically, this file would be tracked if it contains build/dependency configurations: +#.project From 05e5eefc7d9e7dd8fd8b85cb297b2f5e30f56e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gibier?= Date: Fri, 5 Apr 2024 15:51:08 +0200 Subject: [PATCH 206/257] Refactor of processSingle in PageModelExtractor (#1155) --- webmagic-extension/pom.xml | 6 + .../codecraft/webmagic/model/Extractor.java | 24 +-- .../webmagic/model/FieldExtractor.java | 40 +---- .../webmagic/model/PageModelExtractor.java | 154 ++---------------- .../webmagic/model/fields/MultipleField.java | 42 +++++ .../webmagic/model/fields/PageField.java | 31 ++++ .../webmagic/model/fields/SingleField.java | 28 ++++ .../model/selections/MultipleSelection.java | 36 ++++ .../webmagic/model/selections/Selection.java | 9 + .../model/selections/SingleSelection.java | 33 ++++ .../webmagic/utils/DoubleKeyMap.java | 0 .../webmagic/utils/MultiKeyMapBase.java | 0 12 files changed, 217 insertions(+), 186 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java mode change 100755 => 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java mode change 100755 => 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index a234a4f7a..8d2c07003 100644 --- a/webmagic-extension/pom.xml +++ 
b/webmagic-extension/pom.xml @@ -10,6 +10,12 @@ webmagic-extension + + org.projectlombok + lombok + 1.18.32 + provided + redis.clients jedis diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index f1d2f84d4..d64adffd7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.model; +import lombok.Getter; +import lombok.Setter; import us.codecraft.webmagic.selector.Selector; /** @@ -7,17 +9,19 @@ * @author code4crafter@gmail.com
* @since 0.2.0 */ -class Extractor { +public class Extractor { + @Getter @Setter protected Selector selector; + @Getter protected final Source source; protected final boolean notNull; protected final boolean multi; - static enum Source {Html, Url, RawHtml, RawText} + public static enum Source {Html, Url, RawHtml, RawText} public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; @@ -26,23 +30,11 @@ public Extractor(Selector selector, Source source, boolean notNull, boolean mult this.multi = multi; } - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - boolean isNotNull() { + public boolean isNotNull() { return notNull; } - boolean isMulti() { + public boolean isMulti() { return multi; } - - void setSelector(Selector selector) { - this.selector = selector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a2cba1332..a49ea7766 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -6,53 +6,27 @@ import java.lang.reflect.Field; import java.lang.reflect.Method; +import lombok.Getter; +import lombok.Setter; + /** * Wrapper of field and extractor. * @author code4crafter@gmail.com
* @since 0.2.0 */ -class FieldExtractor extends Extractor { +public class FieldExtractor extends Extractor { + @Getter private final Field field; + @Getter @Setter private Method setterMethod; + @Getter @Setter private ObjectFormatter objectFormatter; public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { super(selector, source, notNull, multi); this.field = field; } - - Field getField() { - return field; - } - - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - void setSetterMethod(Method setterMethod) { - this.setterMethod = setterMethod; - } - - Method getSetterMethod() { - return setterMethod; - } - - boolean isNotNull() { - return notNull; - } - - ObjectFormatter getObjectFormatter() { - return objectFormatter; - } - - void setObjectFormatter(ObjectFormatter objectFormatter) { - this.objectFormatter = objectFormatter; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index d8947ded6..de71717fd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -3,17 +3,21 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import lombok.Getter; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; -import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; +import us.codecraft.webmagic.model.selections.MultipleSelection; +import us.codecraft.webmagic.model.selections.Selection; +import us.codecraft.webmagic.model.selections.SingleSelection; import us.codecraft.webmagic.selector.*; import 
us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; import java.lang.annotation.Annotation; import java.lang.reflect.Field; -import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; @@ -29,14 +33,19 @@ */ class PageModelExtractor { + @Getter private List targetUrlPatterns = new ArrayList(); + @Getter private Selector targetUrlRegionSelector; + @Getter private List helpUrlPatterns = new ArrayList(); + @Getter private Selector helpUrlRegionSelector; + @Getter private Class clazz; private List fieldExtractors; @@ -233,145 +242,16 @@ private Object processSingle(Page page, String html, boolean isRaw) { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - if (fieldExtractor.isMulti()) { - List value=getMultiValueFromSource(page, fieldExtractor, html, isRaw); - if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - List converted = convertMultiValue(value, fieldExtractor.getObjectFormatter()); - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } else { - String value=getSingleValueFromSource(page, fieldExtractor, html, isRaw); - if (value == null && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - Object converted = convertSingleValue(value, fieldExtractor.getObjectFormatter()); - if (converted == null && fieldExtractor.isNotNull()) { - return null; - } - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } + Selection selection = fieldExtractor.isMulti() ? 
new MultipleSelection() : new SingleSelection(); + PageField field = selection.extractField(page, html, isRaw, fieldExtractor); + if (!field.operation(o, fieldExtractor, logger)) + return null; } - if (AfterExtractor.class.isAssignableFrom(clazz)) { + if (AfterExtractor.class.isAssignableFrom(clazz)) ((AfterExtractor) o).afterProcess(page); - } - } catch (InstantiationException e) { - logger.error("extract fail", e); - } catch (IllegalAccessException e) { - logger.error("extract fail", e); - } catch (InvocationTargetException e) { + } catch (Exception e) { logger.error("extract fail", e); } return o; } - - private List getMultiValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { - List value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().selectList(html); - } - break; - case Url: - value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().selectList(html); - } - return value; - } - - private String getSingleValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) { - String value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().select(html); - } - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - value 
= fieldExtractor.getSelector().select(html); - } - return value; - } - - private Object convertSingleValue(String value, ObjectFormatter objectFormatter) { - try { - Object format = objectFormatter.format(value); - logger.debug("String {} is converted to {}", value, format); - return format; - } catch (Exception e) { - logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); - } - return null; - } - - private List convertMultiValue(List values, ObjectFormatter objectFormatter) { - List objects = new ArrayList(); - for (String value : values) { - Object converted = convertSingleValue(value, objectFormatter); - if (converted != null) { - objects.add(converted); - } - } - return objects; - } - - private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { - if (value == null) { - return; - } - if (fieldExtractor.getSetterMethod() != null) { - fieldExtractor.getSetterMethod().invoke(o, value); - } - fieldExtractor.getField().set(o, value); - } - - Class getClazz() { - return clazz; - } - - List getTargetUrlPatterns() { - return targetUrlPatterns; - } - - List getHelpUrlPatterns() { - return helpUrlPatterns; - } - - Selector getTargetUrlRegionSelector() { - return targetUrlRegionSelector; - } - - Selector getHelpUrlRegionSelector() { - return helpUrlRegionSelector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java new file mode 100644 index 000000000..4a4bf38a8 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; +import java.util.List; + +import org.slf4j.Logger; + +import lombok.Getter; +import 
us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public class MultipleField extends PageField { + @Getter + private List fieldNames; + + public MultipleField(List fieldNames) { + this.fieldNames = fieldNames; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull()) + return false; + if (fieldExtractor.getObjectFormatter() != null) { + List converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger); + setField(o, fieldExtractor, converted); + } + else + setField(o, fieldExtractor, this.fieldNames); + return true; + } + + private List convert(List values, ObjectFormatter objectFormatter, Logger logger) { + List objects = new ArrayList<>(); + for (String value : values) { + Object converted = this.convert(value, objectFormatter, logger); + if (converted != null) + objects.add(converted); + } + return objects; + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java new file mode 100644 index 000000000..ad4428335 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public abstract class PageField { + public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException; + + protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) 
{ + try { + Object format = objectFormatter.format(value); + logger.debug("String {} is converted to {}", value, format); + return format; + } catch (Exception e) { + logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); + } + return null; + } + + protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { + if (value != null) { + if (fieldExtractor.getSetterMethod() != null) + fieldExtractor.getSetterMethod().invoke(o, value); + fieldExtractor.getField().set(o, value); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java new file mode 100644 index 000000000..136a1c56e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import lombok.Getter; +import us.codecraft.webmagic.model.FieldExtractor; + +public class SingleField extends PageField { + @Getter + private String fieldName; + + public SingleField(String fieldName) { + this.fieldName = fieldName; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if (fieldExtractor.getObjectFormatter() != null) { + Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger); + if (converted == null && fieldExtractor.isNotNull()) + return false; + setField(o, fieldExtractor, converted); + } else + setField(o, fieldExtractor, this.fieldName); + return true; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java new file mode 100644 index 000000000..d49f9c576 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.model.selections; + +import java.util.List; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.MultipleField; + +public class MultipleSelection implements Selection { + public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + List fieldsName; + switch (fieldExtractor.getSource()) { + case RawHtml: + fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + break; + case Html: + if (isRaw) + fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + else + fieldsName = fieldExtractor.getSelector().selectList(html); + break; + case Url: + fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString()); + break; + case RawText: + fieldsName = fieldExtractor.getSelector().selectList(page.getRawText()); + break; + default: + fieldsName = fieldExtractor.getSelector().selectList(html); + } + if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) { + return null; + } + return new MultipleField(fieldsName); + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java new file mode 100644 index 000000000..e70ab9d9b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java @@ -0,0 +1,9 @@ +package us.codecraft.webmagic.model.selections; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import 
us.codecraft.webmagic.model.fields.PageField; + +public interface Selection { + public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java new file mode 100644 index 000000000..a4c1fe452 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.model.selections; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.SingleField; + +public class SingleSelection implements Selection { + public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + String field; + switch (fieldExtractor.getSource()) { + case RawHtml: + field = page.getHtml().selectDocument(fieldExtractor.getSelector()); + break; + case Html: + if (isRaw) + field = page.getHtml().selectDocument(fieldExtractor.getSelector()); + else + field = fieldExtractor.getSelector().select(html); + break; + case Url: + field = fieldExtractor.getSelector().select(page.getUrl().toString()); + break; + case RawText: + field = fieldExtractor.getSelector().select(page.getRawText()); + break; + default: + field = fieldExtractor.getSelector().select(html); + } + if (field == null && fieldExtractor.isNotNull()) + return null; + return new SingleField(field); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java old mode 100755 new mode 100644 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java old mode 100755 new 
mode 100644 From 2df7dca8711d226dd98bd0afefa4531a6d1e44b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gibier?= Date: Fri, 5 Apr 2024 16:50:21 +0200 Subject: [PATCH 207/257] Changed refactor of processSingle again, this one is a better version (#1157) * Refactor of processSingle in PageModelExtractor * Changed my refactor of processSingle, this one is a lot better * Changed my refactor of processSingle, this one is a lot better --- .../codecraft/webmagic/model/Extractor.java | 6 +- .../webmagic/model/FieldExtractor.java | 1 + .../webmagic/model/PageModelExtractor.java | 36 +++++----- .../model/selections/MultipleSelection.java | 36 ---------- .../webmagic/model/selections/Selection.java | 9 --- .../model/selections/SingleSelection.java | 33 --------- .../webmagic/model/sources/Source.java | 68 +++++++++++++++++++ .../model/sources/SourceTextExtractor.java | 17 +++++ 8 files changed, 105 insertions(+), 101 deletions(-) delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index d64adffd7..673447586 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -2,6 +2,8 @@ import lombok.Getter; import lombok.Setter; + +import us.codecraft.webmagic.model.sources.Source; import 
us.codecraft.webmagic.selector.Selector; /** @@ -20,9 +22,7 @@ public class Extractor { protected final boolean notNull; protected final boolean multi; - - public static enum Source {Html, Url, RawHtml, RawText} - + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; this.source = source; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a49ea7766..d4cb5937f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index de71717fd..751aafe76 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -9,9 +9,9 @@ import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; -import us.codecraft.webmagic.model.selections.MultipleSelection; -import us.codecraft.webmagic.model.selections.Selection; -import us.codecraft.webmagic.model.selections.SingleSelection; +import us.codecraft.webmagic.model.sources.Source; +import us.codecraft.webmagic.model.sources.SourceTextExtractor; +import us.codecraft.webmagic.model.sources.Source.*; import us.codecraft.webmagic.selector.*; import 
us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; @@ -95,7 +95,7 @@ private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { regexPattern = ".*"; } fieldExtractor = new FieldExtractor(field, - new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), + new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(), extractByUrl.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -121,7 +121,7 @@ private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) { default: selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } - fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? 
new RawHtml() : new SelectedHtml(), comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -136,26 +136,23 @@ private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - ExtractBy.Source source0 = extractBy.source(); - if (extractBy.type()== ExtractBy.Type.JsonPath){ - source0 = RawText; - } - FieldExtractor.Source source = null; - switch (source0){ + ExtractBy.Source extractSource = extractBy.source(); + if (extractBy.type()== ExtractBy.Type.JsonPath) + extractSource = RawText; + Source source = null; + switch (extractSource) { case RawText: - source = FieldExtractor.Source.RawText; + source = new RawText(); break; case RawHtml: - source = FieldExtractor.Source.RawHtml; + source = new RawHtml(); break; case SelectedHtml: - source =FieldExtractor.Source.Html; + source = new SelectedHtml(); break; default: - source =FieldExtractor.Source.Html; - + source = new SelectedHtml(); } - fieldExtractor = new FieldExtractor(field, selector, source, extractBy.notNull(), List.class.isAssignableFrom(field.getType())); fieldExtractor.setSetterMethod(getSetterMethod(clazz, field)); @@ -202,7 +199,7 @@ private void initClassExtractors() { annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi()); } } @@ -242,8 +239,7 @@ private Object processSingle(Page page, String html, boolean isRaw) { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - 
Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection(); - PageField field = selection.extractField(page, html, isRaw, fieldExtractor); + PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor); if (!field.operation(o, fieldExtractor, logger)) return null; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java deleted file mode 100644 index d49f9c576..000000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/MultipleSelection.java +++ /dev/null @@ -1,36 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import java.util.List; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import us.codecraft.webmagic.model.fields.MultipleField; - -public class MultipleSelection implements Selection { - public MultipleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { - List fieldsName; - switch (fieldExtractor.getSource()) { - case RawHtml: - fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) - fieldsName = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - else - fieldsName = fieldExtractor.getSelector().selectList(html); - break; - case Url: - fieldsName = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - fieldsName = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - fieldsName = fieldExtractor.getSelector().selectList(html); - } - if ((fieldsName == null || fieldsName.size() == 0) && fieldExtractor.isNotNull()) { - return null; - } - return new MultipleField(fieldsName); - } -} \ No newline at end of file diff --git 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java deleted file mode 100644 index e70ab9d9b..000000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/Selection.java +++ /dev/null @@ -1,9 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import us.codecraft.webmagic.model.fields.PageField; - -public interface Selection { - public PageField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java deleted file mode 100644 index a4c1fe452..000000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/selections/SingleSelection.java +++ /dev/null @@ -1,33 +0,0 @@ -package us.codecraft.webmagic.model.selections; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.model.FieldExtractor; -import us.codecraft.webmagic.model.fields.SingleField; - -public class SingleSelection implements Selection { - public SingleField extractField(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { - String field; - switch (fieldExtractor.getSource()) { - case RawHtml: - field = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) - field = page.getHtml().selectDocument(fieldExtractor.getSelector()); - else - field = fieldExtractor.getSelector().select(html); - break; - case Url: - field = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - field = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - field = fieldExtractor.getSelector().select(html); - } - if 
(field == null && fieldExtractor.isNotNull()) - return null; - return new SingleField(field); - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java new file mode 100644 index 000000000..146827220 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java @@ -0,0 +1,68 @@ +package us.codecraft.webmagic.model.sources; + +import java.util.List; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; + +public interface Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); + + public class RawHtml implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return page.getHtml().selectDocument(fieldExtractor.getSelector()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } + } + + public class SelectedHtml implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + if (isRaw) + return page.getHtml().selectDocument(fieldExtractor.getSelector()); + else + return fieldExtractor.getSelector().select(html); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + if (isRaw) + return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + else + return fieldExtractor.getSelector().selectList(html); + } + } + + public class Url implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(page.getUrl().toString()); + } + + 
public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(page.getUrl().toString()); + } + } + + public class RawText implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(page.getRawText()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(page.getRawText()); + } + } + + public class DefaultSource implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(html); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(html); + } + } +} + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java new file mode 100644 index 000000000..1e572695f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.model.sources; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.MultipleField; +import us.codecraft.webmagic.model.fields.PageField; +import us.codecraft.webmagic.model.fields.SingleField; + +public class SourceTextExtractor { + public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + Source source = fieldExtractor.getSource(); + if (fieldExtractor.isMulti()) + return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor)); + else + return new SingleField(source.getText(page, html, isRaw, 
fieldExtractor)); + } +} \ No newline at end of file From d8321baf560e4d5742909c33d8f1dacee590fea0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Gibier?= Date: Sat, 6 Apr 2024 01:55:46 +0200 Subject: [PATCH 208/257] Refactored and implement of a template method pattern for logger config in webmagic-scripts (#1158) * Refactor of processSingle in PageModelExtractor * Changed my refactor of processSingle, this one is a lot better * Changed my refactor of processSingle, this one is a lot better * add lombok for getters and setters * Refactored and implement of a template method pattern for logger config --- webmagic-scripts/pom.xml | 6 + .../us/codecraft/webmagic/scripts/Params.java | 47 +++++++ .../webmagic/scripts/ScriptConsole.java | 117 +----------------- .../webmagic/scripts/ScriptEnginePool.java | 6 +- .../webmagic/scripts/ScriptProcessor.java | 35 +----- .../scripts/ScriptProcessorBuilder.java | 7 +- .../scripts/config/CommandLineOption.java | 82 ++++++++++++ .../webmagic/scripts/config/ConfigLogger.java | 34 +++++ .../webmagic/scripts/languages/JRuby.java | 26 ++++ .../scripts/languages/Javascript.java | 16 +++ .../webmagic/scripts/languages/Jython.java | 27 ++++ .../scripts/{ => languages}/Language.java | 29 +++-- .../webmagic/scripts/ScriptProcessorTest.java | 10 +- 13 files changed, 274 insertions(+), 168 deletions(-) create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/CommandLineOption.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java rename 
webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/{ => languages}/Language.java (51%) mode change 100755 => 100644 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 7a294e18c..aa5a47981 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -53,6 +53,12 @@ webmagic-extension ${project.version} + + org.projectlombok + lombok + 1.18.32 + provided + diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java new file mode 100644 index 000000000..873176e6e --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.scripts; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import lombok.Getter; +import lombok.Setter; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Language; +import us.codecraft.webmagic.utils.WMCollections; + +public class Params { + @Getter + Language language = new Javascript(); + + @Getter @Setter + String scriptFileName; + + @Getter @Setter + List urls; + + @Getter @Setter + int thread = 1; + + @Getter @Setter + int sleepTime = 1000; + + private static Map> alias; + + public Params() { + alias = new HashMap>(); + alias.put(new Javascript(), WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); + alias.put(new JRuby(), WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); + } + + public void setLanguagefromArg(String arg) { + for (Map.Entry> languageSetEntry : alias.entrySet()) { + if (languageSetEntry.getValue().contains(arg)) { + this.language = languageSetEntry.getKey(); + return; + } + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java 
b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 2ccfe7f4e..c60b3ec3d 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -1,90 +1,21 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.cli.*; -import org.apache.logging.log4j.Level; -import org.apache.logging.log4j.core.Logger; -import org.slf4j.LoggerFactory; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.scripts.config.CommandLineOption; import us.codecraft.webmagic.utils.WMCollections; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Set; /** - * @author code4crafter@gmail.com + * @author code4crafter@gmail.com / FrancoisGib * @since 0.4.1 */ public class ScriptConsole { - - private static class Params { - Language language = Language.JavaScript; - String scriptFileName; - List urls; - int thread = 1; - int sleepTime = 1000; - private static Map> alias = new HashMap>(); - - static { - alias.put(Language.JavaScript, WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); - alias.put(Language.JRuby, WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); - } - - public void setLanguagefromArg(String arg) { - for (Map.Entry> languageSetEntry : alias.entrySet()) { - if (languageSetEntry.getValue().contains(arg)) { - this.language = languageSetEntry.getKey(); - return; - } - } - } - - private Language getLanguage() { - return language; - } - - private void setLanguage(Language language) { - this.language = language; - } - - private String getScriptFileName() { - return scriptFileName; - } - - private void setScriptFileName(String scriptFileName) { - this.scriptFileName = scriptFileName; - } - - private List getUrls() { - return urls; - 
} - - private void setUrls(List urls) { - this.urls = urls; - } - - private int getThread() { - return thread; - } - - private void setThread(int thread) { - this.thread = thread; - } - - private int getSleepTime() { - return sleepTime; - } - - private void setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - } - } - public static void main(String[] args) { Params params = parseCommand(args); startSpider(params); @@ -142,45 +73,9 @@ private static void exit() { private static Params readOptions(CommandLine commandLine) { Params params = new Params(); - if (commandLine.hasOption("l")) { - String language = commandLine.getOptionValue("l"); - params.setLanguagefromArg(language); - } - if (commandLine.hasOption("f")) { - String scriptFilename = commandLine.getOptionValue("f"); - params.setScriptFileName(scriptFilename); - } else { - exit(); - } - if (commandLine.hasOption("s")) { - Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); - params.setSleepTime(sleepTime); - } - if (commandLine.hasOption("t")) { - Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); - params.setThread(thread); - } - if (commandLine.hasOption("g")) { - configLogger(commandLine.getOptionValue("g")); - } - params.setUrls(commandLine.getArgList()); + List options = CommandLineOption.getAllOptions(); + for (CommandLineOption option : options) + option.addParamOptionIfInCommandLine(params, commandLine); return params; } - - private static void configLogger(String value) { - Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); - if ("debug".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.DEBUG); - } else if ("info".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.INFO); - } else if ("warn".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.WARN); - } else if ("trace".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.TRACE); - } else if ("off".equalsIgnoreCase(value)) { - 
rootLogger.setLevel(Level.OFF); - } else if ("error".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.ERROR); - } - } -} +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java index d1e5d7fe8..bdfbbaedb 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java @@ -2,6 +2,9 @@ import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; + +import us.codecraft.webmagic.scripts.languages.Language; + import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -11,14 +14,11 @@ */ public class ScriptEnginePool { - private final int size; - private final AtomicInteger availableCount; private final LinkedBlockingQueue scriptEngines = new LinkedBlockingQueue(); public ScriptEnginePool(Language language,int size) { - this.size = size; this.availableCount = new AtomicInteger(size); for (int i=0;i getAllOptions() { + return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG()); + } +} + +class OptionL extends CommandLineOption { + public OptionL() { + super('l'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String language = commandLine.getOptionValue("l"); + params.setLanguagefromArg(language); + } +} + +class OptionF extends CommandLineOption { + public OptionF() { + super('f'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String scriptFilename = commandLine.getOptionValue("f"); + params.setScriptFileName(scriptFilename); + } +} + +class OptionS extends CommandLineOption { + public OptionS() { + super('s'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer sleepTime = 
Integer.parseInt(commandLine.getOptionValue("s")); + params.setSleepTime(sleepTime); + } +} + +class OptionT extends CommandLineOption { + public OptionT() { + super('t'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); + params.setThread(thread); + } +} + +class OptionG extends CommandLineOption { + public OptionG() { + super('g'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + ConfigLogger.configLogger(commandLine.getOptionValue("g")); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java new file mode 100644 index 000000000..9e81ea6c7 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.scripts.config; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.core.Logger; +import org.slf4j.LoggerFactory; + +public class ConfigLogger { + /** + * Log the config parameter. 
If the counter is less than the number of available + * options then it means that the user entered an option + * + * @param value The config string + */ + public static void configLogger(String value) { + List> options = List.of( + Pair.of("debug", Level.DEBUG), + Pair.of("info", Level.INFO), + Pair.of("warn", Level.WARN), + Pair.of("trace", Level.TRACE), + Pair.of("off", Level.OFF), + Pair.of("error", Level.ERROR)); + Pair option = options.get(0); + int i = 1; + while (i < options.size() && !option.getLeft().equalsIgnoreCase(value)) + option = options.get(i++); + if (i < options.size()) { + Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); + rootLogger.setLevel(option.getRight()); + } + } +} diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java new file mode 100644 index 000000000..b3a3209a5 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.jruby.RubyHash; + +import us.codecraft.webmagic.Page; + +public class JRuby extends Language { + public JRuby() { + super("jruby","ruby/defines.rb",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext()); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry) itruby.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java 
b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java new file mode 100644 index 000000000..b0f7b647a --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import us.codecraft.webmagic.Page; + +public class Javascript extends Language { + public Javascript() { + super("javascript","js/defines.js",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java new file mode 100644 index 000000000..9124d2dbb --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.python.core.PyDictionary; + +import us.codecraft.webmagic.Page; + +public class Jython extends Language { + public Jython() { + super("jython","python/defines.py",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + PyDictionary oJython = (PyDictionary) engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git 
a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java old mode 100755 new mode 100644 similarity index 51% rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java index 2f9d22d57..44e6ba0a0 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java @@ -1,15 +1,18 @@ -package us.codecraft.webmagic.scripts; +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; +import us.codecraft.webmagic.Page; /** - * @author code4crafter@gmail.com + * @author FrancoisGib */ -public enum Language { - - JavaScript("javascript","js/defines.js",""), - - JRuby("jruby","ruby/defines.rb",""), - - Jython("jython","python/defines.py",""); +public abstract class Language { + public Language(String engineName, String defineFile, String gatherFile) { + this.engineName = engineName; + this.defineFile = defineFile; + this.gatherFile = gatherFile; + } private String engineName; @@ -17,12 +20,6 @@ public enum Language { private String gatherFile; - Language(String engineName, String defineFile, String gatherFile) { - this.engineName = engineName; - this.defineFile = defineFile; - this.gatherFile = gatherFile; - } - public String getEngineName() { return engineName; } @@ -34,4 +31,6 @@ public String getDefineFile() { public String getGatherFile() { return gatherFile; } + + public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException; } diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java index 
ffeb9c993..b4c28521f 100755 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -2,7 +2,11 @@ import org.junit.Ignore; import org.junit.Test; + import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Jython; /** * @author code4crafter@gmail.com @@ -13,14 +17,14 @@ public class ScriptProcessorTest { @Test public void testJavaScriptProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @Test public void testRubyProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @@ -28,7 +32,7 @@ public void testRubyProcessor() { @Test public void testPythonProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build(); pageProcessor.getSite().setSleepTime(0); 
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } From b7e0d360ec68ab5c91e28e2d95e3cdb04670a211 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 22:04:48 +0800 Subject: [PATCH 209/257] Upgrade junit from 4.13.2 to 5.10.2. --- pom.xml | 49 ++++++++++++++++++++++++++++++++------ webmagic-core/pom.xml | 5 ---- webmagic-extension/pom.xml | 4 ---- webmagic-samples/pom.xml | 4 ---- webmagic-saxon/pom.xml | 4 ---- webmagic-scripts/pom.xml | 5 ---- webmagic-selenium/pom.xml | 4 ---- 7 files changed, 42 insertions(+), 33 deletions(-) diff --git a/pom.xml b/pom.xml index 96bf09ae2..f08b3b543 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,8 @@ 3.7.1 9.3.9.0 2.9.0 - 4.13.2 + 5.10.2 + 1.10.2 2.7.3 2.23.1 2.0.2-beta @@ -88,16 +89,30 @@ log4j-slf4j2-impl test + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.platform + junit-platform-launcher + test + + + org.junit.platform + junit-platform-runner + test + - - junit - junit - ${junit.version} - test - org.mockito mockito-all @@ -134,6 +149,26 @@ json-path ${json-path.version} + + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + + + org.junit.vintage + junit-vintage-engine + ${junit.version} + + + org.junit.platform + junit-platform-launcher + ${junit.platform.version} + + + org.junit.platform + junit-platform-runner + ${junit.platform.version} + org.slf4j slf4j-api diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 877124fc3..9f2eda76c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -15,11 +15,6 @@ httpclient - - junit - junit - - org.apache.commons commons-lang3 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 8d2c07003..b72922317 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -35,10 +35,6 @@ webmagic-core ${project.version} - - junit - junit - diff --git a/webmagic-samples/pom.xml 
b/webmagic-samples/pom.xml index 9de8bcb4c..41a4b7b45 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -20,10 +20,6 @@ webmagic-extension ${project.version} - - junit - junit - org.mapdb mapdb diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 28b921093..930f5b32c 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -23,10 +23,6 @@ net.sf.saxon Saxon-HE - - junit - junit - diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index aa5a47981..676ffd1a0 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -38,11 +38,6 @@ commons-cli commons-cli - - junit - junit - test - ${project.groupId} webmagic-core diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 87de28eee..86b65daf9 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -23,10 +23,6 @@ com.github.detro phantomjsdriver - - junit - junit - From dba166830625fa69ae9817ec6409e22a24a83a03 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 23:23:32 +0800 Subject: [PATCH 210/257] Add tests to test the equals & hashCode of Proxy. 
--- .../codecraft/webmagic/proxy/ProxyTest.java | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 8e4c82026..cff25b0ec 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -8,18 +8,18 @@ import java.util.List; import org.apache.http.HttpHost; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; /** * @author yxssfxwzy@sina.com May 30, 2014 - * + * */ public class ProxyTest { private static List httpProxyList = new ArrayList(); - @BeforeClass + @BeforeAll public static void before() { // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", // "0.0.0.4:0" }; @@ -48,7 +48,7 @@ public void run() { } @Test - public void testCreate() { + void testCreate() { Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080")); assertNull(proxy.getScheme()); assertNull(proxy.getUsername()); @@ -86,7 +86,15 @@ public void testCreate() { } @Test - public void testToString() { + void testEqualsHashCode() { + var proxy0 = new Proxy("::1", 1080); + var proxy1 = new Proxy("::1", 1080); + assertEquals(proxy0, proxy1); + assertEquals(proxy0.hashCode(), proxy1.hashCode()); + } + + @Test + void testToString() { assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); From 5196a56ccf7aee374b44a02a1e9a414496431938 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 23:30:43 +0800 Subject: [PATCH 211/257] Format code. 
--- pom.xml | 7 +++- webmagic-core/pom.xml | 7 +++- webmagic-coverage/pom.xml | 6 ++- webmagic-extension/pom.xml | 7 +++- webmagic-samples/pom.xml | 7 +++- webmagic-saxon/pom.xml | 7 +++- webmagic-scripts/pom.xml | 7 +++- webmagic-selenium/pom.xml | 75 ++++++++++++++++++++------------------ 8 files changed, 80 insertions(+), 43 deletions(-) diff --git a/pom.xml b/pom.xml index f08b3b543..4ec241db7 100644 --- a/pom.xml +++ b/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 9f2eda76c..f6530b467 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index c17309c87..c53a30c28 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -1,7 +1,9 @@ - 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index b72922317..9290c18fc 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 41a4b7b45..3eff105e1 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 930f5b32c..b528d8ae6 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 676ffd1a0..86e36c7da 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -1,5 +1,10 @@ - + us.codecraft webmagic diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 86b65daf9..831cfecf8 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -1,41 +1,46 @@ - - - us.codecraft - webmagic - 1.0.0-SNAPSHOT - - 4.0.0 + + + us.codecraft + webmagic + 1.0.0-SNAPSHOT + + 4.0.0 - 
webmagic-selenium + webmagic-selenium - - - org.seleniumhq.selenium - selenium-java - - - ${project.groupId} - webmagic-core - ${project.version} - - - com.github.detro - phantomjsdriver - - + + + org.seleniumhq.selenium + selenium-java + + + ${project.groupId} + webmagic-core + ${project.version} + + + com.github.detro + phantomjsdriver + + - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - true - - - - + + + + org.apache.maven.plugins + maven-deploy-plugin + 3.0.0-M1 + + true + + + + From 9ab342c3a782db8ad95e8e3ce1cff2cb4d8b158d Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 23:31:21 +0800 Subject: [PATCH 212/257] Remove public modifiers from junit5 test methods. --- .../src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index cff25b0ec..61fc6ab8b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -15,12 +15,12 @@ * @author yxssfxwzy@sina.com May 30, 2014 * */ -public class ProxyTest { +class ProxyTest { private static List httpProxyList = new ArrayList(); @BeforeAll - public static void before() { + static void before() { // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", // "0.0.0.4:0" }; String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; From 5344db0106b80568b1b4bee26af8f9dcce2f521f Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 21 Apr 2024 23:35:26 +0800 Subject: [PATCH 213/257] Upgrade jacoco-maven-plugin from 0.8.8 to 0.8.12. 
--- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 4ec241db7..459930adc 100644 --- a/pom.xml +++ b/pom.xml @@ -468,7 +468,7 @@ org.jacoco jacoco-maven-plugin - 0.8.8 + 0.8.12 com.amashchenko.maven.plugin From e34b495625766b66de6ce954f3a38c9efc170027 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 22 Apr 2024 00:17:58 +0800 Subject: [PATCH 214/257] Upgrade maven-suirefire-plugin from 3.0.0-M7 to 3.2.5. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 459930adc..d629db370 100644 --- a/pom.xml +++ b/pom.xml @@ -453,7 +453,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.0.0-M7 + 3.2.5 org.apache.maven.plugins From a5144350bddea084e5e88bbac2a71642f224e0ff Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 22 Apr 2024 00:45:54 +0800 Subject: [PATCH 215/257] Upgrade maven plugins to latest versions. --- pom.xml | 55 +++++++++++++++++++++++++++--------------- webmagic-saxon/pom.xml | 17 +++---------- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/pom.xml b/pom.xml index d629db370..47f8c3546 100644 --- a/pom.xml +++ b/pom.xml @@ -275,7 +275,6 @@ org.apache.maven.plugins maven-enforcer-plugin - 3.1.0 enforce-maven @@ -285,7 +284,7 @@ - 3.5.0 + 3.6.3 @@ -331,7 +330,6 @@ org.apache.maven.plugins maven-source-plugin - 3.2.1 attach-sources @@ -344,9 +342,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.4.1 - UTF-8 WebMagic ${project.version} en_US @@ -373,7 +369,6 @@ org.apache.maven.plugins maven-release-plugin - 3.0.0-M6 org.jacoco @@ -408,47 +403,67 @@ org.apache.maven.plugins maven-clean-plugin - 3.2.0 + 3.3.2 org.apache.maven.plugins maven-compiler-plugin - 3.10.1 + 3.13.0 org.apache.maven.plugins maven-deploy-plugin - 3.0.0 + 3.1.1 + + + org.apache.maven.plugins + maven-enforcer-plugin + 3.4.1 org.apache.maven.plugins maven-install-plugin - 3.0.1 + 3.1.1 org.apache.maven.plugins maven-jar-plugin - 3.3.0 + 3.4.1 + + + 
org.apache.maven.plugins + maven-javadoc-plugin + 3.6.3 org.apache.maven.plugins maven-jxr-plugin - 3.3.0 + 3.3.2 org.apache.maven.plugins maven-pmd-plugin - 3.19.0 + 3.21.2 + + + org.apache.maven.plugins + maven-release-plugin + 3.0.1 org.apache.maven.plugins maven-resources-plugin - 3.3.0 + 3.3.1 org.apache.maven.plugins maven-site-plugin - 4.0.0-M3 + 4.0.0-M13 + + + org.apache.maven.plugins + maven-source-plugin + 3.3.0 org.apache.maven.plugins @@ -458,7 +473,7 @@ org.apache.maven.plugins maven-surefire-report-plugin - 3.0.0-M7 + 3.2.5 org.codehaus.mojo @@ -473,12 +488,12 @@ com.amashchenko.maven.plugin gitflow-maven-plugin - 1.18.0 + 1.21.0 com.github.spotbugs spotbugs-maven-plugin - 4.7.2.0 + 4.8.4.0 @@ -525,7 +540,7 @@ org.apache.maven.plugins maven-source-plugin - 3.2.1 + 3.3.0 package @@ -553,7 +568,7 @@ org.apache.maven.plugins maven-gpg-plugin - 3.0.1 + 3.2.4 verify diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index b528d8ae6..2c5bc9597 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -14,6 +14,10 @@ webmagic-saxon + + true + + ${project.groupId} @@ -30,17 +34,4 @@ - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - true - - - - - From 76ef0332f1a29361e77458338e29db10506a48a7 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 20:06:47 +0800 Subject: [PATCH 216/257] Update versions for hotfix --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 2fb56e7be..15722ec32 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.10.0 + 0.10.1 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 021a83f3e..dcf2e4c49 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ 
us.codecraft webmagic-parent - 0.10.0 + 0.10.1 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 4109c49fc..3e441ecd9 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.10.0 + 0.10.1 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index b47ae3614..d99bf8264 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.0 + 0.10.1 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 906606fba..bae376ef0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 4a2b358d0..9709a04fc 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 92914655a..2939b824c 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 5c2e50b2a..cd1213046 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.0 + 0.10.1 4.0.0 From a0ff4a2d2e3834e68ca399c0621b15bc798b4f83 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 20:09:15 +0800 Subject: [PATCH 217/257] Fix log message. 
--- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 80e7b72c9..05d6e1a2b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -85,13 +85,13 @@ public Page download(Request request, Task task) { page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(page, task); - logger.info("downloading page success {}", request.getUrl()); + logger.info("Download page success: {}", request.getUrl()); return page; } catch (IOException e) { onError(page, task, e); - logger.info("download page {} error", request.getUrl(), e); + logger.info("Download page error: {}", request.getUrl(), e); return page; } finally { From 5afe3e7b81b905386f622d07b3fbe1a276da7a71 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 20:21:45 +0800 Subject: [PATCH 218/257] Revert "fix: pom.xml to reduce vulnerabilities (#1134)" This reverts commit 19288e9c11551e7b0e3a2533183942405f0fa521. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 15722ec32..eecbdb9ee 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,7 @@ 1.3.0 1.2.0 11.4 - 4.14.1 + 3.141.59 2.0.4 4.0.0.RELEASE 0.3.5 From 78740a4e283bdbe8d5f237d32f7b5a35eb634d9d Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 20:26:38 +0800 Subject: [PATCH 219/257] Fix NPE. 
--- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 05d6e1a2b..31eebc720 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -7,6 +7,7 @@ import java.util.Optional; import org.apache.commons.io.IOUtils; +import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; @@ -111,7 +112,8 @@ public void setThread(int thread) { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); + HttpEntity entity = httpResponse.getEntity(); + byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];; String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); From 884f51ba3bf336cc79b1487ca4faef644fe4bd76 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 20:38:03 +0800 Subject: [PATCH 220/257] Update to hotfix version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 47f8c3546..a0b38ce15 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index f6530b467..2dad0a0a9 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index c53a30c28..3d42cd618 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 9290c18fc..76eeec0fe 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 3eff105e1..d5849aecf 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 2c5bc9597..5cb3b473c 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git 
a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 86e36c7da..14d92f078 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 831cfecf8..22239c3ae 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 0.10.1 4.0.0 From bda51537b6e3ce4e64c7b993dd35f40931ebc4e2 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 21:32:04 +0800 Subject: [PATCH 221/257] Update versions for hotfix --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index eecbdb9ee..0eaf2f761 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.10.1 + 0.10.2 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index dcf2e4c49..8803b6ede 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 3e441ecd9..8754c4782 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.10.1 + 0.10.2 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index d99bf8264..03ef006d5 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index bae376ef0..0f53566ac 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1 + 0.10.2 
4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 9709a04fc..4b72dbdda 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 2939b824c..aa11e1c6f 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index cd1213046..e53dbee59 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.1 + 0.10.2 4.0.0 From 0dbfaf7284b8419d0a30bff839ecf6b14bde6d52 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 21:34:59 +0800 Subject: [PATCH 222/257] Remove useless log. --- .../webmagic/downloader/HttpClientDownloader.java | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 31eebc720..d93fc03c0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -12,8 +12,6 @@ import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; @@ -33,8 +31,6 @@ */ public class HttpClientDownloader extends AbstractDownloader { - private Logger logger = LoggerFactory.getLogger(getClass()); - private final Map httpClients = new HashMap(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); @@ -84,16 +80,10 @@ public Page 
download(Request request, Task task) { try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); - onSuccess(page, task); - logger.info("Download page success: {}", request.getUrl()); - return page; } catch (IOException e) { - onError(page, task, e); - logger.info("Download page error: {}", request.getUrl(), e); - return page; } finally { if (httpResponse != null) { @@ -138,7 +128,6 @@ private String getHtmlCharset(String contentType, byte[] contentBytes, Task task String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); - logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset()); } return charset; } From a81c4e7627853623c8e1661f7fd1c7e47e8321bf Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 21:46:35 +0800 Subject: [PATCH 223/257] Update to hotfix version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index a0b38ce15..e4d5607c4 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.10.1 + 0.10.2 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 2dad0a0a9..13afbf7e1 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 3d42cd618..d928636d3 100644 --- 
a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 76eeec0fe..18ce75e8b 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 035ec7226..00d810c99 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 5cb3b473c..743327fc5 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 14d92f078..d69164b54 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 22239c3ae..d40ebd2db 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.1 + 0.10.2 4.0.0 From 54aef0f0e032176aa5c081ec35af7e75d5e63057 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 23:34:15 +0800 Subject: [PATCH 224/257] Update versions for hotfix --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 0eaf2f761..df314b7ce 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.10.2 + 0.10.3 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 8803b6ede..20a942791 100644 --- a/webmagic-core/pom.xml +++ 
b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 8754c4782..46b66f328 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.10.2 + 0.10.3 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 03ef006d5..dd72ccbd9 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 0f53566ac..3f191c7b7 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 4b72dbdda..cbd1621ca 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index aa11e1c6f..568c8d0b8 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index e53dbee59..a7a9179bc 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.10.2 + 0.10.3 4.0.0 From 462c60fef2e4f3c97f2f55415d76ce035a6478fc Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 23:37:46 +0800 Subject: [PATCH 225/257] Fix for entity is null. 
--- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index d93fc03c0..39deecc73 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -103,8 +103,8 @@ public void setThread(int thread) { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { HttpEntity entity = httpResponse.getEntity(); - byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];; - String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); + byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0]; + String contentType = entity != null && entity.getContentType() != null ? 
entity.getContentType().getValue() : null; Page page = new Page(); page.setBytes(bytes); if (!request.isBinaryContent()) { From 38f240c42e341da0a11ce2c04f35cba7f654e142 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 23 Apr 2024 23:39:23 +0800 Subject: [PATCH 226/257] Update to hotfix version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index e4d5607c4..1b2aabb17 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.10.2 + 0.10.3 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 13afbf7e1..3c9ca0078 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index d928636d3..0c09d4047 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 18ce75e8b..bcf473be1 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 00d810c99..eb2ed69cd 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 743327fc5..fd993e09c 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-scripts/pom.xml 
b/webmagic-scripts/pom.xml index d69164b54..e31d57218 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index d40ebd2db..3f8aa6951 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.2 + 0.10.3 4.0.0 From 16a4fe3e28af963a9ce61bda14d2497bf914191e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 17 May 2024 13:17:13 +0800 Subject: [PATCH 227/257] Use oxerr-parent instead. --- pom.xml | 283 +----------------- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- .../ConfigurablePageProcessorTest.java | 1 - .../model/ModelPageProcessorTest.java | 1 - webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 10 files changed, 14 insertions(+), 285 deletions(-) diff --git a/pom.xml b/pom.xml index 1b2aabb17..eee06779e 100644 --- a/pom.xml +++ b/pom.xml @@ -5,9 +5,14 @@ xsi:schemaLocation=" http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - us.codecraft - 0.10.3 4.0.0 + + org.oxerr + oxerr-parent + 2.1.0 + + us.codecraft + 1.0.0-SNAPSHOT pom UTF-8 @@ -272,73 +277,6 @@ - - org.apache.maven.plugins - maven-enforcer-plugin - - - enforce-maven - - enforce - - - - - 3.6.3 - - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - - - - - - - - - - - - - - - - - - - - - - org.apache.maven.plugins - maven-resources-plugin - - - org.apache.maven.plugins - maven-jar-plugin - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - org.apache.maven.plugins maven-javadoc-plugin @@ -366,10 +304,6 @@ - - org.apache.maven.plugins - maven-release-plugin - org.jacoco jacoco-maven-plugin @@ -398,209 +332,6 @@ - - - - org.apache.maven.plugins - 
maven-clean-plugin - 3.3.2 - - - org.apache.maven.plugins - maven-compiler-plugin - 3.13.0 - - - org.apache.maven.plugins - maven-deploy-plugin - 3.1.1 - - - org.apache.maven.plugins - maven-enforcer-plugin - 3.4.1 - - - org.apache.maven.plugins - maven-install-plugin - 3.1.1 - - - org.apache.maven.plugins - maven-jar-plugin - 3.4.1 - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.6.3 - - - org.apache.maven.plugins - maven-jxr-plugin - 3.3.2 - - - org.apache.maven.plugins - maven-pmd-plugin - 3.21.2 - - - org.apache.maven.plugins - maven-release-plugin - 3.0.1 - - - org.apache.maven.plugins - maven-resources-plugin - 3.3.1 - - - org.apache.maven.plugins - maven-site-plugin - 4.0.0-M13 - - - org.apache.maven.plugins - maven-source-plugin - 3.3.0 - - - org.apache.maven.plugins - maven-surefire-plugin - 3.2.5 - - - org.apache.maven.plugins - maven-surefire-report-plugin - 3.2.5 - - - org.codehaus.mojo - taglist-maven-plugin - 3.0.0 - - - org.jacoco - jacoco-maven-plugin - 0.8.12 - - - com.amashchenko.maven.plugin - gitflow-maven-plugin - 1.21.0 - - - com.github.spotbugs - spotbugs-maven-plugin - 4.8.4.0 - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - none - - - - org.apache.maven.plugins - maven-jxr-plugin - - - org.apache.maven.plugins - maven-pmd-plugin - - - org.apache.maven.plugins - maven-surefire-report-plugin - - - org.codehaus.mojo - taglist-maven-plugin - - - com.github.spotbugs - spotbugs-maven-plugin - - - - - - - release - - - - - org.apache.maven.plugins - maven-source-plugin - 3.3.0 - - - package - - jar-no-fork - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.4.1 - - - package - - jar - - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 3.2.4 - - - verify - - sign - - - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.13 - true - - sonatype-nexus-staging - https://oss.sonatype.org/ - true - - - - - - - sonatype-nexus-snapshots - https://oss.sonatype.org/content/repositories/snapshots/ - - - 
sonatype-nexus-staging - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - - - diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 3c9ca0078..f6530b467 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 0c09d4047..c53a30c28 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index bcf473be1..9290c18fc 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java index 63c40d295..c2081dbf3 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java @@ -13,7 +13,6 @@ /** * @author code4crafter@gmail.com - * @date 14-4-5 */ public class ConfigurablePageProcessorTest { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java index 627fa6e84..1014a45f5 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -12,7 +12,6 @@ /** * @author code4crafter@gmail.com - * @date 14-4-4 */ public class ModelPageProcessorTest { diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 
eb2ed69cd..f1da70165 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index fd993e09c..2c5bc9597 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index e31d57218..86e36c7da 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 3f8aa6951..831cfecf8 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 0.10.3 + 1.0.0-SNAPSHOT 4.0.0 From 7d2d2244b3f5c830f1e9258f28ab669e3596eaa2 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 21 May 2024 12:55:05 +0800 Subject: [PATCH 228/257] Upgrade oxerr-parent from 2.1.0 to 2.2.1. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index eee06779e..333cf41d6 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.oxerr oxerr-parent - 2.1.0 + 2.2.1 us.codecraft 1.0.0-SNAPSHOT From d2aebc60a7cb72fbd8107c844983e24543e106e4 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 4 Jun 2024 00:57:28 +0800 Subject: [PATCH 229/257] Make getCharset to support null parameter. 
--- .../src/main/java/us/codecraft/webmagic/utils/UrlUtils.java | 4 ++++ .../test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index c61483a39..ea317c405 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -116,6 +116,10 @@ public static List convertToUrls(Collection requests) { private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE); public static String getCharset(String contentType) { + if (contentType == null) { + return null; + } + Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { String charset = matcher.group(1); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 6afdeefe4..38c8295bb 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.utils; +import static org.junit.Assert.assertNull; + import org.junit.Assert; import org.junit.Test; @@ -43,5 +45,9 @@ public void testGetDomain(){ Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url)); } + @Test + public void testGetCharset() { + assertNull(UrlUtils.getCharset(null)); + } } From 5c43e361188fb23f36b1edce9845e10f9386c993 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 4 Jun 2024 00:59:30 +0800 Subject: [PATCH 230/257] Make sure the contentType of detectCharset could be null. 
--- .../webmagic/utils/CharsetUtilsTest.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java new file mode 100644 index 000000000..987a6f77a --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.utils; + +import static org.junit.jupiter.api.Assertions.assertNull; + +import java.io.IOException; + +import org.junit.jupiter.api.Test; + +class CharsetUtilsTest { + + @Test + void testDetectCharset() throws IOException { + assertNull(CharsetUtils.detectCharset(null, new byte[0])); + } + +} From 49a5efff46ec604578d6cb98015a8700bdf1fa21 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 4 Jun 2024 01:02:45 +0800 Subject: [PATCH 231/257] Add a private constructor to hide the implicit public one. 
--- .../main/java/us/codecraft/webmagic/utils/CharsetUtils.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index ccf00a466..63bb4c110 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -21,6 +21,10 @@ public abstract class CharsetUtils { private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); + private CharsetUtils() { + throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!"); + } + public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { String charset; // charset From 4d0cdb011fc42251c2476bb5f44379d588ae65f5 Mon Sep 17 00:00:00 2001 From: Niu_XZ Date: Mon, 17 Jun 2024 17:27:28 +0800 Subject: [PATCH 232/257] =?UTF-8?q?stopWhenComplete=EF=BC=8C=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E5=8A=A8=E6=80=81=E4=BF=AE=E6=94=B9=E5=AE=8C=E6=88=90?= =?UTF-8?q?=E6=97=B6=E5=81=9C=E6=AD=A2=E6=96=B9=E6=B3=95=E3=80=82=20(#1169?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: niuxiaozu --- .../src/main/java/us/codecraft/webmagic/Spider.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 11a671f7a..a35af70af 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -85,7 +85,7 @@ public class Spider implements Runnable, Task { protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - protected boolean exitWhenComplete = true; + protected volatile boolean exitWhenComplete = true; protected final static int 
STAT_INIT = 0; @@ -598,6 +598,13 @@ public void stop() { } } + /** + * Stop when all tasks in the queue are completed and all worker threads are also completed + */ + public void stopWhenComplete(){ + this.exitWhenComplete = true; + } + /** * start with more than one threads * From 3e9cd9b5c35a6acf05868cca78caf68f1aec6a40 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Fri, 5 Jul 2024 00:20:28 +0800 Subject: [PATCH 233/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 333cf41d6..b96c9a829 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.0-SNAPSHOT + 1.0.0 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index f6530b467..6e1d3c896 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index c53a30c28..19cdc33d7 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 9290c18fc..15f94cf5e 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index f1da70165..921161362 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 2c5bc9597..2530bd81d 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ 
-8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 86e36c7da..3c03aaf8e 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 831cfecf8..a0dc13861 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0-SNAPSHOT + 1.0.0 4.0.0 From 25c2d95e961ccb686f5286a1aa603d511ad93b55 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Fri, 5 Jul 2024 00:27:54 +0800 Subject: [PATCH 234/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index b96c9a829..696839f2f 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.0 + 1.0.1-SNAPSHOT pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 6e1d3c896..4299d4b3b 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 19cdc33d7..e179e2a37 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 15f94cf5e..c76263a05 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 921161362..d52f78304 100644 --- 
a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 2530bd81d..b7682bf7d 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3c03aaf8e..131ad5ef2 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index a0dc13861..f84c97997 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 From 2c135dadce1fcb102084ca222da4d9ade0e3b541 Mon Sep 17 00:00:00 2001 From: xiezcGitHub <765150816@qq.com> Date: Tue, 6 Aug 2024 19:29:41 +0800 Subject: [PATCH 235/257] =?UTF-8?q?#1172=20=E9=97=AE=E9=A2=98=E7=9A=84?= =?UTF-8?q?=E8=A7=A3=E5=86=B3=20(#1173)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: xiezhicheng --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a35af70af..e47a61f22 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -187,7 +187,7 @@ public Spider scheduler(Scheduler scheduler) { */ public Spider setScheduler(Scheduler updateScheduler) { checkIfRunning(); - SpiderScheduler oldScheduler = this.scheduler; + Scheduler oldScheduler = scheduler.getScheduler(); scheduler.setScheduler(updateScheduler); if (oldScheduler != null) { Request request; From 15ec80fcf1b8327b7bc780409aeab03f198384b9 Mon 
Sep 17 00:00:00 2001 From: xiezcGitHub <765150816@qq.com> Date: Mon, 19 Aug 2024 13:05:28 +0800 Subject: [PATCH 236/257] =?UTF-8?q?FileCacheQueueScheduler=E4=BD=BF?= =?UTF-8?q?=E7=94=A8BloomFilter=E8=BF=9B=E8=A1=8C=E5=8E=BB=E9=87=8D=20(#11?= =?UTF-8?q?76)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: xiezc --- .../java/us/codecraft/webmagic/Spider.java | 1 - .../scheduler/FileCacheQueueScheduler.java | 103 +++++------------- 2 files changed, 29 insertions(+), 75 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index e47a61f22..a71166421 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -458,7 +458,6 @@ private void onDownloadSuccess(Request request, Page page) { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void onDownloaderFail(Request request) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index fec3c1db9..0dabdd954 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,29 +1,13 @@ package us.codecraft.webmagic.scheduler; -import java.io.BufferedReader; -import java.io.Closeable; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.LinkedHashSet; -import java.util.Set; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.Executors; -import 
java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.math.NumberUtils; - import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.io.*; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; /** @@ -32,7 +16,7 @@ * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler,Closeable { +public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, Closeable { private String filePath = System.getProperty("java.io.tmpdir"); @@ -52,8 +36,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement private BlockingQueue queue; - private Set urls; - private ScheduledExecutorService flushThreadPool; public FileCacheQueueScheduler(String filePath) { @@ -83,36 +65,13 @@ private void init(Task task) { } private void initDuplicateRemover() { - setDuplicateRemover( - new DuplicateRemover() { - @Override - public boolean isDuplicate(Request request, Task task) { - if (!inited.get()) { - init(task); - } - return !urls.add(request.getUrl()); - } - - @Override - public void resetDuplicateCheck(Task task) { - urls.clear(); - } - - @Override - public int getTotalRequestsCount(Task task) { - return urls.size(); - } - }); + BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(this.filePath.hashCode()); + setDuplicateRemover(bloomFilterDuplicateRemover); } private void initFlushThread() { - flushThreadPool = Executors.newScheduledThreadPool(1); - flushThreadPool.scheduleAtFixedRate(new Runnable() { - @Override - public void run() { - flush(); - } - }, 10, 10, TimeUnit.SECONDS); + flushThreadPool = Executors.newScheduledThreadPool(1); + flushThreadPool.scheduleAtFixedRate(this::flush, 10, 10, TimeUnit.SECONDS); } private void initWriter() { @@ -127,7 +86,6 @@ private void initWriter() { private void readFile() { try { queue = new LinkedBlockingQueue(); - urls = new LinkedHashSet(); readCursorFile(); readUrlFile(); // initDuplicateRemover(); @@ -140,46 +98,43 @@ private void readFile() { } private void readUrlFile() throws IOException { - String line; - BufferedReader fileUrlReader = null; - try { - fileUrlReader = new 
BufferedReader(new FileReader(getFileName(fileUrlAllName))); + try (BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)))) { + String line; int lineReaded = 0; while ((line = fileUrlReader.readLine()) != null) { - urls.add(line.trim()); + Request request = deserializeRequest(line); + this.getDuplicateRemover().isDuplicate(request, null); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(deserializeRequest(line)); + queue.add(request); } } - } finally { - if (fileUrlReader != null) { - IOUtils.closeQuietly(fileUrlReader); - } } } private void readCursorFile() throws IOException { - BufferedReader fileCursorReader = null; - try { - fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); + String fileName = getFileName(fileCursor); + try (BufferedReader fileCursorReader = new BufferedReader(new FileReader(fileName))) { String line; + String lastLine = null; //read the last number while ((line = fileCursorReader.readLine()) != null) { - cursor = new AtomicInteger(NumberUtils.toInt(line)); + line = line.trim(); + if (!line.isEmpty()) { + lastLine = line; + } } - } finally { - if (fileCursorReader != null) { - IOUtils.closeQuietly(fileCursorReader); + if (lastLine != null) { + cursor.set(NumberUtils.toInt(line)); } } } - + public void close() throws IOException { - flushThreadPool.shutdown(); - fileUrlWriter.close(); - fileCursorWriter.close(); - } + flushThreadPool.shutdown(); + fileUrlWriter.close(); + fileCursorWriter.close(); + } private String getFileName(String filename) { return filePath + task.getUUID() + filename; From 19f60bf34055f52086ee9f8e4c88a537e7e25a8b Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Sat, 5 Oct 2024 10:42:57 +0800 Subject: [PATCH 237/257] fix: pom.xml to reduce vulnerabilities (#1178) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMMONSIO-8161190 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/pom.xml b/pom.xml index b96c9a829..3e11396e0 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ 3.23.1 1.5.0 4.4 - 2.11.0 + 2.14.0 3.12.0 2.0.19.graal 3.0.13 From 541ced9eeaa55d14d2f9496b741d08a9ea42cb9a Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 9 Oct 2024 23:36:02 +0800 Subject: [PATCH 238/257] Change the default status code from 200 to 0 & downloadSuccess from true to false, for Page. --- webmagic-core/src/main/java/us/codecraft/webmagic/Page.java | 5 ++--- .../codecraft/webmagic/downloader/MockGithubDownloader.java | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index b4c161a9a..e8c75ccf1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -4,7 +4,6 @@ import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; -import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; @@ -43,9 +42,9 @@ public class Page { private Map> headers; - private int statusCode = HttpConstant.StatusCode.CODE_200; + private int statusCode; - private boolean downloadSuccess = true; + private boolean downloadSuccess; private byte[] bytes; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 91e3698cf..bb18aa2c5 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -938,6 +938,7 @@ public Page download(Request request, Task task) { Page page = new Page(); page.setRawText(html); 
page.setStatusCode(200); + page.setDownloadSuccess(true); page.setRequest(new Request("https://github.com/code4craft/webmagic")); page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); return page; From 50026ff937a5af26179ee4daab8ab93d541e38ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 9 Oct 2024 23:38:56 +0800 Subject: [PATCH 239/257] Bump commons-io:commons-io from 2.11.0 to 2.14.0 (#1179) Bumps commons-io:commons-io from 2.11.0 to 2.14.0. --- updated-dependencies: - dependency-name: commons-io:commons-io dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 696839f2f..8fd0dbf6b 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ 3.23.1 1.5.0 4.4 - 2.11.0 + 2.14.0 3.12.0 2.0.19.graal 3.0.13 From 6eab7a4155163f8b4a0dbbd2f69b8ce452bef500 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 26 Oct 2024 01:02:00 +0800 Subject: [PATCH 240/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 8fd0dbf6b..d0abd3568 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1
us.codecraft - 1.0.1-SNAPSHOT + 1.0.1 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4299d4b3b..52cd7ba2c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e179e2a37..98db3f826 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index c76263a05..1fe18e066 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index d52f78304..76105d330 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index b7682bf7d..c206d21a2 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 131ad5ef2..123ac6699 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index f84c97997..d09deef50 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 From 6ed83769e0a10fc6be02ba3b3371a88cf6be34ae Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 26 Oct 2024 01:37:09 +0800 Subject: [PATCH 241/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- 
webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index d0abd3568..9380a7eaa 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1
us.codecraft - 1.0.1 + 1.0.2-SNAPSHOT pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 52cd7ba2c..6e31559f2 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 98db3f826..93925ab3b 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 1fe18e066..b986a8e63 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 76105d330..a7d9b809d 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index c206d21a2..52b60685c 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 123ac6699..1d99229b0 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index d09deef50..04be9c20c 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 From 7e500d7b95972d062b4442f505c24dd06ca32f0c Mon Sep 17 00:00:00 2001 From: Bob Conan Date: Fri, 22 Nov 2024 20:24:58 -0600 Subject: [PATCH 242/257] Updated README.md, fix typo(s) (#1180) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 89536c927..2af81cb22 100644 
--- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf ### First crawler: -Write a class implements PageProcessor. For example, I wrote a crawler of github repository infomation. +Write a class implements PageProcessor. For example, I wrote a crawler of github repository information. ```java public class GithubRepoPageProcessor implements PageProcessor { @@ -112,7 +112,7 @@ public class GithubRepo { Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/) -The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) +The architecture of webmagic (referred to [Scrapy](http://scrapy.org/)) ![image](http://code4craft.github.io/images/posts/webmagic.png) From 0a9fe8d3e03e58c96e497efdc727ce6d09684229 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 27 Nov 2024 00:49:03 +0800 Subject: [PATCH 243/257] Add static methods to construct Page. --- .../main/java/us/codecraft/webmagic/Page.java | 37 +++++++++++++++++++ .../downloader/HttpClientDownloader.java | 5 +-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index e8c75ccf1..18486f7a9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -52,9 +52,44 @@ public class Page { private String charset; + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code true}, + * and {@link #request} is specified. + * + * @param request the request. + * @since 1.0.2 + */ + public static Page ofSuccess(Request request) { + return new Page(request, true); + } + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code true}, + * and {@link #request} is specified. + * + * @param request the request. 
+ * @since 1.0.2 + */ + public static Page ofFailure(Request request) { + return new Page(request, false); + } + public Page() { } + /** + * Constructs a {@link Page} with {@link #request} + * and {@link #downloadSuccess} specified. + * + * @param request the request. + * @param downloadSuccess the download success flag. + * @since 1.0.2 + */ + private Page(Request request, boolean downloadSuccess) { + this.request = request; + this.downloadSuccess = downloadSuccess; + } + /** * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}. * @@ -73,7 +108,9 @@ public static Page fail() { * @param request the {@link Request}. * @return the page. * @since 0.10.0 + * @deprecated Use {@link #ofFailure(Request)} instead. */ + @Deprecated(since = "1.0.2", forRemoval = true) public static Page fail(Request request){ Page page = new Page(); page.setRequest(request); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 39deecc73..789448f03 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -76,7 +76,7 @@ public Page download(Request request, Task task) { CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.fail(request); + Page page = Page.ofFailure(request); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? 
request.getCharset() : task.getSite().getCharset(), httpResponse, task); @@ -105,7 +105,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http HttpEntity entity = httpResponse.getEntity(); byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0]; String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null; - Page page = new Page(); + Page page = Page.ofSuccess(request); page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { @@ -117,7 +117,6 @@ protected Page handleResponse(Request request, String charset, HttpResponse http page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); - page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } From c20edb824645806cd02367fd3b517efacb3e44cf Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 27 Nov 2024 03:31:30 +0800 Subject: [PATCH 244/257] Polish code. --- .../us/codecraft/webmagic/downloader/HttpClientDownloader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 789448f03..6fdae38d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -76,13 +76,14 @@ public Page download(Request request, Task task) { CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? 
proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.ofFailure(request); + Page page = null; try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(page, task); return page; } catch (IOException e) { + page = Page.ofFailure(request); onError(page, task, e); return page; } finally { From bf1088bd67ade34b666860a7abc1c5c61886e36e Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 27 Nov 2024 04:16:05 +0800 Subject: [PATCH 245/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 9380a7eaa..af04c6917 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1
us.codecraft - 1.0.2-SNAPSHOT + 1.0.2 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 6e31559f2..f436bce26 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 93925ab3b..6265abae5 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index b986a8e63..a1c26d212 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a7d9b809d..2c2b34ef6 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 52b60685c..37349a419 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 1d99229b0..3093284c8 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 04be9c20c..b47f84a31 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 From 8dc417452a156e3e69ac80dc86dc9d8eccfa206d Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 27 Nov 2024 04:28:50 +0800 Subject: [PATCH 246/257] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- 
webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index af04c6917..1ade6a24a 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.2 + 1.0.3-SNAPSHOT pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index f436bce26..b70044853 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 6265abae5..935c4dfe4 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index a1c26d212..408116b86 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 2c2b34ef6..c424f568d 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 37349a419..888a673a8 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3093284c8..bca71ceac 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index b47f84a31..3b6bdc094 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3-SNAPSHOT 4.0.0 From 1cd199b160bfd24d81372897713048cfa6e55faf Mon Sep 17 00:00:00 2001 
From: Yihua Huang Date: Sat, 7 Dec 2024 13:13:37 +0800 Subject: [PATCH 247/257] fix: webmagic-scripts/pom.xml to reduce vulnerabilities (#1181) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGJETBRAINSKOTLIN-2393744 Co-authored-by: snyk-bot --- webmagic-scripts/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3093284c8..4b21d5e3c 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -14,7 +14,7 @@ webmagic-scripts - 1.6.0 + 2.1.0 From 9bb2417f58cc44e8cc220db7143215c0f8b64ebd Mon Sep 17 00:00:00 2001 From: zyw61483 Date: Wed, 11 Dec 2024 16:36:20 +0800 Subject: [PATCH 248/257] =?UTF-8?q?=E4=BF=AE=E6=94=B9SmartContentSelector?= =?UTF-8?q?=20threshold=E5=8F=AF=E5=AE=9A=E5=88=B6=E5=8C=96=20(#1183)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 修改SmartContentSelector threshold可定制化 * 修改SmartContentSelector threshold可定制化 --------- Co-authored-by: zhaoyiwei --- .../main/java/us/codecraft/webmagic/selector/HtmlNode.java | 5 +++++ .../java/us/codecraft/webmagic/selector/Selectors.java | 4 ++++ .../codecraft/webmagic/selector/SmartContentSelector.java | 7 ++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 85ff5fa69..74ea718e5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -31,6 +31,11 @@ public Selectable smartContent() { return select(smartContentSelector, getSourceTexts()); } + public Selectable smartContent(int threshold) { + SmartContentSelector smartContentSelector = Selectors.smartContent(threshold); + return select(smartContentSelector, getSourceTexts()); + } + @Override public Selectable 
links() { return selectElements(new LinksSelector()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 7cd68c1d6..3600896e2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -20,6 +20,10 @@ public static SmartContentSelector smartContent() { return new SmartContentSelector(); } + public static SmartContentSelector smartContent(int threshold) { + return new SmartContentSelector(threshold); + } + public static CssSelector $(String expr) { return new CssSelector(expr); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index ff8e26998..c8816510b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -16,9 +16,15 @@ @Experimental public class SmartContentSelector implements Selector { + private int threshold = 86; + public SmartContentSelector() { } + public SmartContentSelector(int threshold) { + this.threshold = threshold; + } + @Override public String select(String html) { html = html.replaceAll("(?is)", ""); @@ -29,7 +35,6 @@ public String select(String html) { html = html.replaceAll("(?is)<.*?>", ""); List lines; int blocksWidth =3; - int threshold =86; int start; int end; StringBuilder text = new StringBuilder(); From b1cf7ae4554e47b64b49b7e1827d731ea71a4cc8 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 18 Dec 2024 21:05:25 +0800 Subject: [PATCH 249/257] Set page status code for SeleniumDownloader. 
Fixes #1185 --- .../us/codecraft/webmagic/downloader/PhantomJSDownloader.java | 3 ++- .../webmagic/downloader/selenium/SeleniumDownloader.java | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 31dfca75a..01f1af9a3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.HttpConstant; import java.io.*; @@ -96,7 +97,7 @@ public Page download(Request request, Task task) { page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - page.setStatusCode(200); + page.setStatusCode(HttpConstant.StatusCode.CODE_200); } onSuccess(page, task); } catch (Exception e) { diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 874f8aef7..f6d2574fb 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -14,9 +14,11 @@ import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.HttpConstant; import java.io.Closeable; import java.io.IOException; +import java.net.http.HttpRequest; import java.util.Map; /** @@ -111,6 +113,7 @@ public Page download(Request request, Task task) { 
page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); + page.setStatusCode(HttpConstant.StatusCode.CODE_200); onSuccess(page, task); } catch (Exception e) { logger.warn("download page {} error", request.getUrl(), e); From 42a172729e87b220bb6b9e6454b2b9e4036a1a43 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 22 Dec 2024 16:48:59 +0800 Subject: [PATCH 250/257] Remove useless modifier. --- .../us/codecraft/webmagic/downloader/HttpClientGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 167a5e1c6..94b00cc73 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -36,7 +36,7 @@ */ public class HttpClientGenerator { - private transient Logger logger = LoggerFactory.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private PoolingHttpClientConnectionManager connectionManager; From 7efc9872df972511ba365d4a310e6cb450020a6e Mon Sep 17 00:00:00 2001 From: "Jason N. White" Date: Tue, 31 Dec 2024 12:10:32 -0600 Subject: [PATCH 251/257] Update LICENSE, fix license year (#1186) Signed-off-by: JasonnnW3000 --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 0cecd8527..37d7aa900 100644 --- a/LICENSE +++ b/LICENSE @@ -176,7 +176,7 @@ recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
- Copyright 2013 code4craft + Copyright 2025 code4craft Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From f4a8825bee8a7932b57c6a4d966fb5e0d32dc18a Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 10 Feb 2025 16:20:21 +0800 Subject: [PATCH 252/257] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 1ade6a24a..903ac48a9 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.3-SNAPSHOT + 1.0.3 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index b70044853..bad11de43 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.3-SNAPSHOT + 1.0.3 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 935c4dfe4..2b4a53460 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.3-SNAPSHOT + 1.0.3 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 408116b86..93faa4aaf 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.3-SNAPSHOT + 1.0.3 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c424f568d..50e79c73e 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.3-SNAPSHOT + 1.0.3 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 888a673a8..26d1989d6 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.3-SNAPSHOT + 1.0.3 4.0.0 diff --git 
a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index bca71ceac..13b0516df 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.3-SNAPSHOT + 1.0.3 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 3b6bdc094..16214c61a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.3-SNAPSHOT + 1.0.3 4.0.0 From f8a6b371f5821e419fb40d4ffd6a086987cbb0d8 Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Mon, 12 May 2025 08:08:05 +0800 Subject: [PATCH 253/257] fix: pom.xml to reduce vulnerabilities (#1189) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGJRUBY-10074039 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 903ac48a9..07dd24442 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ 4.5.13 4.4.15 3.7.1 - 9.3.9.0 + 9.4.12.1 2.9.0 5.10.2 1.10.2 From ceec88183bce0a9f9c9fbfc0cc32f7a075e5ea06 Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Fri, 18 Jul 2025 11:20:23 +0800 Subject: [PATCH 254/257] fix: pom.xml to reduce vulnerabilities (#1194) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGAPACHECOMMONS-10734078 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 07dd24442..67d4ccec8 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ 1.5.0 4.4 2.14.0 - 3.12.0 + 3.18.0 2.0.19.graal 3.0.13 32.0.0-jre From 926978c2e2acdf07e64af48d89db062382f2096a Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Sat, 23 Aug 2025 17:33:36 +0800 Subject: [PATCH 255/257] fix: pom.xml to reduce vulnerabilities (#1195) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGJRUBY-10557729 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/pom.xml b/pom.xml index 67d4ccec8..494dd60e0 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ 4.5.13 4.4.15 3.7.1 - 9.4.12.1 + 9.4.13.0 2.9.0 5.10.2 1.10.2 From c0bcea2175c979ff5e59605502a6f6122081f30f Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Mon, 10 Nov 2025 08:26:14 +0800 Subject: [PATCH 256/257] fix: pom.xml to reduce vulnerabilities (#1196) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-NETMINIDEV-8689573 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 494dd60e0..2d98f1ced 100644 --- a/pom.xml +++ b/pom.xml @@ -32,7 +32,7 @@ 4.4.15 3.7.1 9.4.13.0 - 2.9.0 + 2.10.0 5.10.2 1.10.2 2.7.3 From 5ab46f885c03ecea2b05cfd1841dae78ebdbb0e8 Mon Sep 17 00:00:00 2001 From: Yihua Huang Date: Sat, 20 Dec 2025 22:01:57 +0800 Subject: [PATCH 257/257] fix: pom.xml to reduce vulnerabilities (#1197) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGAPACHELOGGINGLOG4J-14532782 Co-authored-by: snyk-bot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2d98f1ced..ffb9a2e86 100644 --- a/pom.xml +++ b/pom.xml @@ -36,7 +36,7 @@ 5.10.2 1.10.2 2.7.3 - 2.23.1 + 2.25.3 2.0.2-beta 1.3.0 1.2.0