properties) {
- return object;
- }
-
-}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
deleted file mode 100644
index 3c7e6ff3c..000000000
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
+++ /dev/null
@@ -1,124 +0,0 @@
-package us.codecraft.webmagic.downloader;
-
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import us.codecraft.webmagic.*;
-import us.codecraft.webmagic.utils.Experimental;
-import us.codecraft.webmagic.pipeline.Pipeline;
-import us.codecraft.webmagic.processor.PageProcessor;
-import us.codecraft.webmagic.processor.SimplePageProcessor;
-import us.codecraft.webmagic.selector.Html;
-import us.codecraft.webmagic.selector.PlainText;
-import us.codecraft.webmagic.utils.FilePersistentBase;
-import us.codecraft.webmagic.utils.UrlUtils;
-
-import java.io.*;
-
-/**
- * Download file and saved to file for cache.
- *
- * @author code4crafter@gmail.com
- * @since 0.2.1
- */
-@Experimental
-public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {
-
- private Downloader downloaderWhenFileMiss;
-
- private final PageProcessor pageProcessor;
-
- private Logger logger = LoggerFactory.getLogger(getClass());
-
- public FileCache(String startUrl, String urlPattern) {
- this(startUrl, urlPattern, "/data/webmagic/temp/");
- }
-
- public FileCache(String startUrl, String urlPattern, String path) {
- this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
- setPath(path);
- downloaderWhenFileMiss = new HttpClientDownloader();
- }
-
- public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
- this.downloaderWhenFileMiss = downloaderWhenFileMiss;
- return this;
- }
-
- @Override
- public Page download(Request request, Task task) {
- String path = this.path + "/" + task.getUUID() + "/";
- Page page = null;
- try {
- final File file = getFile(path + DigestUtils.md5Hex(request.getUrl()));
- BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
- String line = bufferedReader.readLine();
- if (line.equals("url:\t" + request.getUrl())) {
- final String html = getHtml(bufferedReader);
- page = new Page();
- page.setRequest(request);
- page.setUrl(PlainText.create(request.getUrl()));
- page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl())));
- }
- } catch (IOException e) {
- if (e instanceof FileNotFoundException) {
- logger.info("File not exist for url " + request.getUrl());
- } else {
- logger.warn("File read error for url " + request.getUrl(), e);
- }
- }
- if (page == null) {
- page = downloadWhenMiss(request, task);
- }
- return page;
- }
-
- @Override
- public void setThread(int thread) {
-
- }
-
- private String getHtml(BufferedReader bufferedReader) throws IOException {
- String line;
- StringBuilder htmlBuilder = new StringBuilder();
- line = bufferedReader.readLine();
- line = StringUtils.removeStart(line, "html:\t");
- htmlBuilder.append(line);
- while ((line = bufferedReader.readLine()) != null) {
- htmlBuilder.append(line);
- }
- return htmlBuilder.toString();
- }
-
- private Page downloadWhenMiss(Request request, Task task) {
- Page page = null;
- if (downloaderWhenFileMiss != null) {
- page = downloaderWhenFileMiss.download(request, task);
- }
- return page;
- }
-
- @Override
- public void process(ResultItems resultItems, Task task) {
- String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
- try {
- PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
- printWriter.println("url:\t" + resultItems.getRequest().getUrl());
- printWriter.println("html:\t" + resultItems.get("html"));
- printWriter.close();
- } catch (IOException e) {
- logger.warn("write file error", e);
- }
- }
-
- @Override
- public void process(Page page) {
- pageProcessor.process(page);
- }
-
- @Override
- public Site getSite() {
- return pageProcessor.getSite();
- }
-}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
new file mode 100644
index 000000000..01f1af9a3
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
@@ -0,0 +1,178 @@
+package us.codecraft.webmagic.downloader;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.HttpConstant;
+
+import java.io.*;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Downloader for pages whose content is produced by JavaScript: every request is fetched by
+ * launching an external PhantomJS process that runs a crawl.js script and prints the rendered
+ * page to its standard output.
+ *
+ * @author dolphineor@gmail.com
+ * @version 0.5.3
+ */
+public class PhantomJSDownloader extends AbstractDownloader {
+    private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
+    private static final String DEFAULT_PHANTOMJS_COMMAND = "phantomjs";
+
+    // Configuration is kept per instance so that downloaders built with different commands or
+    // script paths do not overwrite each other's settings.
+    private final String phantomJsCommand;
+    private final String crawlJsPath;
+
+    /**
+     * Uses the default {@code phantomjs} command and the crawl.js found at the classpath root.
+     */
+    public PhantomJSDownloader() {
+        this(DEFAULT_PHANTOMJS_COMMAND);
+    }
+
+    /**
+     * Uses a custom PhantomJS command and the crawl.js found at the classpath root.
+     *
+     * example:
+     *    phantomjs.exe                          for Windows
+     *    phantomjs --ignore-ssl-errors=yes      ignore some errors raised when the target is https
+     *    /usr/local/bin/phantomjs               absolute path, avoids IOExceptions caused by the
+     *                                           system environment variables
+     *
+     * @param phantomJsCommand command (optionally followed by options) used to start PhantomJS
+     */
+    public PhantomJSDownloader(String phantomJsCommand) {
+        this.phantomJsCommand = phantomJsCommand;
+        this.crawlJsPath = resolveDefaultCrawlJsPath();
+    }
+
+    /**
+     * Uses a custom PhantomJS command and a custom crawl.js path. This is needed when another
+     * project depends on this jar, because the phantomjs process started through
+     * Runtime.exec() cannot use the crawl.js packaged inside the jar.
+     *
+     * crawl.js start --
+     *
+     *   var system = require('system');
+     *   var url = system.args[1];
+     *
+     *   var page = require('webpage').create();
+     *   page.settings.loadImages = false;
+     *   page.settings.resourceTimeout = 5000;
+     *
+     *   page.open(url, function (status) {
+     *       if (status != 'success') {
+     *           console.log("HTTP request failed!");
+     *       } else {
+     *           console.log(page.content);
+     *       }
+     *
+     *       page.close();
+     *       phantom.exit();
+     *   });
+     *
+     * -- crawl.js end
+     *
+     * The script above can be copied into your own project.
+     *
+     * example:
+     *    new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
+     *
+     * @param phantomJsCommand command (optionally followed by options) used to start PhantomJS
+     * @param crawlJsPath path of the crawl.js script handed to PhantomJS
+     */
+    public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
+        this.phantomJsCommand = phantomJsCommand;
+        this.crawlJsPath = crawlJsPath;
+    }
+
+    /**
+     * Builds the default crawl.js location: the file named crawl.js directly under the
+     * directory that the class loader reports as the classpath root.
+     *
+     * @return path of the default crawl.js
+     * @throws IllegalStateException if there is no classpath root directory to resolve against
+     */
+    private String resolveDefaultCrawlJsPath() {
+        URL classpathRoot = getClass().getResource("/");
+        if (classpathRoot == null) {
+            throw new IllegalStateException("Cannot locate the default crawl.js: no classpath root "
+                    + "directory; pass the script path to PhantomJSDownloader(String, String)");
+        }
+        return new File(classpathRoot.getPath()).getPath() + File.separator + "crawl.js";
+    }
+
+    /**
+     * Downloads the page for the request by rendering it with PhantomJS.
+     *
+     * @param request request whose URL is rendered
+     * @param task task passed on to the success/error hooks
+     * @return a successful page holding the rendered text, or the failed page created by
+     *         Page.fail(request) when PhantomJS reported a failure or an exception occurred
+     */
+    @Override
+    public Page download(Request request, Task task) {
+        logger.info("downloading page: {}", request.getUrl());
+
+        Page page = Page.fail(request);
+        try {
+            String content = getPage(request);
+            // crawl.js prints this marker instead of the page when the request did not succeed
+            if (!content.contains("HTTP request failed")) {
+                page.setDownloadSuccess(true);
+                page.setRawText(content);
+                page.setUrl(new PlainText(request.getUrl()));
+                page.setRequest(request);
+                page.setStatusCode(HttpConstant.StatusCode.CODE_200);
+            }
+            // NOTE(review): onSuccess is also invoked when the failure marker was found - confirm
+            onSuccess(page, task);
+        } catch (Exception e) {
+            onError(page, task, e);
+            logger.warn("download page {} error", request.getUrl(), e);
+        }
+        return page;
+    }
+
+    @Override
+    public void setThread(int threadNum) {
+        // ignore
+    }
+
+    /**
+     * Runs PhantomJS on the request URL and returns everything it wrote to standard output.
+     *
+     * @param request request whose URL is handed to crawl.js
+     * @return the process output, each line terminated by "\n"
+     * @throws Exception if the process cannot be started or its output cannot be read
+     */
+    protected String getPage(Request request) throws Exception {
+        // The command may carry options, so it is split on whitespace; the script path and the
+        // URL are each passed as a single argument so whitespace inside them cannot split them.
+        List<String> command = new ArrayList<String>(Arrays.asList(phantomJsCommand.trim().split("\\s+")));
+        command.add(crawlJsPath.trim());
+        command.add(request.getUrl());
+        Process process = Runtime.getRuntime().exec(command.toArray(new String[command.size()]));
+        // NOTE(review): stderr is not drained and the platform default charset is used - confirm
+        BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()));
+        try {
+            StringBuilder builder = new StringBuilder();
+            String line;
+            while ((line = br.readLine()) != null) {
+                builder.append(line).append("\n");
+            }
+            return builder.toString();
+        } finally {
+            // always release the pipe to the child process
+            br.close();
+        }
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java
deleted file mode 100644
index 28d3ab052..000000000
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/ConfigurableBlogPageProcessor.java
+++ /dev/null
@@ -1,51 +0,0 @@
-package us.codecraft.webmagic.example;
-
-import java.util.List;
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.configurable.Inject;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class ConfigurableBlogPageProcessor implements PageProcessor {
-
- private Site site = Site.me().setDomain("my.oschina.net");
-
- @Inject("linkRegex")
- private String linkRegex;
-
- @Inject("titleXpath")
- private String titleXpath;
-
- @Inject("contentXpath")
- private String contentXpath;
-
- @Inject("tagsXpath")
- private String tagsXpath;
-
- @Override
- public void process(Page page) {
- List links = page.getHtml().links().regex(linkRegex).all();
- page.addTargetRequests(links);
- page.putField("title", page.getHtml().xpath(titleXpath).toString());
- if (page.getResultItems().get("title") == null) {
- //skip this page
- page.setSkip(true);
- }
- page.putField("content", page.getHtml().smartContent().toString());
- page.putField("tags", page.getHtml().xpath(tagsXpath).all());
- }
-
- @Override
- public Site getSite() {
- return site;
-
- }
-
- public static void main(String[] args) {
- Spider.create(new ConfigurableBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run();
- }
-}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java
index 427cdf70f..844c775f4 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java
@@ -19,7 +19,7 @@
@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
public class GithubRepo implements HasKey {
- @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
+ @ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true)
private String name;
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
@@ -78,4 +78,17 @@ public int getStar() {
public int getFork() {
return fork;
}
+
+ @Override
+ public String toString() {
+ return "GithubRepo{" +
+ "name='" + name + '\'' +
+ ", author='" + author + '\'' +
+ ", readme='" + readme + '\'' +
+ ", language=" + language +
+ ", star=" + star +
+ ", fork=" + fork +
+ ", url='" + url + '\'' +
+ '}';
+ }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java
index 34608fd93..4181bb9e2 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java
@@ -15,19 +15,19 @@
*/
public class GithubRepoApi implements HasKey {
- @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name")
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name", source = ExtractBy.Source.RawText)
private String name;
- @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..owner.login")
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..owner.login", source = ExtractBy.Source.RawText)
private String author;
- @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.language",multi = true)
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.language",multi = true, source = ExtractBy.Source.RawText)
private List language;
- @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.stargazers_count")
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.stargazers_count", source = ExtractBy.Source.RawText)
private int star;
- @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.homepage")
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.forks_count", source = ExtractBy.Source.RawText)
private int fork;
@ExtractByUrl
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java
new file mode 100644
index 000000000..d8bf9fbe9
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java
@@ -0,0 +1,40 @@
+package us.codecraft.webmagic.example;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.model.PageMapper;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.3.2
+ */
+public class GithubRepoPageMapper implements PageProcessor {
+
+ private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
+
+ private PageMapper githubRepoPageMapper = new PageMapper(GithubRepo.class);
+
+ @Override
+ public void process(Page page) {
+ page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
+ page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
+ GithubRepo githubRepo = githubRepoPageMapper.get(page);
+ if (githubRepo == null) {
+ page.setSkip(true);
+ } else {
+ page.putField("repo", githubRepo);
+ }
+
+ }
+
+ @Override
+ public Site getSite() {
+ return site;
+ }
+
+ public static void main(String[] args) {
+ Spider.create(new GithubRepoPageMapper()).addUrl("https://github.com/code4craft").thread(5).run();
+ }
+}
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java
new file mode 100644
index 000000000..b759c034f
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java
@@ -0,0 +1,26 @@
+package us.codecraft.webmagic.example;
+
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.monitor.SpiderMonitor;
+import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor;
+import us.codecraft.webmagic.processor.example.ZhihuPageProcessor;
+
+/**
+ * @author code4crafer@gmail.com
+ * @since 0.5.0
+ */
+public class MonitorExample {
+
+ public static void main(String[] args) throws Exception {
+
+ Spider zhihuSpider = Spider.create(new ZhihuPageProcessor())
+ .addUrl("http://my.oschina.net/flashsword/blog");
+ Spider githubSpider = Spider.create(new GithubRepoPageProcessor())
+ .addUrl("https://github.com/code4craft");
+
+ SpiderMonitor.instance().register(zhihuSpider);
+ SpiderMonitor.instance().register(githubSpider);
+ zhihuSpider.start();
+ githubSpider.start();
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java
index e8ac20c48..b527ea753 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java
@@ -26,11 +26,11 @@ public class OschinaBlog {
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List tags;
- @Formatter("yyyy-MM-dd HH:mm")
@ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')")
private Date date;
public static void main(String[] args) {
+ //results will be saved to "/data/webmagic/" in json format
OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
.addUrl("http://my.oschina.net/flashsword/blog").run();
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java
new file mode 100644
index 000000000..9406abfd2
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java
@@ -0,0 +1,68 @@
+package us.codecraft.webmagic.example;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import us.codecraft.webmagic.*;
+import us.codecraft.webmagic.handler.CompositePageProcessor;
+import us.codecraft.webmagic.handler.CompositePipeline;
+import us.codecraft.webmagic.handler.PatternProcessor;
+import us.codecraft.webmagic.handler.RequestMatcher;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: Sebastian MA
+ * Date: April 04, 2014
+ * Time: 21:23
+ */
+public class PatternProcessorExample {
+
+ private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class);
+
+ public static void main(String... args) {
+
+ // define a patternProcessor which handles only GitHub repository pages ("https://github.com/<user>/<repo>")
+ PatternProcessor githubRepoProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+/[\\w\\-]+") {
+
+ @Override
+ public RequestMatcher.MatchOther processPage(Page page) {
+ page.putField("reponame", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
+ return RequestMatcher.MatchOther.YES;
+ }
+
+ @Override
+ public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
+ log.info("Extracting from repo" + resultItems.getRequest());
+ System.out.println("Repo name: "+resultItems.get("reponame"));
+ return RequestMatcher.MatchOther.YES;
+ }
+ };
+
+ PatternProcessor githubUserProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+") {
+
+ @Override
+ public RequestMatcher.MatchOther processPage(Page page) {
+ log.info("Extracting from " + page.getUrl());
+ page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all());
+ page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all());
+ page.putField("username", page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString());
+ return RequestMatcher.MatchOther.YES;
+ }
+
+ @Override
+ public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
+ System.out.println("User name: "+resultItems.get("username"));
+ return RequestMatcher.MatchOther.YES;
+ }
+ };
+
+ CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com").setRetryTimes(3));
+ CompositePipeline pipeline = new CompositePipeline();
+
+ pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor);
+ pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor);
+
+ Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync();
+ }
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
new file mode 100644
index 000000000..b7a39ed93
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
@@ -0,0 +1,77 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * A PageProcessor that hands each page to the registered SubPageProcessors whose
+ * match(Request) accepts the page's request, in registration order.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class CompositePageProcessor implements PageProcessor {
+
+    private Site site;
+
+    private List<SubPageProcessor> subPageProcessors = new ArrayList<SubPageProcessor>();
+
+    public CompositePageProcessor(Site site) {
+        this.site = site;
+    }
+
+    /**
+     * Runs every matching sub processor on the page and stops after the first one that does
+     * not return MatchOther.YES.
+     *
+     * @param page page to process
+     */
+    @Override
+    public void process(Page page) {
+        for (SubPageProcessor subPageProcessor : subPageProcessors) {
+            if (!subPageProcessor.match(page.getRequest())) {
+                continue;
+            }
+            // anything other than YES (including null) ends the chain
+            if (subPageProcessor.processPage(page) != SubPageProcessor.MatchOther.YES) {
+                return;
+            }
+        }
+    }
+
+    public CompositePageProcessor setSite(Site site) {
+        this.site = site;
+        return this;
+    }
+
+    /**
+     * Appends a sub processor to the end of the chain.
+     *
+     * @param subPageProcessor sub processor to add
+     * @return this
+     */
+    public CompositePageProcessor addSubPageProcessor(SubPageProcessor subPageProcessor) {
+        this.subPageProcessors.add(subPageProcessor);
+        return this;
+    }
+
+    /**
+     * Replaces the whole chain with the given sub processors, kept in the given order.
+     *
+     * @param subPageProcessors sub processors to use
+     * @return this
+     */
+    public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) {
+        this.subPageProcessors = new ArrayList<SubPageProcessor>(Arrays.asList(subPageProcessors));
+        return this;
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java
new file mode 100644
index 000000000..3f09eee21
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java
@@ -0,0 +1,63 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.pipeline.Pipeline;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * A Pipeline that hands each result to the registered SubPipelines whose match(Request)
+ * accepts the result's request, in registration order.
+ *
+ * @author code4crafer@gmail.com
+ */
+public class CompositePipeline implements Pipeline {
+
+    private List<SubPipeline> subPipelines = new ArrayList<SubPipeline>();
+
+    /**
+     * Runs every matching sub pipeline on the result and stops after the first one that does
+     * not return MatchOther.YES.
+     *
+     * @param resultItems result to process
+     * @param task task forwarded to the sub pipelines
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        for (SubPipeline subPipeline : subPipelines) {
+            if (!subPipeline.match(resultItems.getRequest())) {
+                continue;
+            }
+            // anything other than YES (including null) ends the chain
+            if (subPipeline.processResult(resultItems, task) != RequestMatcher.MatchOther.YES) {
+                return;
+            }
+        }
+    }
+
+    /**
+     * Appends a sub pipeline to the end of the chain.
+     *
+     * @param subPipeline sub pipeline to add
+     * @return this
+     */
+    public CompositePipeline addSubPipeline(SubPipeline subPipeline) {
+        this.subPipelines.add(subPipeline);
+        return this;
+    }
+
+    /**
+     * Replaces the whole chain with the given sub pipelines, kept in the given order.
+     *
+     * @param subPipelines sub pipelines to use
+     * @return this
+     */
+    public CompositePipeline setSubPipeline(SubPipeline... subPipelines) {
+        this.subPipelines = new ArrayList<SubPipeline>(Arrays.asList(subPipelines));
+        return this;
+    }
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java
new file mode 100644
index 000000000..f9ef286b2
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java
@@ -0,0 +1,13 @@
+package us.codecraft.webmagic.handler;
+
+/**
+ * @author code4crafer@gmail.com
+ */
+public abstract class PatternProcessor extends PatternRequestMatcher implements SubPipeline, SubPageProcessor {
+ /**
+ * @param pattern url pattern to handle
+ */
+ public PatternProcessor(String pattern) {
+ super(pattern);
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java
new file mode 100644
index 000000000..1be61a8f3
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java
@@ -0,0 +1,37 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Request;
+
+import java.util.regex.Pattern;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: Sebastian MA
+ * Date: April 03, 2014
+ * Time: 10:00
+ *
+ * A PatternRequestMatcher accepts a request when its whole URL matches the configured pattern;
+ * page extraction and data processing are left to subclasses such as PatternProcessor.
+ */
+public abstract class PatternRequestMatcher implements RequestMatcher {
+
+ /**
+ * match pattern. only matched page should be handled.
+ */
+ protected String pattern;
+
+ private Pattern patternCompiled;
+
+ /**
+ * @param pattern url pattern to handle
+ */
+ public PatternRequestMatcher(String pattern) {
+ this.pattern = pattern;
+ this.patternCompiled = Pattern.compile(pattern);
+ }
+
+ @Override
+ public boolean match(Request request) {
+ return patternCompiled.matcher(request.getUrl()).matches();
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java
new file mode 100644
index 000000000..7d351a4db
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java
@@ -0,0 +1,24 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Request;
+
+/**
+ * @author code4crafer@gmail.com
+ * @since 0.5.0
+ */
+public interface RequestMatcher {
+
+ /**
+ * Check whether to process the page.
+ * Please DO NOT change page status in this method.
+ *
+ * @param page the request to check
+ *
+ * @return whether matches
+ */
+ public boolean match(Request page);
+
+ public enum MatchOther {
+ YES, NO
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
new file mode 100644
index 000000000..f7baad7cc
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
@@ -0,0 +1,19 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.Page;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public interface SubPageProcessor extends RequestMatcher {
+
+ /**
+ * process the page, extract urls to fetch, extract the data and store
+ *
+ * @param page page
+ *
+ * @return whether continue to match
+ */
+ public MatchOther processPage(Page page);
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java
new file mode 100644
index 000000000..027c5b55a
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java
@@ -0,0 +1,21 @@
+package us.codecraft.webmagic.handler;
+
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+
+/**
+ * @author code4crafer@gmail.com
+ * @since 0.5.0
+ */
+public interface SubPipeline extends RequestMatcher {
+
+ /**
+ * process the result items produced for a matched request
+ *
+ * @param resultItems resultItems
+ * @param task task
+ * @return whether continue to match
+ */
+ public MatchOther processResult(ResultItems resultItems, Task task);
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java
index 32f561e29..673447586 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java
@@ -1,5 +1,9 @@
package us.codecraft.webmagic.model;
+import lombok.Getter;
+import lombok.Setter;
+
+import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;
/**
@@ -7,18 +11,18 @@
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-class Extractor {
+public class Extractor {
+ @Getter @Setter
protected Selector selector;
+ @Getter
protected final Source source;
protected final boolean notNull;
protected final boolean multi;
-
- static enum Source {Html, Url, RawHtml}
-
+
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector;
this.source = source;
@@ -26,23 +30,11 @@ public Extractor(Selector selector, Source source, boolean notNull, boolean mult
this.multi = multi;
}
- Selector getSelector() {
- return selector;
- }
-
- Source getSource() {
- return source;
- }
-
- boolean isNotNull() {
+ public boolean isNotNull() {
return notNull;
}
- boolean isMulti() {
+ public boolean isMulti() {
return multi;
}
-
- void setSelector(Selector selector) {
- this.selector = selector;
- }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
index a2cba1332..d4cb5937f 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
@@ -1,58 +1,33 @@
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
+import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
+import lombok.Getter;
+import lombok.Setter;
+
/**
* Wrapper of field and extractor.
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-class FieldExtractor extends Extractor {
+public class FieldExtractor extends Extractor {
+ @Getter
private final Field field;
+ @Getter @Setter
private Method setterMethod;
+ @Getter @Setter
private ObjectFormatter objectFormatter;
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
super(selector, source, notNull, multi);
this.field = field;
}
-
- Field getField() {
- return field;
- }
-
- Selector getSelector() {
- return selector;
- }
-
- Source getSource() {
- return source;
- }
-
- void setSetterMethod(Method setterMethod) {
- this.setterMethod = setterMethod;
- }
-
- Method getSetterMethod() {
- return setterMethod;
- }
-
- boolean isNotNull() {
- return notNull;
- }
-
- ObjectFormatter getObjectFormatter() {
- return objectFormatter;
- }
-
- void setObjectFormatter(ObjectFormatter objectFormatter) {
- this.objectFormatter = objectFormatter;
- }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
index 8a40dae95..1c1ed6e82 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
@@ -7,9 +7,7 @@
import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList;
-import java.util.HashSet;
import java.util.List;
-import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -25,7 +23,7 @@ class ModelPageProcessor implements PageProcessor {
private Site site;
- private Set targetUrlPatterns = new HashSet();
+ private boolean extractLinks = true;
public static ModelPageProcessor create(Site site, Class... clazzs) {
ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
@@ -38,8 +36,6 @@ public static ModelPageProcessor create(Site site, Class... clazzs) {
public ModelPageProcessor addPageModel(Class clazz) {
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
- targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
- targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
pageModelExtractorList.add(pageModelExtractor);
return this;
}
@@ -51,15 +47,20 @@ private ModelPageProcessor(Site site) {
@Override
public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
- extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
- extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
+ if (extractLinks) {
+ extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
+ extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
+ }
Object process = pageModelExtractor.process(page);
if (process == null || (process instanceof List && ((List) process).size() == 0)) {
- page.getResultItems().setSkip(true);
+ continue;
}
postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
}
+ if (page.getResultItems().getAll().size() == 0) {
+ page.getResultItems().setSkip(true);
+ }
}
private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) {
@@ -67,13 +68,13 @@ private void extractLinks(Page page, Selector urlRegionSelector, List u
if (urlRegionSelector == null) {
links = page.getHtml().links().all();
} else {
- links = urlRegionSelector.selectList(page.getHtml().toString());
+ links = page.getHtml().selectList(urlRegionSelector).links().all();
}
for (String link : links) {
for (Pattern targetUrlPattern : urlPatterns) {
Matcher matcher = targetUrlPattern.matcher(link);
if (matcher.find()) {
- page.addTargetRequest(new Request(matcher.group(1)));
+ page.addTargetRequest(new Request(matcher.group(0)));
}
}
}
@@ -86,4 +87,12 @@ protected void postProcessPageModel(Class clazz, Object object) {
public Site getSite() {
return site;
}
+
+ public boolean isExtractLinks() {
+ return extractLinks;
+ }
+
+ public void setExtractLinks(boolean extractLinks) {
+ this.extractLinks = extractLinks;
+ }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java
index 313330820..eaabcca2c 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java
@@ -25,7 +25,7 @@
* private String content;
*
* {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
- * private List tags;
+ * private List<String> tags;
* }
*
* And start the spider by:
@@ -60,9 +60,9 @@ public OOSpider(PageProcessor pageProcessor) {
/**
* create a spider
*
- * @param site
- * @param pageModelPipeline
- * @param pageModels
+ * @param site site
+ * @param pageModelPipeline pageModelPipeline
+ * @param pageModels pageModels
*/
public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
this(ModelPageProcessor.create(site, pageModels));
@@ -97,4 +97,9 @@ public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageM
return this;
}
+ public OOSpider setIsExtractLinks(boolean isExtractLinks){
+ modelPageProcessor.setExtractLinks(isExtractLinks);
+ return this;
+ }
+
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java
new file mode 100644
index 000000000..1cc5ac3f4
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java
@@ -0,0 +1,29 @@
+package us.codecraft.webmagic.model;
+
+import us.codecraft.webmagic.Page;
+
+import java.util.List;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.5.2
+ */
+public class PageMapper<T> {
+
+ private Class<T> clazz;
+
+ private PageModelExtractor pageModelExtractor;
+
+ public PageMapper(Class<T> clazz) {
+ this.clazz = clazz;
+ this.pageModelExtractor = PageModelExtractor.create(clazz);
+ }
+
+ public T get(Page page) {
+ return (T) pageModelExtractor.process(page);
+ }
+
+ public List<T> getAll(Page page) {
+ return (List<T>) pageModelExtractor.process(page);
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index 5e4da1142..751aafe76 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -3,22 +3,28 @@
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
+import lombok.Getter;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
-import us.codecraft.webmagic.model.formatter.BasicTypeFormatter;
-import us.codecraft.webmagic.model.formatter.ObjectFormatter;
-import us.codecraft.webmagic.model.formatter.ObjectFormatters;
+import us.codecraft.webmagic.model.fields.PageField;
+import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
+import us.codecraft.webmagic.model.sources.Source;
+import us.codecraft.webmagic.model.sources.SourceTextExtractor;
+import us.codecraft.webmagic.model.sources.Source.*;
import us.codecraft.webmagic.selector.*;
+import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
-import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
+import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;
+
/**
* The main internal logic of page model extractor.
*
@@ -27,14 +33,19 @@
*/
class PageModelExtractor {
+ @Getter
private List targetUrlPatterns = new ArrayList();
+ @Getter
private Selector targetUrlRegionSelector;
+ @Getter
private List helpUrlPatterns = new ArrayList();
+ @Getter
private Selector helpUrlRegionSelector;
+ @Getter
private Class clazz;
private List fieldExtractors;
@@ -53,7 +64,7 @@ private void init(Class clazz) {
this.clazz = clazz;
initClassExtractors();
fieldExtractors = new ArrayList();
- for (Field field : clazz.getDeclaredFields()) {
+ for (Field field : ClassUtils.getFieldsIncludeSuperClass(clazz)) {
field.setAccessible(true);
FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field);
@@ -69,61 +80,12 @@ private void init(Class clazz) {
fieldExtractor = fieldExtractorTmp;
}
if (fieldExtractor != null) {
- checkFormat(field, fieldExtractor);
+ fieldExtractor.setObjectFormatter(new ObjectFormatterBuilder().setField(field).build());
fieldExtractors.add(fieldExtractor);
}
}
}
- private void checkFormat(Field field, FieldExtractor fieldExtractor) {
- if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) {
- Class<?> fieldClazz = BasicTypeFormatter.detectBasicClass(field.getType());
- ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz);
- if (objectFormatter == null) {
- throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz);
- } else {
- fieldExtractor.setObjectFormatter(objectFormatter);
- }
- } else if (fieldExtractor.isMulti()) {
- if (!List.class.isAssignableFrom(field.getType())) {
- throw new IllegalStateException("Field " + field.getName() + " must be list");
- }
- Formatter formatter = field.getAnnotation(Formatter.class);
- if (formatter != null) {
- if (!formatter.subClazz().equals(Void.class)) {
- ObjectFormatter objectFormatter = getObjectFormatter(field, formatter.subClazz());
- if (objectFormatter == null) {
- throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz());
- } else {
- fieldExtractor.setObjectFormatter(objectFormatter);
- }
- }
- }
- }
- }
-
- private ObjectFormatter getObjectFormatter(Field field, Class<?> fieldClazz) {
- Formatter formatter = field.getAnnotation(Formatter.class);
- if (formatter != null) {
- if (!formatter.formatter().equals(ObjectFormatter.class)) {
- ObjectFormatter objectFormatter = initFormatter(formatter.formatter());
- objectFormatter.initParam(formatter.value());
- }
- }
- return initFormatter(ObjectFormatters.get(fieldClazz));
- }
-
- private ObjectFormatter initFormatter(Class<? extends ObjectFormatter> formatterClazz) {
- try {
- return formatterClazz.newInstance();
- } catch (InstantiationException e) {
- logger.error("init ObjectFormatter fail", e);
- } catch (IllegalAccessException e) {
- logger.error("init ObjectFormatter fail", e);
- }
- return null;
- }
-
private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
FieldExtractor fieldExtractor = null;
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
@@ -133,7 +95,7 @@ private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
regexPattern = ".*";
}
fieldExtractor = new FieldExtractor(field,
- new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
+ new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
@@ -159,7 +121,7 @@ private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
default:
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
}
- fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
+ fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
@@ -174,12 +136,26 @@ private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
Selector selector = ExtractorUtils.getSelector(extractBy);
- fieldExtractor = new FieldExtractor(field, selector, extractBy.source() == ExtractBy.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
- extractBy.notNull(), extractBy.multi() || List.class.isAssignableFrom(field.getType()));
- Method setterMethod = getSetterMethod(clazz, field);
- if (setterMethod != null) {
- fieldExtractor.setSetterMethod(setterMethod);
+ ExtractBy.Source extractSource = extractBy.source();
+ if (extractBy.type()== ExtractBy.Type.JsonPath)
+ extractSource = RawText;
+ Source source = null;
+ switch (extractSource) {
+ case RawText:
+ source = new RawText();
+ break;
+ case RawHtml:
+ source = new RawHtml();
+ break;
+ case SelectedHtml:
+ source = new SelectedHtml();
+ break;
+ default:
+ source = new SelectedHtml();
}
+ fieldExtractor = new FieldExtractor(field, selector, source,
+ extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
+ fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
}
return fieldExtractor;
}
@@ -198,12 +174,12 @@ public static Method getSetterMethod(Class clazz, Field field) {
private void initClassExtractors() {
Annotation annotation = clazz.getAnnotation(TargetUrl.class);
if (annotation == null) {
- targetUrlPatterns.add(Pattern.compile("(.*)"));
+ targetUrlPatterns.add(Pattern.compile(".*"));
} else {
TargetUrl targetUrl = (TargetUrl) annotation;
String[] value = targetUrl.value();
for (String s : value) {
- targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
+ targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
}
if (!targetUrl.sourceRegion().equals("")) {
targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
@@ -214,7 +190,7 @@ private void initClassExtractors() {
HelpUrl helpUrl = (HelpUrl) annotation;
String[] value = helpUrl.value();
for (String s : value) {
- helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
+ helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
}
if (!helpUrl.sourceRegion().equals("")) {
helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
@@ -223,7 +199,7 @@ private void initClassExtractors() {
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
- objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
+ objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
}
}
@@ -263,131 +239,15 @@ private Object processSingle(Page page, String html, boolean isRaw) {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
- if (fieldExtractor.isMulti()) {
- List value;
- switch (fieldExtractor.getSource()) {
- case RawHtml:
- value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
- break;
- case Html:
- if (isRaw) {
- value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
- } else {
- value = fieldExtractor.getSelector().selectList(html);
- }
- break;
- case Url:
- value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
- break;
- default:
- value = fieldExtractor.getSelector().selectList(html);
- }
- if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
- return null;
- }
- if (fieldExtractor.getObjectFormatter() != null) {
- List converted = convert(value, fieldExtractor.getObjectFormatter());
- setField(o, fieldExtractor, converted);
- } else {
- setField(o, fieldExtractor, value);
- }
- } else {
- String value;
- switch (fieldExtractor.getSource()) {
- case RawHtml:
- value = page.getHtml().selectDocument(fieldExtractor.getSelector());
- break;
- case Html:
- if (isRaw) {
- value = page.getHtml().selectDocument(fieldExtractor.getSelector());
- } else {
- value = fieldExtractor.getSelector().select(html);
- }
- break;
- case Url:
- value = fieldExtractor.getSelector().select(page.getUrl().toString());
- break;
- default:
- value = fieldExtractor.getSelector().select(html);
- }
- if (value == null && fieldExtractor.isNotNull()) {
- return null;
- }
- if (fieldExtractor.getObjectFormatter() != null) {
- Object converted = convert(value, fieldExtractor.getObjectFormatter());
- if (converted == null && fieldExtractor.isNotNull()) {
- return null;
- }
- setField(o, fieldExtractor, converted);
- } else {
- setField(o, fieldExtractor, value);
- }
- }
+ PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
+ if (!field.operation(o, fieldExtractor, logger))
+ return null;
}
- if (AfterExtractor.class.isAssignableFrom(clazz)) {
+ if (AfterExtractor.class.isAssignableFrom(clazz))
((AfterExtractor) o).afterProcess(page);
- }
- } catch (InstantiationException e) {
- logger.error("extract fail", e);
- } catch (IllegalAccessException e) {
- logger.error("extract fail", e);
- } catch (InvocationTargetException e) {
+ } catch (Exception e) {
logger.error("extract fail", e);
}
return o;
}
-
- private Object convert(String value, ObjectFormatter objectFormatter) {
- try {
- Object format = objectFormatter.format(value);
- if (logger.isDebugEnabled()) {
- logger.debug("String " + value + " is converted to " + format);
- }
- return format;
- } catch (Exception e) {
- logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
- }
- return null;
- }
-
- private List convert(List values, ObjectFormatter objectFormatter) {
- List objects = new ArrayList();
- for (String value : values) {
- Object converted = convert(value, objectFormatter);
- if (converted != null) {
- objects.add(converted);
- }
- }
- return objects;
- }
-
- private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
- if (value == null) {
- return;
- }
- if (fieldExtractor.getSetterMethod() != null) {
- fieldExtractor.getSetterMethod().invoke(o, value);
- }
- fieldExtractor.getField().set(o, value);
- }
-
- Class getClazz() {
- return clazz;
- }
-
- List getTargetUrlPatterns() {
- return targetUrlPatterns;
- }
-
- List getHelpUrlPatterns() {
- return helpUrlPatterns;
- }
-
- Selector getTargetUrlRegionSelector() {
- return targetUrlRegionSelector;
- }
-
- Selector getHelpUrlRegionSelector() {
- return helpUrlRegionSelector;
- }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
index 2e23aa009..8e02895a1 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
@@ -52,7 +52,8 @@ public static enum Source {
/**
* extract from the raw html
*/
- RawHtml
+ RawHtml,
+ RawText
}
/**
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java
index e603c59ff..eb7ecb488 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java
@@ -16,17 +16,19 @@
@Target({ElementType.FIELD})
public @interface Formatter {
+ Class DEFAULT_FORMATTER = ObjectFormatter.class;
+
/**
* Set formatter params.
*
* @return formatter params
*/
- String[] value();
+ String[] value() default "";
/**
- * Specific the class of field of class of elements in collection for field.
+ * Specifies the class of the field, or the class of the elements in the collection for the field.
* It is not necessary to be set because we can detect the class by class of field,
- * unless you use a collection as a field.
+ * unless you use a collection as a field.
*
* @return the class of field
*/
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java
new file mode 100644
index 000000000..4a4bf38a8
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java
@@ -0,0 +1,42 @@
+package us.codecraft.webmagic.model.fields;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+
+import lombok.Getter;
+import us.codecraft.webmagic.model.FieldExtractor;
+import us.codecraft.webmagic.model.formatter.ObjectFormatter;
+
+public class MultipleField extends PageField {
+ @Getter
+ private List<String> fieldNames;
+
+ public MultipleField(List<String> fieldNames) {
+ this.fieldNames = fieldNames;
+ }
+
+ public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
+ if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull())
+ return false;
+ if (fieldExtractor.getObjectFormatter() != null) {
+ List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
+ setField(o, fieldExtractor, converted);
+ }
+ else
+ setField(o, fieldExtractor, this.fieldNames);
+ return true;
+ }
+
+ private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
+ List<Object> objects = new ArrayList<>();
+ for (String value : values) {
+ Object converted = this.convert(value, objectFormatter, logger);
+ if (converted != null)
+ objects.add(converted);
+ }
+ return objects;
+ }
+}
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java
new file mode 100644
index 000000000..ad4428335
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java
@@ -0,0 +1,31 @@
+package us.codecraft.webmagic.model.fields;
+
+import java.lang.reflect.InvocationTargetException;
+
+import org.slf4j.Logger;
+
+import us.codecraft.webmagic.model.FieldExtractor;
+import us.codecraft.webmagic.model.formatter.ObjectFormatter;
+
+public abstract class PageField {
+ public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException;
+
+ protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
+ try {
+ Object format = objectFormatter.format(value);
+ logger.debug("String {} is converted to {}", value, format);
+ return format;
+ } catch (Exception e) {
+ logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
+ }
+ return null;
+ }
+
+ protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
+ if (value != null) {
+ if (fieldExtractor.getSetterMethod() != null)
+ fieldExtractor.getSetterMethod().invoke(o, value);
+ fieldExtractor.getField().set(o, value);
+ }
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java
new file mode 100644
index 000000000..136a1c56e
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java
@@ -0,0 +1,28 @@
+package us.codecraft.webmagic.model.fields;
+
+import java.lang.reflect.InvocationTargetException;
+
+import org.slf4j.Logger;
+
+import lombok.Getter;
+import us.codecraft.webmagic.model.FieldExtractor;
+
+public class SingleField extends PageField {
+ @Getter
+ private String fieldName;
+
+ public SingleField(String fieldName) {
+ this.fieldName = fieldName;
+ }
+
+ public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
+ if (fieldExtractor.getObjectFormatter() != null) {
+ Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
+ if (converted == null && fieldExtractor.isNotNull())
+ return false;
+ setField(o, fieldExtractor, converted);
+ } else
+ setField(o, fieldExtractor, this.fieldName);
+ return true;
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java
new file mode 100644
index 000000000..f03b8864a
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java
@@ -0,0 +1,85 @@
+package us.codecraft.webmagic.model.formatter;
+
+public interface BasicClassDetector {
+ Class<?> detectBasicClass(Class<?> type);
+}
+
+class IntegerClassDetector implements BasicClassDetector {
+ @Override
+ public Class<?> detectBasicClass(Class<?> type) {
+ if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
+ return Integer.class;
+ }
+ return null;
+ }
+}
+
+class LongClassDetector implements BasicClassDetector {
+ @Override
+ public Class<?> detectBasicClass(Class<?> type) {
+ if (type.equals(Long.TYPE) || type.equals(Long.class)) {
+ return Long.class;
+ }
+ return null;
+ }
+}
+
+class DoubleClassDetector implements BasicClassDetector {
+ @Override
+ public Class<?> detectBasicClass(Class<?> type) {
+ if (type.equals(Double.TYPE) || type.equals(Double.class)) {
+ return Double.class;
+ }
+ return null;
+ }
+}
+
+class FloatClassDetector implements BasicClassDetector {
+ @Override
+ public Class<?> detectBasicClass(Class<?> type) {
+ if (type.equals(Float.TYPE) || type.equals(Float.class)) {
+ return Float.class;
+ }
+ return null;
+ }
+}
+
+class ShortClassDetector implements BasicClassDetector {
+ @Override
+ public Class<?> detectBasicClass(Class<?> type) {
+ if (type.equals(Short.TYPE) || type.equals(Short.class)) {
+ return Short.class;
+ }
+ return null;
+ }
+}
+
+class CharacterClassDetector implements BasicClassDetector {
+ @Override
+ public Class<?> detectBasicClass(Class<?> type) {
+ if (type.equals(Character.TYPE) || type.equals(Character.class)) {
+ return Character.class;
+ }
+ return null;
+ }
+}
+
+class ByteClassDetector implements BasicClassDetector {
+ @Override
+ public Class<?> detectBasicClass(Class<?> type) {
+ if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
+ return Byte.class;
+ }
+ return null;
+ }
+}
+
+class BooleanClassDetector implements BasicClassDetector {
+ @Override
+ public Class<?> detectBasicClass(Class<?> type) {
+ if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
+ return Boolean.class;
+ }
+ return null;
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
index f9d76a845..2d4d85b0a 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
@@ -24,28 +24,24 @@ public T format(String raw) throws Exception {
}
protected abstract T formatTrimmed(String raw) throws Exception;
-
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
+ public static final List<BasicClassDetector> basicClassDetector= Arrays.asList(new IntegerClassDetector(),
+ new LongClassDetector(),
+ new FloatClassDetector(),
+ new DoubleClassDetector(),
+ new ShortClassDetector(),
+ new ByteClassDetector(),
+ new BooleanClassDetector(),
+ new CharacterClassDetector());
public static Class<?> detectBasicClass(Class<?> type) {
- if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
- return Integer.class;
- } else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
- return Long.class;
- } else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
- return Double.class;
- } else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
- return Float.class;
- } else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
- return Short.class;
- } else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
- return Character.class;
- } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
- return Byte.class;
- } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
- return Boolean.class;
+ for (BasicClassDetector detector : basicClassDetector) {
+ Class<?> detectedClass = detector.detectBasicClass(type);
+ if (detectedClass != null) {
+ return detectedClass;
+ }
}
return type;
}
@@ -146,5 +142,4 @@ public Class clazz() {
}
}
-
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java
index b0f6e7713..6305d7bd6 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java
@@ -10,7 +10,8 @@
*/
public class DateFormatter implements ObjectFormatter {
- private String[] datePatterns = new String[]{"yyyy-MM-dd HH:mm"};
+ public static final String[] DEFAULT_PATTERN = new String[]{"yyyy-MM-dd HH:mm"};
+ private String[] datePatterns = DEFAULT_PATTERN;
@Override
public Date format(String raw) throws Exception {
@@ -24,6 +25,8 @@ public Class clazz() {
@Override
public void initParam(String[] extra) {
- datePatterns = extra;
+ if (extra != null && !(extra.length == 1 && extra[0].length() == 0)) {
+ datePatterns = extra;
+ }
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java
new file mode 100644
index 000000000..4c32dfc62
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java
@@ -0,0 +1,56 @@
+package us.codecraft.webmagic.model.formatter;
+
+import us.codecraft.webmagic.model.annotation.Formatter;
+
+import java.lang.reflect.Field;
+import java.util.List;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.7.0
+ * Date: 2017/6/3
+ */
+public class ObjectFormatterBuilder {
+
+ private Field field;
+
+ public ObjectFormatterBuilder setField(Field field) {
+ this.field = field;
+ return this;
+ }
+
+ private ObjectFormatter initFormatterForType(Class<?> fieldClazz, String[] params) {
+ if (fieldClazz.equals(String.class) || List.class.isAssignableFrom(fieldClazz)){
+ return null;
+ }
+ Class<? extends ObjectFormatter> formatterClass = ObjectFormatters.get(BasicTypeFormatter.detectBasicClass(fieldClazz));
+ if (formatterClass == null) {
+ throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz);
+ }
+ return initFormatter(formatterClass, params);
+ }
+
+ private ObjectFormatter initFormatter(Class<? extends ObjectFormatter> formatterClazz, String[] params) {
+ try {
+ ObjectFormatter objectFormatter = formatterClazz.newInstance();
+ objectFormatter.initParam(params);
+ return objectFormatter;
+ } catch (InstantiationException e) {
+ throw new RuntimeException(e);
+ } catch (IllegalAccessException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public ObjectFormatter build() {
+ Formatter formatter = field.getAnnotation(Formatter.class);
+ if (formatter != null && !formatter.formatter().equals(Formatter.DEFAULT_FORMATTER)) {
+ return initFormatter(formatter.formatter(), formatter.value());
+ }
+ if (formatter == null || formatter.subClazz().equals(Void.class)) {
+ return initFormatterForType(field.getType(), formatter != null ? formatter.value() : null);
+ } else {
+ return initFormatterForType(formatter.subClazz(), formatter.value());
+ }
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
index 7534e5ea8..42747e718 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
@@ -22,9 +22,9 @@ public static void put(Class<? extends ObjectFormatter> objectFormatter) {
try {
formatterMap.put(objectFormatter.newInstance().clazz(), objectFormatter);
} catch (InstantiationException e) {
- e.printStackTrace();
+ throw new RuntimeException(e);
} catch (IllegalAccessException e) {
- e.printStackTrace();
+ throw new RuntimeException(e);
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java
new file mode 100644
index 000000000..146827220
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java
@@ -0,0 +1,68 @@
+package us.codecraft.webmagic.model.sources;
+
+import java.util.List;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.model.FieldExtractor;
+
+/**
+ * Strategy that decides which text a field's selector is applied to.
+ * Each nested implementation feeds a different input to
+ * {@code fieldExtractor.getSelector()}.
+ */
+public interface Source {
+    /**
+     * Extracts a single value for the field.
+     *
+     * @param page the page being processed
+     * @param html the html fragment handed in by the caller
+     * @param isRaw flag consulted by {@link SelectedHtml} to choose between the page document and {@code html}
+     * @param fieldExtractor supplies the selector to apply
+     * @return the selected text
+     */
+    String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
+
+    /**
+     * Extracts all values for the field; parameters as in {@link #getText}.
+     *
+     * @return the selected texts
+     */
+    List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
+
+    /** Always selects against the page's html document; {@code html} and {@code isRaw} are ignored. */
+    class RawHtml implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return page.getHtml().selectDocument(fieldExtractor.getSelector());
+        }
+
+        @Override
+        public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
+        }
+    }
+
+    /** Selects against the page's html document when {@code isRaw} is set, otherwise against {@code html}. */
+    class SelectedHtml implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return isRaw
+                    ? page.getHtml().selectDocument(fieldExtractor.getSelector())
+                    : fieldExtractor.getSelector().select(html);
+        }
+
+        @Override
+        public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return isRaw
+                    ? page.getHtml().selectDocumentForList(fieldExtractor.getSelector())
+                    : fieldExtractor.getSelector().selectList(html);
+        }
+    }
+
+    /** Selects against the string form of the page URL. */
+    class Url implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().select(page.getUrl().toString());
+        }
+
+        @Override
+        public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().selectList(page.getUrl().toString());
+        }
+    }
+
+    /** Selects against the page's raw text. */
+    class RawText implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().select(page.getRawText());
+        }
+
+        @Override
+        public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().selectList(page.getRawText());
+        }
+    }
+
+    /** Selects against the {@code html} argument, regardless of {@code isRaw}. */
+    class DefaultSource implements Source {
+        @Override
+        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().select(html);
+        }
+
+        @Override
+        public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+            return fieldExtractor.getSelector().selectList(html);
+        }
+    }
+}
+
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java
new file mode 100644
index 000000000..1e572695f
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java
@@ -0,0 +1,17 @@
+package us.codecraft.webmagic.model.sources;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.model.FieldExtractor;
+import us.codecraft.webmagic.model.fields.MultipleField;
+import us.codecraft.webmagic.model.fields.PageField;
+import us.codecraft.webmagic.model.fields.SingleField;
+
+/**
+ * Bridges a {@link FieldExtractor} and its {@link Source}: asks the source for the
+ * field text and wraps the result in the matching {@link PageField}.
+ */
+public class SourceTextExtractor {
+    /**
+     * Extracts the text for one field.
+     *
+     * @param page the page being processed
+     * @param html html fragment forwarded to the source
+     * @param isRaw flag forwarded to the source
+     * @param fieldExtractor provides the source and tells whether the field is multi-valued
+     * @return a {@link MultipleField} when {@code fieldExtractor.isMulti()}, otherwise a {@link SingleField}
+     */
+    public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+        Source textSource = fieldExtractor.getSource();
+        if (!fieldExtractor.isMulti()) {
+            return new SingleField(textSource.getText(page, html, isRaw, fieldExtractor));
+        }
+        return new MultipleField(textSource.getTextList(page, html, isRaw, fieldExtractor));
+    }
+}
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
new file mode 100644
index 000000000..50dbcaf1a
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
@@ -0,0 +1,116 @@
+package us.codecraft.webmagic.monitor;
+
+import java.lang.management.ManagementFactory;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import javax.management.InstanceAlreadyExistsException;
+import javax.management.JMException;
+import javax.management.MBeanRegistrationException;
+import javax.management.MBeanServer;
+import javax.management.MalformedObjectNameException;
+import javax.management.NotCompliantMBeanException;
+import javax.management.ObjectName;
+
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.SpiderListener;
+import us.codecraft.webmagic.utils.Experimental;
+import us.codecraft.webmagic.utils.UrlUtils;
+
+/**
+ * Exposes spiders over JMX: each registered {@link Spider} gets a
+ * {@link MonitorSpiderListener} attached and a {@link SpiderStatusMXBean}
+ * registered on the platform MBean server.
+ *
+ * @author code4crafer@gmail.com
+ * @since 0.5.0
+ */
+@Experimental
+public class SpiderMonitor {
+
+    private static final SpiderMonitor INSTANCE = new SpiderMonitor();
+
+    private MBeanServer mbeanServer;
+
+    private String jmxServerName;
+
+    private List spiderStatuses = new ArrayList<>();
+
+    protected SpiderMonitor() {
+        jmxServerName = "WebMagic";
+        mbeanServer = ManagementFactory.getPlatformMBeanServer();
+    }
+
+    /**
+     * @return the shared monitor instance
+     */
+    public static SpiderMonitor instance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Register spider for monitor.
+     * <p>
+     * For every spider a new listener is appended to its listener list (a new list is
+     * installed when the spider has none), then the status bean built from the spider
+     * and the listener is registered and remembered.
+     *
+     * @param spiders spiders
+     * @return this
+     * @throws JMException when the MBean cannot be registered
+     */
+    public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
+        for (Spider spider : spiders) {
+            MonitorSpiderListener listener = new MonitorSpiderListener();
+            if (spider.getSpiderListeners() != null) {
+                spider.getSpiderListeners().add(listener);
+            } else {
+                List freshListeners = new ArrayList<>();
+                freshListeners.add(listener);
+                spider.setSpiderListeners(freshListeners);
+            }
+            SpiderStatusMXBean statusBean = getSpiderStatusMBean(spider, listener);
+            registerMBean(statusBean);
+            spiderStatuses.add(statusBean);
+        }
+        return this;
+    }
+
+    /**
+     * Factory hook for the status bean; returns a {@link SpiderStatus} here.
+     */
+    protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) {
+        return new SpiderStatus(spider, monitorSpiderListener);
+    }
+
+    /**
+     * @return the internal list of status beans registered so far (not a copy)
+     */
+    protected List getSpiderStatuses() {
+        return this.spiderStatuses;
+    }
+
+    /**
+     * Registers the bean under {@code <jmxServerName>:name=<name>} where the name is
+     * {@code spiderStatus.getName()} passed through {@code UrlUtils.removePort}.
+     */
+    protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException {
+        String beanName = jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName());
+        mbeanServer.registerMBean(spiderStatus, new ObjectName(beanName));
+    }
+
+    /**
+     * Listener that counts successful and failed requests and keeps the URLs of the failed ones.
+     */
+    public class MonitorSpiderListener implements SpiderListener {
+
+        private final AtomicInteger successCount = new AtomicInteger(0);
+
+        private final AtomicInteger errorCount = new AtomicInteger(0);
+
+        private List errorUrls = Collections.synchronizedList(new ArrayList());
+
+        @Override
+        public void onSuccess(Request request) {
+            successCount.incrementAndGet();
+        }
+
+        @Override
+        public void onError(Request request, Exception e) {
+            // record the URL first, then bump the counter
+            errorUrls.add(request.getUrl());
+            errorCount.incrementAndGet();
+        }
+
+        public AtomicInteger getSuccessCount() {
+            return successCount;
+        }
+
+        public AtomicInteger getErrorCount() {
+            return errorCount;
+        }
+
+        public List getErrorUrls() {
+            return errorUrls;
+        }
+    }
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
new file mode 100644
index 000000000..69afe042a
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
@@ -0,0 +1,96 @@
+package us.codecraft.webmagic.monitor;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.scheduler.MonitorableScheduler;
+
+import java.util.Date;
+import java.util.List;
+
+/**
+ * {@link SpiderStatusMXBean} implementation backed by a {@link Spider} and the
+ * {@link SpiderMonitor.MonitorSpiderListener} that counts its requests.
+ *
+ * @author code4crafer@gmail.com
+ * @since 0.5.0
+ */
+public class SpiderStatus implements SpiderStatusMXBean {
+
+    protected final Spider spider;
+
+    protected Logger logger = LoggerFactory.getLogger(getClass());
+
+    protected final SpiderMonitor.MonitorSpiderListener monitorSpiderListener;
+
+    public SpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) {
+        this.spider = spider;
+        this.monitorSpiderListener = monitorSpiderListener;
+    }
+
+    /** @return the spider's UUID */
+    @Override
+    public String getName() {
+        return spider.getUUID();
+    }
+
+    /**
+     * @return the scheduler's left-requests count, or -1 (with a warning logged) when the
+     *         scheduler is not a {@link MonitorableScheduler}
+     */
+    @Override
+    public int getLeftPageCount() {
+        if (spider.getScheduler() instanceof MonitorableScheduler) {
+            return ((MonitorableScheduler) spider.getScheduler()).getLeftRequestsCount(spider);
+        }
+        logger.warn("Get leftPageCount fail, try to use a Scheduler implement MonitorableScheduler for monitor count!");
+        return -1;
+    }
+
+    /**
+     * @return the scheduler's total-requests count, or -1 (with a warning logged) when the
+     *         scheduler is not a {@link MonitorableScheduler}
+     */
+    @Override
+    public int getTotalPageCount() {
+        if (spider.getScheduler() instanceof MonitorableScheduler) {
+            return ((MonitorableScheduler) spider.getScheduler()).getTotalRequestsCount(spider);
+        }
+        logger.warn("Get totalPageCount fail, try to use a Scheduler implement MonitorableScheduler for monitor count!");
+        return -1;
+    }
+
+    @Override
+    public int getSuccessPageCount() {
+        return monitorSpiderListener.getSuccessCount().get();
+    }
+
+    @Override
+    public int getErrorPageCount() {
+        return monitorSpiderListener.getErrorCount().get();
+    }
+
+    /** @return the listener's list of URLs that failed */
+    @Override
+    public List getErrorPages() {
+        return monitorSpiderListener.getErrorUrls();
+    }
+
+    @Override
+    public String getStatus() {
+        return spider.getStatus().name();
+    }
+
+    @Override
+    public int getThread() {
+        return spider.getThreadAlive();
+    }
+
+    @Override
+    public void start() {
+        spider.start();
+    }
+
+    @Override
+    public void stop() {
+        spider.stop();
+    }
+
+    @Override
+    public Date getStartTime() {
+        return spider.getStartTime();
+    }
+
+    /**
+     * Average number of successful pages per elapsed second since the start time.
+     *
+     * @return the average, or -1 when there is no start time or less than one second has elapsed
+     */
+    @Override
+    public int getPagePerSecond() {
+        Date startTime = getStartTime();
+        if (startTime == null) {
+            return -1;
+        }
+        // Divide in long arithmetic before narrowing: casting the millisecond difference
+        // to int first overflows once the spider has run for about 24.8 days.
+        long runSeconds = (System.currentTimeMillis() - startTime.getTime()) / 1000;
+        if (runSeconds == 0) {
+            return -1;
+        }
+        return (int) (getSuccessPageCount() / runSeconds);
+    }
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java
new file mode 100644
index 000000000..e49ff8fc4
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java
@@ -0,0 +1,35 @@
+package us.codecraft.webmagic.monitor;
+
+import java.util.Date;
+import java.util.List;
+
+/**
+ * JMX management interface describing one spider: identity, progress counters,
+ * failed pages, and start/stop operations.
+ *
+ * @author code4crafer@gmail.com
+ * @since 0.5.0
+ */
+public interface SpiderStatusMXBean {
+
+    /** @return the name identifying the spider */
+    String getName();
+
+    /** @return the spider's current status */
+    String getStatus();
+
+    /** @return the thread figure reported for the spider */
+    int getThread();
+
+    /** @return the total number of pages */
+    int getTotalPageCount();
+
+    /** @return the number of pages still left */
+    int getLeftPageCount();
+
+    /** @return the number of pages processed successfully */
+    int getSuccessPageCount();
+
+    /** @return the number of pages that failed */
+    int getErrorPageCount();
+
+    /** @return the pages that failed */
+    List getErrorPages();
+
+    /** Starts the spider. */
+    void start();
+
+    /** Stops the spider. */
+    void stop();
+
+    /** @return when the spider was started */
+    Date getStartTime();
+
+    /** @return pages processed per second */
+    int getPagePerSecond();
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
index c4826e2ba..0db9b819d 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
@@ -37,7 +37,7 @@ public FilePageModelPipeline(String path) {
@Override
public void process(Object o, Task task) {
- String path = this.path + "/" + task.getUUID() + "/";
+ String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
try {
String filename;
if (o instanceof HasKey) {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
index 1583b0cab..7a7f80a25 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
@@ -38,7 +38,7 @@ public JsonFilePageModelPipeline(String path) {
@Override
public void process(Object o, Task task) {
- String path = this.path + "/" + task.getUUID() + "/";
+ String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
try {
String filename;
if (o instanceof HasKey) {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
index b6c55afc3..3ff42bf10 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
@@ -8,7 +8,6 @@
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
-import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
@@ -36,9 +35,9 @@ public JsonFilePipeline(String path) {
@Override
public void process(ResultItems resultItems, Task task) {
- String path = this.path + "/" + task.getUUID() + "/";
+ String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
try {
- PrintWriter printWriter = new PrintWriter(new FileWriter(new File(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
+ PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
printWriter.write(JSON.toJSONString(resultItems.getAll()));
printWriter.close();
} catch (IOException e) {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java
index 5806602c7..32d83541e 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java
@@ -36,51 +36,61 @@ public void process(ResultItems resultItems, Task task) {
private void handleObject(Iterator> iterator) {
Map.Entry objectEntry = iterator.next();
Object o = objectEntry.getValue();
+ // this object must be assembled from several pages
if (o instanceof MultiPageModel) {
MultiPageModel multiPageModel = (MultiPageModel) o;
- pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE);
- if (multiPageModel.getOtherPages() != null) {
- for (String otherPage : multiPageModel.getOtherPages()) {
- Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage);
- if (aBoolean == null) {
- pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE);
+ // the part handled in this call: recorded as FALSE here, then set to TRUE inside the lock below
+ pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.FALSE);
+ // lock separately for each page key
+ synchronized (pageMap.get(multiPageModel.getPageKey())) {
+ pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE);
+ // the other parts that still need to be assembled
+ if (multiPageModel.getOtherPages() != null) {
+ for (String otherPage : multiPageModel.getOtherPages()) {
+ Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage);
+ if (aBoolean == null) {
+ pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE);
+ }
}
}
- }
- //check if all pages are processed
- Map booleanMap = pageMap.get(multiPageModel.getPageKey());
- objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel);
- if (booleanMap == null) {
- return;
- }
- for (Map.Entry stringBooleanEntry : booleanMap.entrySet()) {
- if (!stringBooleanEntry.getValue()) {
- iterator.remove();
+ //check if all pages are processed
+ Map booleanMap = pageMap.get(multiPageModel.getPageKey());
+ objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel);
+ if (booleanMap == null) {
return;
}
- }
- List> entryList = new ArrayList>();
- entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet());
- if (entryList.size() != 0) {
- Collections.sort(entryList, new Comparator>() {
- @Override
- public int compare(Map.Entry o1, Map.Entry o2) {
- try {
- int i1 = Integer.parseInt(o1.getKey());
- int i2 = Integer.parseInt(o2.getKey());
- return i1 - i2;
- } catch (NumberFormatException e) {
- return o1.getKey().compareTo(o2.getKey());
+ // filter: if any part of this item is still incomplete, drop the item so it does not enter the next pipeline
+ for (Map.Entry stringBooleanEntry : booleanMap.entrySet()) {
+ if (!stringBooleanEntry.getValue()) {
+ iterator.remove();
+ return;
+ }
+ }
+ List> entryList = new ArrayList>();
+ entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet());
+ if (entryList.size() != 0) {
+ Collections.sort(entryList, new Comparator>() {
+ @Override
+ public int compare(Map.Entry o1, Map.Entry o2) {
+ try {
+ int i1 = Integer.parseInt(o1.getKey());
+ int i2 = Integer.parseInt(o2.getKey());
+ return i1 - i2;
+ } catch (NumberFormatException e) {
+ return o1.getKey().compareTo(o2.getKey());
+ }
}
+ });
+ // merge all parts into one model
+ MultiPageModel value = entryList.get(0).getValue();
+ for (int i = 1; i < entryList.size(); i++) {
+ value = value.combine(entryList.get(i).getValue());
}
- });
- MultiPageModel value = entryList.get(0).getValue();
- for (int i = 1; i < entryList.size(); i++) {
- value = value.combine(entryList.get(i).getValue());
+ objectEntry.setValue(value);
}
- objectEntry.setValue(value);
}
}
+
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java
new file mode 100644
index 000000000..db8430237
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java
@@ -0,0 +1,77 @@
+package us.codecraft.webmagic.scheduler;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 16/12/18
+ * Time: 10:23 AM
+ */
+
+import com.google.common.hash.BloomFilter;
+import com.google.common.hash.Funnels;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.nio.charset.Charset;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * BloomFilterDuplicateRemover for huge number of urls.
+ * <p>
+ * A URL is reported as duplicate when the Bloom filter might already contain it,
+ * so false positives occur with roughly the configured probability.
+ *
+ * @author code4crafer@gmail.com
+ * @since 0.5.1
+ */
+public class BloomFilterDuplicateRemover implements DuplicateRemover {
+
+    private int expectedInsertions;
+
+    private double fpp;
+
+    /** Number of URLs accepted as new since the last rebuild. */
+    private final AtomicInteger counter = new AtomicInteger(0);
+
+    /**
+     * Current filter. Not final because {@link #resetDuplicateCheck} swaps in a fresh one;
+     * volatile so the replacement is visible to other threads.
+     */
+    private volatile BloomFilter bloomFilter;
+
+    public BloomFilterDuplicateRemover(int expectedInsertions) {
+        this(expectedInsertions, 0.01);
+    }
+
+    /**
+     *
+     * @param expectedInsertions the number of expected insertions to the constructed
+     * @param fpp the desired false positive probability (must be positive and less than 1.0)
+     */
+    public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
+        this.expectedInsertions = expectedInsertions;
+        this.fpp = fpp;
+        this.bloomFilter = rebuildBloomFilter();
+    }
+
+    /**
+     * Resets the counter and creates an empty filter with the configured sizing.
+     * The caller is responsible for installing the returned filter.
+     *
+     * @return a new, empty filter
+     */
+    protected BloomFilter rebuildBloomFilter() {
+        counter.set(0);
+        return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
+    }
+
+    @Override
+    public boolean isDuplicate(Request request, Task task) {
+        // read the field once so a concurrent reset cannot split the check and the put across two filters
+        BloomFilter filter = bloomFilter;
+        String url = getUrl(request);
+        boolean isDuplicate = filter.mightContain(url);
+        if (!isDuplicate) {
+            filter.put(url);
+            counter.incrementAndGet();
+        }
+        return isDuplicate;
+    }
+
+    /**
+     * @return the key stored in the filter for the request; the request URL here
+     */
+    protected String getUrl(Request request) {
+        return request.getUrl();
+    }
+
+    /**
+     * Forgets every URL seen so far by installing a fresh filter.
+     * Previously the rebuilt filter was discarded, so only the counter was reset.
+     */
+    @Override
+    public void resetDuplicateCheck(Task task) {
+        this.bloomFilter = rebuildBloomFilter();
+    }
+
+    @Override
+    public int getTotalRequestsCount(Task task) {
+        return counter.get();
+    }
+}
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
index 38e8a799a..0dabdd954 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
@@ -1,31 +1,22 @@
package us.codecraft.webmagic.scheduler;
-import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.io.*;
-import java.util.LinkedHashSet;
-import java.util.Set;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.Executors;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.TimeUnit;
+import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
+
/**
* Store urls and cursor in files so that a Spider can resume the status when shutdown.
*
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-public class FileCacheQueueScheduler implements Scheduler {
-
- private Logger logger = LoggerFactory.getLogger(getClass());
+public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, Closeable {
private String filePath = System.getProperty("java.io.tmpdir");
@@ -45,13 +36,14 @@ public class FileCacheQueueScheduler implements Scheduler {
private BlockingQueue queue;
- private Set urls;
+ private ScheduledExecutorService flushThreadPool;
public FileCacheQueueScheduler(String filePath) {
if (!filePath.endsWith("/") && !filePath.endsWith("\\")) {
filePath += "/";
}
this.filePath = filePath;
+ initDuplicateRemover();
}
private void flush() {
@@ -72,13 +64,14 @@ private void init(Task task) {
logger.info("init cache scheduler success");
}
+ /**
+  * Installs a {@link BloomFilterDuplicateRemover} as this scheduler's duplicate remover.
+  */
+ private void initDuplicateRemover() {
+ // NOTE(review): the constructor argument is the filter's expectedInsertions, but
+ // String.hashCode() is an arbitrary int that may be negative or very large.
+ // Confirm this sizing is intended.
+ BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(this.filePath.hashCode());
+ setDuplicateRemover(bloomFilterDuplicateRemover);
+ }
+
private void initFlushThread() {
- Executors.newScheduledThreadPool(1).scheduleAtFixedRate(new Runnable() {
- @Override
- public void run() {
- flush();
- }
- }, 10, 10, TimeUnit.SECONDS);
+ flushThreadPool = Executors.newScheduledThreadPool(1);
+ flushThreadPool.scheduleAtFixedRate(this::flush, 10, 10, TimeUnit.SECONDS);
}
private void initWriter() {
@@ -93,9 +86,9 @@ private void initWriter() {
private void readFile() {
try {
queue = new LinkedBlockingQueue();
- urls = new LinkedHashSet();
readCursorFile();
readUrlFile();
+ // initDuplicateRemover();
} catch (FileNotFoundException e) {
//init
logger.info("init cache file " + getFileName(fileUrlAllName));
@@ -105,58 +98,55 @@ private void readFile() {
}
private void readUrlFile() throws IOException {
- String line;
- BufferedReader fileUrlReader = null;
- try {
- fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)));
+ try (BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)))) {
+ String line;
int lineReaded = 0;
while ((line = fileUrlReader.readLine()) != null) {
- urls.add(line.trim());
+ Request request = deserializeRequest(line);
+ this.getDuplicateRemover().isDuplicate(request, null);
lineReaded++;
if (lineReaded > cursor.get()) {
- queue.add(new Request(line));
+ queue.add(request);
}
}
- } finally {
- if (fileUrlReader != null) {
- IOUtils.closeQuietly(fileUrlReader);
- }
}
}
+ /**
+  * Restores {@code cursor} from the last non-blank line of the cursor file.
+  * The cursor is left untouched when the file has no non-blank line.
+  *
+  * @throws IOException if the cursor file cannot be opened or read
+  */
private void readCursorFile() throws IOException {
- BufferedReader fileCursorReader = null;
- try {
- fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
+ String fileName = getFileName(fileCursor);
+ try (BufferedReader fileCursorReader = new BufferedReader(new FileReader(fileName))) {
String line;
+ String lastLine = null;
//read the last number
while ((line = fileCursorReader.readLine()) != null) {
- cursor = new AtomicInteger(NumberUtils.toInt(line));
+ line = line.trim();
+ if (!line.isEmpty()) {
+ lastLine = line;
+ }
}
- } finally {
- if (fileCursorReader != null) {
- IOUtils.closeQuietly(fileCursorReader);
+ if (lastLine != null) {
+ // line is always null once the loop has ended, so the value must come from lastLine
+ cursor.set(NumberUtils.toInt(lastLine));
}
}
}
+ /**
+  * Shuts down the periodic flush executor and closes the URL and cursor writers.
+  * Each resource is skipped when it has not been created yet, so calling this
+  * before the scheduler was initialised does not throw.
+  *
+  * @throws IOException declared by the close contract
+  */
+ @Override
+ public void close() throws IOException {
+     if (flushThreadPool != null) {
+         flushThreadPool.shutdown();
+     }
+     if (fileUrlWriter != null) {
+         fileUrlWriter.close();
+     }
+     if (fileCursorWriter != null) {
+         fileCursorWriter.close();
+     }
+ }
+
private String getFileName(String filename) {
return filePath + task.getUUID() + filename;
}
@Override
- public synchronized void push(Request request, Task task) {
+ protected void pushWhenNoDuplicate(Request request, Task task) {
if (!inited.get()) {
init(task);
}
- if (logger.isDebugEnabled()) {
- logger.debug("push to queue " + request.getUrl());
- }
- if (urls.add(request.getUrl())) {
- queue.add(request);
- fileUrlWriter.println(request.getUrl());
- }
-
+ queue.add(request);
+ fileUrlWriter.println(serializeRequest(request));
}
@Override
@@ -167,4 +157,23 @@ public synchronized Request poll(Task task) {
fileCursorWriter.println(cursor.incrementAndGet());
return queue.poll();
}
+
+ @Override
+ public int getLeftRequestsCount(Task task) {
+ return queue.size();
+ }
+
+ @Override
+ public int getTotalRequestsCount(Task task) {
+ return getDuplicateRemover().getTotalRequestsCount(task);
+ }
+
+ protected String serializeRequest(Request request) {
+ return request.getUrl();
+ }
+
+ protected Request deserializeRequest(String line) {
+ return new Request(line);
+ }
+
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
new file mode 100644
index 000000000..7abe5bfad
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
@@ -0,0 +1,121 @@
+package us.codecraft.webmagic.scheduler;
+
+import java.util.Set;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSON;
+
+import redis.clients.jedis.Jedis;
+import redis.clients.jedis.JedisPool;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+
+/**
+ * the redis scheduler with priority
+ * <p>
+ * Requests with a positive priority go to one sorted set, requests with a negative
+ * priority to another, and requests with priority 0 to a list. Polling drains the
+ * positive set first, then the list, then the negative set.
+ *
+ * @author sai
+ * Created by sai on 16-5-27.
+ */
+public class RedisPriorityScheduler extends RedisScheduler {
+
+    private static final String ZSET_PREFIX = "zset_";
+
+    private static final String QUEUE_PREFIX = "queue_";
+
+    // "zore" (sic) is part of the Redis key name; kept verbatim so keys written earlier stay readable
+    private static final String NO_PRIORITY_SUFFIX = "_zore";
+
+    private static final String PLUS_PRIORITY_SUFFIX = "_plus";
+
+    private static final String MINUS_PRIORITY_SUFFIX = "_minus";
+
+    public RedisPriorityScheduler(String host) {
+        super(host);
+    }
+
+    public RedisPriorityScheduler(JedisPool pool) {
+        super(pool);
+    }
+
+    /**
+     * Stores the request URL in the structure matching its priority and, when the
+     * request has extras, saves the serialised request alongside.
+     */
+    @Override
+    protected void pushWhenNoDuplicate(Request request, Task task) {
+        try (Jedis jedis = pool.getResource()) {
+            if (request.getPriority() > 0) {
+                jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
+            } else if (request.getPriority() < 0) {
+                jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
+            } else {
+                // NOTE(review): lpush here combined with lpop in getRequest makes the zero-priority list LIFO - confirm intended
+                jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
+            }
+
+            setExtrasInItem(jedis, request, task);
+        }
+    }
+
+    /**
+     * @return the next request, or {@code null} when no non-blank URL is available
+     */
+    @Override
+    public synchronized Request poll(Task task) {
+        try (Jedis jedis = pool.getResource()) {
+            String url = getRequest(jedis, task);
+            if (StringUtils.isBlank(url)) {
+                return null;
+            }
+            return getExtrasInItem(jedis, url, task);
+        }
+    }
+
+    /**
+     * Takes the next URL: highest score of the positive set, else head of the
+     * zero-priority list, else highest score of the negative set.
+     * <p>
+     * NOTE(review): reading with zrevrange and removing with zrem are two separate
+     * commands; presumably another client could read the same member in between - confirm.
+     */
+    private String getRequest(Jedis jedis, Task task) {
+        String url;
+        Set<String> urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
+        if (urls.isEmpty()) {
+            url = jedis.lpop(getQueueNoPriorityKey(task));
+            if (StringUtils.isBlank(url)) {
+                urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
+                if (!urls.isEmpty()) {
+                    url = urls.iterator().next();
+                    jedis.zrem(getZsetMinusPriorityKey(task), url);
+                }
+            }
+        } else {
+            url = urls.iterator().next();
+            jedis.zrem(getZsetPlusPriorityKey(task), url);
+        }
+        return url;
+    }
+
+    @Override
+    public void resetDuplicateCheck(Task task) {
+        try (Jedis jedis = pool.getResource()) {
+            jedis.del(getSetKey(task));
+        }
+    }
+
+    private String getZsetPlusPriorityKey(Task task) {
+        return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
+    }
+
+    private String getQueueNoPriorityKey(Task task) {
+        return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
+    }
+
+    private String getZsetMinusPriorityKey(Task task) {
+        return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
+    }
+
+    /**
+     * Saves the JSON form of the request under the SHA-1 hex of its URL, but only
+     * when the request has extras.
+     */
+    private void setExtrasInItem(Jedis jedis, Request request, Task task) {
+        if (!request.getExtras().isEmpty()) {
+            String field = DigestUtils.sha1Hex(request.getUrl());
+            String value = JSON.toJSONString(request);
+            jedis.hset(getItemKey(task), field, value);
+        }
+    }
+
+    /**
+     * Loads the stored request for {@code url}, falling back to a plain request
+     * when nothing was stored.
+     */
+    private Request getExtrasInItem(Jedis jedis, String url, Task task) {
+        String field = DigestUtils.sha1Hex(url);
+        // Read through the String API, mirroring the hset above: the former byte[] round trip
+        // decoded the value with the platform default charset.
+        String json = jedis.hget(getItemKey(task), field);
+        if (json != null) {
+            return JSON.parseObject(json, Request.class);
+        }
+        return new Request(url);
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
index cd9062556..8d61bea3b 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
@@ -1,12 +1,16 @@
package us.codecraft.webmagic.scheduler;
-import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSON;
+
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
/**
* Use Redis as url scheduler for distributed crawlers.
@@ -14,9 +18,9 @@
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-public class RedisScheduler implements Scheduler {
+public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {
- private JedisPool pool;
+ protected JedisPool pool;
private static final String QUEUE_PREFIX = "queue_";
@@ -25,45 +29,80 @@ public class RedisScheduler implements Scheduler {
private static final String ITEM_PREFIX = "item_";
public RedisScheduler(String host) {
- pool = new JedisPool(new JedisPoolConfig(), host);
+ this(new JedisPool(new JedisPoolConfig(), host));
}
public RedisScheduler(JedisPool pool) {
this.pool = pool;
+ setDuplicateRemover(this);
+ }
+
+ @Override
+ public void resetDuplicateCheck(Task task) {
+ try (Jedis jedis = pool.getResource()) {
+ jedis.del(getSetKey(task));
+ }
+ }
+
+ @Override
+ public boolean isDuplicate(Request request, Task task) {
+ try (Jedis jedis = pool.getResource()) {
+ return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
+ }
+
}
@Override
- public synchronized void push(Request request, Task task) {
+ protected void pushWhenNoDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource();
try {
- // if cycleRetriedTimes is set, allow duplicated.
- Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES);
- // use set to remove duplicate url
- if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
- // use list to store queue
- jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
- jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
- if (request.getExtras() != null) {
- String field = DigestUtils.shaHex(request.getUrl());
- String value = JSON.toJSONString(request);
- jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
- }
+ jedis.rpush(getQueueKey(task), request.getUrl());
+ if (checkForAdditionalInfo(request)) {
+ String field = DigestUtils.sha1Hex(request.getUrl());
+ String value = JSON.toJSONString(request);
+ jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
}
} finally {
- pool.returnResource(jedis);
+ jedis.close();
}
}
+ private boolean checkForAdditionalInfo(Request request) {
+ if (request == null) {
+ return false;
+ }
+
+ if (!request.getHeaders().isEmpty() || !request.getCookies().isEmpty()) {
+ return true;
+ }
+
+ if (StringUtils.isNotBlank(request.getCharset()) || StringUtils.isNotBlank(request.getMethod())) {
+ return true;
+ }
+
+ if (request.isBinaryContent() || request.getRequestBody() != null) {
+ return true;
+ }
+
+ if (!request.getExtras().isEmpty()) {
+ return true;
+ }
+ if (request.getPriority() != 0L) {
+ return true;
+ }
+
+ return false;
+ }
+
@Override
public synchronized Request poll(Task task) {
- Jedis jedis = pool.getResource();
- try {
- String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
+ try (Jedis jedis = pool.getResource()) {
+ String url = jedis.lpop(getQueueKey(task));
if (url == null) {
return null;
}
String key = ITEM_PREFIX + task.getUUID();
- String field = DigestUtils.shaHex(url);
+ String field = DigestUtils.sha1Hex(url);
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes), Request.class);
@@ -71,8 +110,34 @@ public synchronized Request poll(Task task) {
}
Request request = new Request(url);
return request;
- } finally {
- pool.returnResource(jedis);
+ }
+ }
+
+ protected String getSetKey(Task task) {
+ return SET_PREFIX + task.getUUID();
+ }
+
+ protected String getQueueKey(Task task) {
+ return QUEUE_PREFIX + task.getUUID();
+ }
+
+ protected String getItemKey(Task task) {
+ return ITEM_PREFIX + task.getUUID();
+ }
+
+ @Override
+ public int getLeftRequestsCount(Task task) {
+ try (Jedis jedis = pool.getResource()) {
+ Long size = jedis.llen(getQueueKey(task));
+ return size.intValue();
+ }
+ }
+
+ @Override
+ public int getTotalRequestsCount(Task task) {
+ try (Jedis jedis = pool.getResource()) {
+ Long size = jedis.scard(getSetKey(task));
+ return size.intValue();
}
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java
new file mode 100644
index 000000000..ed22a4e9b
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java
@@ -0,0 +1,26 @@
+package us.codecraft.webmagic.utils;
+
+import java.lang.reflect.Field;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.5.0
+ */
+public abstract class ClassUtils {
+
+ public static Set getFieldsIncludeSuperClass(Class clazz) {
+ Set fields = new LinkedHashSet();
+ Class current = clazz;
+ while (current != null) {
+ Field[] currentFields = current.getDeclaredFields();
+ for (Field currentField : currentFields) {
+ fields.add(currentField);
+ }
+ current = current.getSuperclass();
+ }
+ return fields;
+ }
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
old mode 100755
new mode 100644
index 92c05c8d9..7695c66f7
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
@@ -30,7 +30,8 @@ private void init() {
/**
* init map with protoMapClass
*
- * @param protoMapClass
+ * @param map the original map to contain the DoubleKeyMap
+ * @param protoMapClass protoMapClass
*/
@SuppressWarnings("rawtypes")
public DoubleKeyMap(Map> map, Class extends Map> protoMapClass) {
@@ -40,7 +41,7 @@ public DoubleKeyMap(Map> map, Class extends Map> protoMapClass)
}
/**
- * @param key
+ * @param key key
* @return map
*/
public Map get(K1 key) {
@@ -48,8 +49,8 @@ public Map get(K1 key) {
}
/**
- * @param key1
- * @param key2
+ * @param key1 key1
+ * @param key2 key2
* @return value
*/
public V get(K1 key1, K2 key2) {
@@ -61,8 +62,8 @@ public V get(K1 key1, K2 key2) {
/**
- * @param key1
- * @param submap
+ * @param key1 key1
+ * @param submap submap
* @return value
*/
public V put(K1 key1, Map submap) {
@@ -70,24 +71,25 @@ public V put(K1 key1, Map submap) {
}
/**
- * @param key1
- * @param key2
- * @param value
+ * @param key1 key1
+ * @param key2 key2
+ * @param value value
* @return value
*/
- public V put(K1 key1, K2 key2, V value) {
+ public synchronized V put(K1 key1, K2 key2, V value) {
if (map.get(key1) == null) {
+ // Without locking, multiple threads could all reach this point.
map.put(key1, this.newMap());
}
return get(key1).put(key2, value);
}
/**
- * @param key1
- * @param key2
+ * @param key1 key1
+ * @param key2 key2
* @return value
*/
- public V remove(K1 key1, K2 key2) {
+ public synchronized V remove(K1 key1, K2 key2) {
if (get(key1) == null) {
return null;
}
@@ -99,8 +101,8 @@ public V remove(K1 key1, K2 key2) {
}
/**
- * @param key1
- * @return
+ * @param key1 key1
+ * @return map
*/
public Map remove(K1 key1) {
Map remove = map.remove(key1);
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
index 0818fde27..d3fc42313 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
@@ -25,23 +25,13 @@ public static Selector getSelector(ExtractBy extractBy) {
selector = new RegexSelector(value);
break;
case XPath:
- selector = getXpathSelector(value);
+ selector = new XpathSelector(value);
break;
case JsonPath:
selector = new JsonPathSelector(value);
break;
default:
- selector = getXpathSelector(value);
- }
- return selector;
- }
-
- private static Selector getXpathSelector(String value) {
- Selector selector;
- if (EnvironmentUtil.useXsoup()) {
- selector = new XsoupSelector(value);
- } else {
- selector = new XpathSelector(value);
+ selector = new XpathSelector(value);
}
return selector;
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java
new file mode 100644
index 000000000..3d416964b
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java
@@ -0,0 +1,36 @@
+package us.codecraft.webmagic.utils;
+
+import java.net.Inet6Address;
+import java.net.InetAddress;
+import java.net.NetworkInterface;
+import java.net.SocketException;
+import java.util.Enumeration;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.5.0
+ */
+public abstract class IPUtils {
+
+ public static String getFirstNoLoopbackIPAddresses() throws SocketException {
+
+ Enumeration networkInterfaces = NetworkInterface.getNetworkInterfaces();
+
+ InetAddress localAddress = null;
+ while (networkInterfaces.hasMoreElements()) {
+ NetworkInterface networkInterface = networkInterfaces.nextElement();
+ Enumeration inetAddresses = networkInterface.getInetAddresses();
+ while (inetAddresses.hasMoreElements()) {
+ InetAddress address = inetAddresses.nextElement();
+ if (!address.isLoopbackAddress() && !Inet6Address.class.isInstance(address)) {
+ return address.getHostAddress();
+ } else if (!address.isLoopbackAddress()) {
+ localAddress = address;
+ }
+ }
+ }
+
+ return localAddress.getHostAddress();
+ }
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
old mode 100755
new mode 100644
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java
new file mode 100644
index 000000000..135dc40d7
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java
@@ -0,0 +1,37 @@
+package us.codecraft.webmagic.utils;
+
+import us.codecraft.webmagic.Request;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 2017/6/5
+ * Time: 4:58 PM
+ */
+public abstract class RequestUtils {
+
+ private static Pattern p4Range = Pattern.compile("\\[(\\d+)\\-(\\d+)\\]");
+
+ public static List from(String exp){
+ Matcher matcher = p4Range.matcher(exp);
+ if (!matcher.find()) {
+ return Collections.singletonList(new Request(exp));
+ }
+ int rangeFrom = Integer.parseInt(matcher.group(1));
+ int rangeTo = Integer.parseInt(matcher.group(2));
+ if (rangeFrom > rangeTo) {
+ return Collections.emptyList();
+ }
+ List requests = new ArrayList(rangeTo - rangeFrom + 1);
+ for (int i = rangeFrom; i <= rangeTo; i++) {
+ requests.add(new Request(matcher.replaceAll(String.valueOf(i))));
+ }
+ return requests;
+ }
+
+}
diff --git a/webmagic-extension/src/main/resources/crawl.js b/webmagic-extension/src/main/resources/crawl.js
new file mode 100644
index 000000000..c9cf01cd1
--- /dev/null
+++ b/webmagic-extension/src/main/resources/crawl.js
@@ -0,0 +1,17 @@
+var system = require('system');
+var url = system.args[1];
+
+var page = require('webpage').create();
+page.settings.loadImages = false;
+page.settings.resourceTimeout = 5000;
+
+page.open(url, function (status) {
+ if (status != 'success') {
+ console.log("HTTP request failed!");
+ } else {
+ console.log(page.content);
+ }
+
+ page.close();
+ phantom.exit();
+});
\ No newline at end of file
diff --git a/webmagic-extension/src/main/resources/spider-config-draft.xml b/webmagic-extension/src/main/resources/spider-config-draft.xml
new file mode 100644
index 000000000..85aee4db1
--- /dev/null
+++ b/webmagic-extension/src/main/resources/spider-config-draft.xml
@@ -0,0 +1,29 @@
+
+
+
+ utf-8
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java
new file mode 100644
index 000000000..41a33cd17
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java
@@ -0,0 +1,89 @@
+package us.codecraft.webmagic;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import us.codecraft.webmagic.model.AfterExtractor;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 2017/6/3
+ * Time: 2:54 PM
+ */
+public class SimpleHttpClientTest {
+
+ public static class Weather implements AfterExtractor {
+
+ private String location;
+
+ @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='tem']/i/regex('([\\-\\d]+)',1)")
+ private Integer lowTemperature;
+
+ @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='tem']/span/regex('([\\-\\d]+)',1)")
+ private Integer highTemperature;
+
+ @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='wea']/text()")
+ private String desc;
+
+ @Override
+ public void afterProcess(Page page) {
+ if (lowTemperature > highTemperature) {
+ int temp = lowTemperature;
+ lowTemperature = highTemperature;
+ highTemperature = temp;
+ }
+ }
+
+ public String getLocation() {
+ return location;
+ }
+
+ public void setLocation(String location) {
+ this.location = location;
+ }
+
+ public Integer getLowTemperature() {
+ return lowTemperature;
+ }
+
+ public void setLowTemperature(Integer lowTemperature) {
+ this.lowTemperature = lowTemperature;
+ }
+
+ public Integer getHighTemperature() {
+ return highTemperature;
+ }
+
+ public void setHighTemperature(Integer highTemperature) {
+ this.highTemperature = highTemperature;
+ }
+
+ public String getDesc() {
+ return desc;
+ }
+
+ public void setDesc(String desc) {
+ this.desc = desc;
+ }
+
+ @Override
+ public String toString() {
+ return "Weather{" +
+ "location='" + location + '\'' +
+ ", lowTemperature=" + lowTemperature +
+ ", highTemperature=" + highTemperature +
+ ", desc='" + desc + '\'' +
+ '}';
+ }
+ }
+
+ @Ignore
+ @Test
+ public void test() throws Exception {
+ Weather weather = new SimpleHttpClient(Site.me()).get("http://www.weather.com.cn/weather/101020100.shtml", Weather.class);
+ assertThat(weather).isNotNull();
+ }
+
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java
new file mode 100644
index 000000000..c2081dbf3
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java
@@ -0,0 +1,38 @@
+package us.codecraft.webmagic.configurable;
+
+import org.junit.Test;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.downloader.MockGithubDownloader;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class ConfigurablePageProcessorTest {
+
+ @Test
+ public void test() throws Exception {
+ List extractRules = new ArrayList();
+ ExtractRule extractRule = new ExtractRule();
+ extractRule.setExpressionType(ExpressionType.XPath);
+ extractRule.setExpressionValue("//title");
+ extractRule.setFieldName("title");
+ extractRules.add(extractRule);
+ extractRule = new ExtractRule();
+ extractRule.setExpressionType(ExpressionType.XPath);
+ extractRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
+ extractRule.setFieldName("star");
+ extractRules.add(extractRule);
+ ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules))
+ .setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic");
+ assertThat(resultItems.getAll()).containsEntry("title", "code4craft/webmagic · GitHub ");
+ assertThat(resultItems.getAll()).containsEntry("star", " 86 ");
+
+ }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java
deleted file mode 100644
index f73b34401..000000000
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java
+++ /dev/null
@@ -1,18 +0,0 @@
-package us.codecraft.webmagic.downloader;
-
-import org.junit.Ignore;
-import org.junit.Test;
-import us.codecraft.webmagic.Spider;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class FileCacheTest {
-
- @Ignore("takes long")
- @Test
- public void test() {
- FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*");
- Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run();
- }
-}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 6baee728a..bb18aa2c5 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -3,7 +3,6 @@
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
/**
@@ -937,7 +936,9 @@ public class MockGithubDownloader implements Downloader{
@Override
public Page download(Request request, Task task) {
Page page = new Page();
- page.setHtml(new Html(html));
+ page.setRawText(html);
+ page.setStatusCode(200);
+ page.setDownloadSuccess(true);
page.setRequest(new Request("https://github.com/code4craft/webmagic"));
page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
return page;
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java
index a621e2dcb..fa276cbbf 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java
@@ -1,10 +1,14 @@
package us.codecraft.webmagic.formatter;
+import org.apache.commons.lang3.time.DateFormatUtils;
+import org.apache.commons.lang3.time.DateUtils;
import org.junit.Test;
import us.codecraft.webmagic.model.formatter.DateFormatter;
import java.util.Date;
+import static org.assertj.core.api.Assertions.assertThat;
+
/**
* @author code4crafter@gmail.com
*/
@@ -13,8 +17,10 @@ public class DateFormatterTest {
@Test
public void testDateFormatter() throws Exception {
DateFormatter dateFormatter = new DateFormatter();
- dateFormatter.initParam(new String[]{"yyyy-MM-dd HH:mm"});
- Date format = dateFormatter.format("2013-09-10 22:11");
- System.out.println(format);
+ String pattern = "yyyy-MM-dd HH:mm";
+ Date date = DateUtils.parseDate("2013-09-10 22:11", new String[]{pattern});
+ dateFormatter.initParam(new String[]{pattern});
+ Date format = dateFormatter.format(DateFormatUtils.format(date, pattern));
+ assertThat(format).isEqualTo(date);
}
}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java
new file mode 100644
index 000000000..2d9cf94c8
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java
@@ -0,0 +1,12 @@
+package us.codecraft.webmagic.model;
+
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class BaseRepo {
+
+ @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()")
+ protected int star;
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java
new file mode 100644
index 000000000..d825a1fc4
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java
@@ -0,0 +1,32 @@
+package us.codecraft.webmagic.model;
+
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.HelpUrl;
+import us.codecraft.webmagic.model.annotation.TargetUrl;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.3.2
+ */
+@TargetUrl("https://github.com/\\w+/\\w+")
+@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
+public class GithubRepo extends BaseRepo{
+
+ @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()")
+ private int fork;
+
+ public static void main(String[] args) {
+ OOSpider.create(Site.me().setSleepTime(100)
+ , new ConsolePageModelPipeline(), GithubRepo.class)
+ .addUrl("https://github.com/code4craft").thread(10).run();
+ }
+
+ public int getStar() {
+ return star;
+ }
+
+ public int getFork() {
+ return fork;
+ }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java
new file mode 100644
index 000000000..37506451e
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java
@@ -0,0 +1,18 @@
+package us.codecraft.webmagic.model;
+
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 2017/6/3
+ * Time: 9:07 PM
+ */
+public class GithubRepoApi {
+
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name",source = ExtractBy.Source.RawText)
+ private String name;
+
+ public String getName() {
+ return name;
+ }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java
index 85b6858d2..632dd8697 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java
@@ -1,13 +1,14 @@
package us.codecraft.webmagic.model;
-import junit.framework.Assert;
import org.junit.Test;
-import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.example.GithubRepo;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
+import static org.assertj.core.api.Assertions.assertThat;
+
/**
* @author code4crafter@gmail.com
*/
@@ -15,13 +16,14 @@ public class GithubRepoTest {
@Test
public void test() {
- OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0)
+ OOSpider.create(Site.me().setSleepTime(0)
, new PageModelPipeline() {
@Override
public void process(GithubRepo o, Task task) {
- Assert.assertEquals(86, o.getStar());
- Assert.assertEquals(70, o.getFork());
+ assertThat(o.getStar()).isEqualTo(86);
+ assertThat(o.getFork()).isEqualTo(70);
}
- }, GithubRepo.class).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
+ }, GithubRepo.class).addUrl("https://github.com/code4craft/webmagic").setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
}
+
}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java
new file mode 100644
index 000000000..1014a45f5
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java
@@ -0,0 +1,71 @@
+package us.codecraft.webmagic.model;
+
+import org.junit.Test;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.HelpUrl;
+import us.codecraft.webmagic.model.annotation.TargetUrl;
+import us.codecraft.webmagic.selector.PlainText;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class ModelPageProcessorTest {
+
+ private PageMocker pageMocker = new PageMocker();
+
+ @TargetUrl("http://codecraft.us/foo")
+ public static class ModelFoo {
+
+ @ExtractBy(value = "//div/@foo", notNull = true)
+ private String foo;
+
+ }
+
+ @TargetUrl("http://codecraft.us/bar")
+ public static class ModelBar {
+
+ @ExtractBy(value = "//div/@bar", notNull = true)
+ private String bar;
+
+ }
+
+ @TargetUrl(value = "http://webmagic.io/foo/\\d+",sourceRegion = "//li[@class='bar']")
+ @HelpUrl(value = "http://webmagic.io/bar/\\d+",sourceRegion = "//li[@class='foo']")
+ public static class MockModel {
+
+ }
+
+ @Test
+ public void testMultiModel_should_not_skip_when_match() throws Exception {
+ Page page = new Page();
+ page.setRawText("
");
+ page.setRequest(new Request("http://codecraft.us/foo"));
+ page.setUrl(PlainText.create("http://codecraft.us/foo"));
+ ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, ModelFoo.class, ModelBar.class);
+ modelPageProcessor.process(page);
+ assertThat(page.getResultItems().isSkip()).isFalse();
+ }
+
+ @Test
+ public void testExtractLinks() throws Exception {
+ ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, MockModel.class);
+ Page page = pageMocker.getMockPage();
+ modelPageProcessor.process(page);
+ assertThat(page.getTargetRequests()).containsExactly(new Request("http://webmagic.io/bar/3"), new Request("http://webmagic.io/bar/4"), new Request("http://webmagic.io/foo/3"), new Request("http://webmagic.io/foo/4"));
+ }
+
+ @Test
+ public void testExtractNoLinks() throws Exception {
+ ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, MockModel.class);
+ Page page = pageMocker.getMockPage();
+ modelPageProcessor.setExtractLinks(false);
+ modelPageProcessor.process(page);
+ assertThat(page.getTargetRequests()).isEmpty();
+ }
+
+
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java
new file mode 100644
index 000000000..45938d620
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java
@@ -0,0 +1,23 @@
+package us.codecraft.webmagic.model;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 2017/6/3
+ * Time: 3:23 PM
+ */
+public class PageMapperTest {
+
+ private PageMocker pageMocker = new PageMocker();
+
+ @Test
+ public void test_get() throws Exception {
+ PageMapper pageMapper = new PageMapper(GithubRepoApi.class);
+ GithubRepoApi githubRepo = pageMapper.get(pageMocker.getMockJsonPage());
+ assertThat(githubRepo.getName()).isEqualTo("webmagic");
+ }
+
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java
new file mode 100644
index 000000000..0451edcfe
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.model;
+
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import org.apache.commons.io.IOUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.selector.PlainText;
+
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 2017/6/3
+ * Time: 9:08 PM
+ */
+public class PageMocker {
+
+ public Page getMockJsonPage() throws IOException {
+ Page page = new Page();
+ page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset()));
+ page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic"));
+ page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic"));
+ return page;
+ }
+
+ public Page getMockPage() throws IOException {
+ Page page = new Page();
+ page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset()));
+ page.setRequest(new Request("http://webmagic.io/list/0"));
+ page.setUrl(new PlainText("http://webmagic.io/list/0"));
+ return page;
+ }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java
new file mode 100644
index 000000000..f212628b4
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java
@@ -0,0 +1,145 @@
+package us.codecraft.webmagic.model;
+
+import org.apache.commons.lang3.time.DateFormatUtils;
+import org.apache.commons.lang3.time.DateUtils;
+import org.junit.Test;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.ExtractByUrl;
+import us.codecraft.webmagic.model.annotation.Formatter;
+import us.codecraft.webmagic.model.formatter.DateFormatter;
+
+import java.util.Date;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 2017/6/3
+ * Time: 9:06 PM
+ */
+public class PageModelExtractorTest {
+
+ private PageMocker pageMocker = new PageMocker();
+
+ public static class ModelDateStr {
+
+ @ExtractBy(value = "//div[@class='date']/text()", notNull = true)
+ private String dateStr;
+
+ }
+
+ public static class ModelDate {
+
+ @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class)
+ @ExtractBy(value = "//div[@class='date']/text()", notNull = true)
+ private Date date;
+
+ }
+
+ public static class ModelInt {
+
+ @ExtractBy(value = "//div[@class='number']/text()", notNull = true)
+ private int number;
+
+ }
+
+ public static class ModelStringList {
+
+ @ExtractBy("//li[@class='list']/a/@href")
+ private List links;
+
+ }
+
+ public static class ModelIntList {
+
+ @Formatter(subClazz = Integer.class)
+ @ExtractBy("//li[@class='numbers']/text()")
+ private List numbers;
+
+ }
+
+ public static class ModelDateList {
+
+ @Formatter(subClazz = Date.class, value = "yyyyMMdd")
+ @ExtractBy("//li[@class='dates']/text()")
+ private List dates;
+
+ }
+
+ public static class ModelCustomList {
+
+ @Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class)
+ @ExtractBy("//li[@class='dates']/text()")
+ private List dates;
+
+ }
+
+ public static class ModelJsonStr {
+
+ @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name")
+ private String name;
+
+ }
+
+ public static class ModelUrl {
+
+ @ExtractByUrl("https://api\\.github\\.com/repos/\\w+/(\\w+)")
+ private String name;
+
+ }
+
+ @Test
+ public void testXpath() throws Exception {
+ ModelDateStr modelDate = (ModelDateStr) PageModelExtractor.create(ModelDateStr.class).process(pageMocker.getMockPage());
+ assertThat(modelDate.dateStr).isEqualTo("20170603");
+ }
+
+ @Test
+ public void testExtractDate() throws Exception {
+ ModelDate modelDate = (ModelDate) PageModelExtractor.create(ModelDate.class).process(pageMocker.getMockPage());
+ assertThat(DateFormatUtils.format(modelDate.date,"yyyyMMdd")).isEqualTo("20170603");
+ }
+
+ @Test
+ public void testExtractInt() throws Exception {
+ ModelInt modelDate = (ModelInt) PageModelExtractor.create(ModelInt.class).process(pageMocker.getMockPage());
+ assertThat(modelDate.number).isEqualTo(12);
+ }
+
+ @Test
+ public void testExtractList() throws Exception {
+ ModelStringList modelDate = (ModelStringList) PageModelExtractor.create(ModelStringList.class).process(pageMocker.getMockPage());
+ assertThat(modelDate.links).containsExactly("http://webmagic.io/list/1","http://webmagic.io/list/2","http://webmagic.io/list/3","http://webmagic.io/list/4");
+ }
+
+ @Test
+ public void testExtractIntList() throws Exception {
+ ModelIntList modelDate = (ModelIntList) PageModelExtractor.create(ModelIntList.class).process(pageMocker.getMockPage());
+ assertThat(modelDate.numbers).containsExactly(1,2,3,4);
+ }
+
+ @Test
+ public void testExtractDateList() throws Exception {
+ ModelDateList modelDate = (ModelDateList) PageModelExtractor.create(ModelDateList.class).process(pageMocker.getMockPage());
+ assertThat(modelDate.dates).containsExactly(DateUtils.parseDate("20170601", "yyyyMMdd"), DateUtils.parseDate("20170602", "yyyyMMdd"), DateUtils.parseDate("20170603", "yyyyMMdd"), DateUtils.parseDate("20170604", "yyyyMMdd"));
+ }
+
+ @Test
+ public void testExtractCustomList() throws Exception {
+ ModelCustomList modelDate = (ModelCustomList) PageModelExtractor.create(ModelCustomList.class).process(pageMocker.getMockPage());
+ assertThat(modelDate.dates).containsExactly(DateUtils.parseDate("20170601", "yyyyMMdd"), DateUtils.parseDate("20170602", "yyyyMMdd"), DateUtils.parseDate("20170603", "yyyyMMdd"), DateUtils.parseDate("20170604", "yyyyMMdd"));
+ }
+
+ @Test
+ public void testExtractJson() throws Exception {
+ ModelJsonStr modelDate = (ModelJsonStr) PageModelExtractor.create(ModelJsonStr.class).process(pageMocker.getMockJsonPage());
+ assertThat(modelDate.name).isEqualTo("webmagic");
+ }
+
+ @Test
+ public void testExtractByUrl() throws Exception {
+ ModelUrl modelDate = (ModelUrl) PageModelExtractor.create(ModelUrl.class).process(pageMocker.getMockJsonPage());
+ assertThat(modelDate.name).isEqualTo("webmagic");
+ }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java
new file mode 100644
index 000000000..75679daf3
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java
@@ -0,0 +1,19 @@
+package us.codecraft.webmagic.monitor;
+
+import us.codecraft.webmagic.Spider;
+
+/**
+ * @author code4crafer@gmail.com
+ */
+public class CustomSpiderStatus extends SpiderStatus implements CustomSpiderStatusMXBean {
+
+ public CustomSpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) {
+ super(spider, monitorSpiderListener);
+ }
+
+
+ @Override
+ public String getSchedulerName() {
+ return spider.getScheduler().getClass().getName();
+ }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java
new file mode 100644
index 000000000..5dd8ace24
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java
@@ -0,0 +1,10 @@
+package us.codecraft.webmagic.monitor;
+
+/**
+ * Extends {@link SpiderStatusMXBean} with one additional getter, {@code getSchedulerName()}.
+ * @author code4crafter@gmail.com
+ */
+public interface CustomSpiderStatusMXBean extends SpiderStatusMXBean {
+    /** @return a name for the spider's scheduler (CustomSpiderStatus returns the scheduler's class name) */
+    String getSchedulerName();
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SeedUrlWithPortTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SeedUrlWithPortTest.java
new file mode 100644
index 000000000..2ba5f247d
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SeedUrlWithPortTest.java
@@ -0,0 +1,35 @@
+package us.codecraft.webmagic.monitor;
+
+import org.junit.Test;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import javax.management.JMException;
+
+/** Smoke test: a spider seeded with a URL that carries an explicit port is registered with {@link SpiderMonitor} and run.
+ * @author jerry_shenchao@163.com
+ */
+public class SeedUrlWithPortTest {
+    // No assertions: the test passes as long as register() and run() complete without throwing.
+    @Test
+    public void testSeedUrlWithPort() throws JMException {
+        Spider seeded = Spider.create(new TempProcessor()).addUrl("http://www.hndpf.org:8889/");
+        SpiderMonitor.instance().register(seeded);
+        seeded.run();
+    }
+}
+
+class TempProcessor implements PageProcessor {
+ // Minimal PageProcessor used by SeedUrlWithPortTest: it extracts nothing from the pages it is given.
+ @Override
+ public void process(Page page) {
+ // intentionally left empty
+ }
+ // Returns Site.me() without applying any further configuration to it.
+ @Override
+ public Site getSite() {
+ return Site.me();
+ }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java
new file mode 100644
index 000000000..6144da7e9
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java
@@ -0,0 +1,31 @@
+package us.codecraft.webmagic.monitor;
+
+import org.junit.Test;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor;
+import us.codecraft.webmagic.processor.example.ZhihuPageProcessor;
+
+/**
+ * Checks that a {@link SpiderMonitor} subclass can supply its own status bean by overriding getSpiderStatusMBean.
+ * @author code4crafter@gmail.com
+ * @since 0.5.0
+ */
+public class SpiderMonitorTest {
+
+    /** Registers two spiders through the overriding monitor; nothing is asserted beyond register() not throwing. */
+    @Test
+    public void testInherit() throws Exception {
+        SpiderMonitor customMonitor = new SpiderMonitor() {
+            @Override
+            protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) {
+                return new CustomSpiderStatus(spider, monitorSpiderListener);
+            }
+        };
+
+        Spider blogSpider = Spider.create(new ZhihuPageProcessor())
+                .addUrl("http://my.oschina.net/flashsword/blog").thread(2);
+        Spider githubSpider = Spider.create(new GithubRepoPageProcessor())
+                .addUrl("https://github.com/code4craft");
+        customMonitor.register(blogSpider, githubSpider);
+    }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java
index bf9e381d0..1c8742c81 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java
@@ -19,7 +19,7 @@ public void process(Page page) {
@Override
public Site getSite() {
- return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
+ return Site.me();
}
@Test
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
new file mode 100644
index 000000000..39c2b6a55
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
@@ -0,0 +1,79 @@
+package us.codecraft.webmagic.scheduler;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Exercises {@link BloomFilterDuplicateRemover}: one functional check plus two ignored long-running measurements.
+ * @author code4crafter@gmail.com
+ */
+public class BloomFilterDuplicateRemoverTest {
+
+    /** A URL is reported as new on first sight and as a duplicate on second sight. */
+    @Test
+    public void testRemove() throws Exception {
+        BloomFilterDuplicateRemover remover = new BloomFilterDuplicateRemover(10);
+        for (String url : new String[]{"a", "b"}) {
+            assertThat(remover.isDuplicate(new Request(url), null)).isFalse();
+            assertThat(remover.isDuplicate(new Request(url), null)).isTrue();
+        }
+    }
+
+    /** Prints elapsed time and the drop in {@code Runtime.freeMemory()} for the bloom filter, then for a hash set. */
+    @Ignore("long time")
+    @Test
+    public void testMemory() throws Exception {
+        int times = 5000000;
+        Runtime runtime = Runtime.getRuntime();
+
+        DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times,0.005);
+        long freeBefore = runtime.freeMemory();
+        long startedAt = System.currentTimeMillis();
+        for (int i = 0; i < times; i++) {
+            duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
+        }
+        System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - startedAt));
+        System.out.println("Memory used by bloomfilter:" + (freeBefore - runtime.freeMemory()));
+
+        duplicateRemover = new HashSetDuplicateRemover();
+        System.gc();
+        freeBefore = runtime.freeMemory();
+        startedAt = System.currentTimeMillis();
+        for (int i = 0; i < times; i++) {
+            duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
+        }
+        System.out.println("Time used by hashset:" + (System.currentTimeMillis() - startedAt));
+        System.out.println("Memory used by hashset:" + (freeBefore - runtime.freeMemory()));
+    }
+
+    /** Offers each of {@code times} distinct URLs twice and prints how often either answer was wrong. */
+    @Ignore("long time")
+    @Test
+    public void testMissHit() throws Exception {
+        int times = 5000000;
+        DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times, 0.01);
+        int right = 0;
+        int wrong = 0;
+        int missCheck = 0;
+        for (int i = 0; i < times; i++) {
+            String url = String.valueOf(i);
+            // First sighting: a "duplicate" answer here is a false positive.
+            if (duplicateRemover.isDuplicate(new Request(url), null)) {
+                wrong++;
+            } else {
+                right++;
+            }
+            // Second sighting: anything other than "duplicate" is a miss.
+            if (!duplicateRemover.isDuplicate(new Request(url), null)) {
+                missCheck++;
+            }
+        }
+
+        System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
+    }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisPrioritySchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisPrioritySchedulerTest.java
new file mode 100644
index 000000000..15bd939f7
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisPrioritySchedulerTest.java
@@ -0,0 +1,70 @@
+package us.codecraft.webmagic.scheduler;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Task;
+
+/**
+ * Checks that {@link RedisPriorityScheduler} hands requests back in descending priority order.
+ * @author sai (created 16-7-5)
+ */
+public class RedisPrioritySchedulerTest {
+
+    private RedisPriorityScheduler scheduler;
+
+    @Before
+    public void setUp() {
+        scheduler = new RedisPriorityScheduler("localhost");
+    }
+
+    /**
+     * Pushes three requests with priorities 1, 0 and -1 and expects them to be polled in that order,
+     * each carrying its own URL and its own "name" extra. Ignored by default: the scheduler targets "localhost".
+     */
+    @Ignore("environment depended")
+    @Test
+    public void test() {
+        Task task = new Task() {
+            @Override
+            public String getUUID() {
+                return "TestTask";
+            }
+
+            @Override
+            public Site getSite() {
+                return null;
+            }
+        };
+
+        // Reset first, presumably so URLs remembered from an earlier run are not rejected as duplicates.
+        scheduler.resetDuplicateCheck(task);
+
+        Request request = new Request("https://www.google.com");
+        Request request1 = new Request("https://www.facebook.com/");
+        Request request2 = new Request("https://twitter.com");
+
+        request.setPriority(1).putExtra("name", "google");
+        request1.setPriority(0).putExtra("name", "facebook");
+        request2.setPriority(-1).putExtra("name", "twitter");
+
+        scheduler.push(request, task);
+        scheduler.push(request1, task);
+        scheduler.push(request2, task);
+
+        Request first = scheduler.poll(task);
+        Request second = scheduler.poll(task);
+        Request third = scheduler.poll(task);
+
+        // Compare each polled request with the request it should correspond to (expected value first).
+        Assert.assertEquals(request.getUrl(), first.getUrl());
+        Assert.assertEquals(request.getExtra("name"), first.getExtra("name"));
+        Assert.assertEquals(request1.getUrl(), second.getUrl());
+        Assert.assertEquals(request1.getExtra("name"), second.getExtra("name"));
+        Assert.assertEquals(request2.getUrl(), third.getUrl());
+        Assert.assertEquals(request2.getExtra("name"), third.getExtra("name"));
+    }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
index 151876359..b4124d2d9 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
@@ -7,6 +7,8 @@
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
+import static org.assertj.core.api.Assertions.assertThat;
+
/**
* @author code4crafter@gmail.com
*/
@@ -37,7 +39,7 @@ public Site getSite() {
request.putExtra("1","2");
redisScheduler.push(request, task);
Request poll = redisScheduler.poll(task);
- System.out.println(poll);
+ assertThat(poll).isEqualTo(request);
}
}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java
new file mode 100644
index 000000000..9d78fb9ac
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java
@@ -0,0 +1,14 @@
+package us.codecraft.webmagic.utils;
+
+import org.junit.Test;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class IPUtilsTest {
+ // Only prints what IPUtils returns; nothing is asserted (the value is presumably machine-specific).
+ @Test
+ public void testGetFirstNoLoopbackIPAddresses() throws Exception {
+ System.out.println(IPUtils.getFirstNoLoopbackIPAddresses());
+ }
+}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/RequestUtilsTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/RequestUtilsTest.java
new file mode 100644
index 000000000..ec8486483
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/RequestUtilsTest.java
@@ -0,0 +1,28 @@
+package us.codecraft.webmagic.utils;
+
+import org.junit.Test;
+import us.codecraft.webmagic.Request;
+
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Covers {@code RequestUtils.from} for URLs containing a bracketed numeric range such as {@code [1-3]}.
+ * @author code4crafter@gmail.com
+ * Date: 2017/6/5 Time: 5:08 PM
+ */
+public class RequestUtilsTest {
+    /** A range {@code [1-3]} yields one request per value, in ascending order. */
+    @Test
+    public void test_generate_range() throws Exception {
+        List<Request> requests = RequestUtils.from("http://angularjs.cn/api/article/latest?p=[1-3]&s=20");
+        assertThat(requests).containsExactly(new Request("http://angularjs.cn/api/article/latest?p=1&s=20"), new Request("http://angularjs.cn/api/article/latest?p=2&s=20"), new Request("http://angularjs.cn/api/article/latest?p=3&s=20"));
+    }
+    /** A range whose start exceeds its end ({@code [10-3]}) yields no requests. */
+    @Test
+    public void test_generate_range_when_invalid_number() throws Exception {
+        List<Request> requests = RequestUtils.from("http://angularjs.cn/api/article/latest?p=[10-3]&s=20");
+        assertThat(requests).isEmpty();
+    }
+}
diff --git a/webmagic-extension/src/test/resouces/log4j.xml b/webmagic-extension/src/test/resouces/log4j.xml
deleted file mode 100644
index a58e889b9..000000000
--- a/webmagic-extension/src/test/resouces/log4j.xml
+++ /dev/null
@@ -1,31 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-extension/src/test/resources/html/mock-github.html b/webmagic-extension/src/test/resources/html/mock-github.html
new file mode 100644
index 000000000..df53d870a
--- /dev/null
+++ b/webmagic-extension/src/test/resources/html/mock-github.html
@@ -0,0 +1,1580 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ code4craft/webmagic
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Skip to content
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Java
+ CSS
+ JavaScript
+ FreeMarker
+ HTML
+ Ruby
+
+
+
+
+
+
+
+
+
+
+
+
+ New pull request
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Permalink
+
+
+
+
+
+
+
+ Failed to load latest commit information.
+
+
+
+
+
+
+
+
+ assets
+
+
+
+ 同步官方源码
+
+
+
+ Apr 12, 2014
+
+
+
+
+
+
+
+
+ en_docs
+
+
+
+ docs
+
+
+
+ May 3, 2014
+
+
+
+
+
+
+
+
+ webmagic-avalon
+
+
+
+ update version to snapshot
+
+
+
+ May 5, 2014
+
+
+
+
+
+
+
+
+ webmagic-core
+
+
+
+ 修正FileCacheQueueScheduler导致程序不能正常结束和未关闭流
+
+
+
+ Nov 12, 2015
+
+
+
+
+
+
+
+
+ webmagic-extension
+
+
+
+ Merge pull request #237 from SpenceZhou/master
+
+
+
+ Dec 2, 2015
+
+
+
+
+
+
+
+
+ webmagic-samples
+
+
+
+ Merge pull request #227 from hsqlu/master
+
+
+
+ Jan 16, 2016
+
+
+
+
+
+
+
+
+ webmagic-saxon
+
+
+
+ update version
+
+
+
+ Jun 4, 2014
+
+
+
+
+
+
+
+
+ webmagic-scripts
+
+
+
+ update version
+
+
+
+ Jun 4, 2014
+
+
+
+
+
+
+
+
+ webmagic-selenium
+
+
+
+ update and validate pom.xml
+
+
+
+ Jul 11, 2015
+
+
+
+
+
+
+
+
+ zh_docs
+
+
+
+ contributor
+
+
+
+ Jun 4, 2014
+
+
+
+
+
+
+
+
+ .gitignore
+
+
+
+ change_gitignore
+
+
+
+ May 19, 2014
+
+
+
+
+
+
+
+
+ .travis.yml
+
+
+
+ remove ci for jdk6
+
+
+
+ Jan 18, 2016
+
+
+
+
+
+
+
+
+ README.md
+
+
+
+ contributor
+
+
+
+ Jun 4, 2014
+
+
+
+
+
+
+
+
+ pom.xml
+
+
+
+ Revert "remove some unkown config"
+
+
+
+ Jan 18, 2016
+
+
+
+
+
+
+
+
+ release-note.md
+
+
+
+ #34 Close reader in FileCacheQueueScheduler
+
+
+
+ Nov 8, 2013
+
+
+
+
+
+
+
+
+ user-manual.md
+
+
+
+ deperate in user manual
+
+
+
+ May 3, 2014
+
+
+
+
+
+
+
+
+ webmagic-avalon.md
+
+
+
+ scripts readme
+
+
+
+ Nov 28, 2013
+
+
+
+
+
+
+
+
+
+
+
+
+ README.md
+
+
+
+
+ Readme in Chinese
+
+ User Manual (Chinese)
+
+
+
+
+ A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.
+
+
+ Features:
+
+
+ Simple core with high flexibility.
+ Simple API for html extracting.
+ Annotation with POJO to customize a crawler, no configuration.
+ Multi-thread and Distribution support.
+ Easy to be integrated.
+
+
+ Install:
+
+ Add dependencies to your pom.xml:
+
+ <dependency >
+ <groupId >us.codecraft</groupId >
+ <artifactId >webmagic-core</artifactId >
+ <version >0.5.2</version >
+</dependency >
+<dependency >
+ <groupId >us.codecraft</groupId >
+ <artifactId >webmagic-extension</artifactId >
+ <version >0.5.2</version >
+</dependency >
+
+ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf4j implementation, please exclude slf4j-log4j12.
+
+ <exclusions >
+ <exclusion >
+ <groupId >org.slf4j</groupId >
+ <artifactId >slf4j-log4j12</artifactId >
+ </exclusion >
+</exclusions >
+
+ Get Started:
+
+ First crawler:
+
+ Write a class implements PageProcessor. For example, I wrote a crawler of github repository infomation.
+
+ public class GithubRepoPageProcessor implements PageProcessor {
+
+ private Site site = Site . me(). setRetryTimes(3 ). setSleepTime(1000 );
+
+ @Override
+ public void process (Page page ) {
+ page. addTargetRequests(page. getHtml(). links(). regex(" (https://github\\ .com/\\ w+/\\ w+)" ). all());
+ page. putField(" author" , page. getUrl(). regex(" https://github\\ .com/(\\ w+)/.*" ). toString());
+ page. putField(" name" , page. getHtml(). xpath(" //h1[@class='entry-title public']/strong/a/text()" ). toString());
+ if (page. getResultItems(). get(" name" )== null ){
+ //skip this page
+ page. setSkip(true );
+ }
+ page. putField(" readme" , page. getHtml(). xpath(" //div[@id='readme']/tidyText()" ));
+ }
+
+ @Override
+ public Site getSite () {
+ return site;
+ }
+
+ public static void main (String [] args ) {
+ Spider . create(new GithubRepoPageProcessor ()). addUrl(" https://github.com/code4craft" ). thread(5 ). run();
+ }
+}
+
+
+
+ You can also use annotation way:
+
+ @TargetUrl(" https://github.com/\\ w+/\\ w+" )
+@HelpUrl(" https://github.com/\\ w+" )
+public class GithubRepo {
+
+ @ExtractBy (value = " //h1[@class='entry-title public']/strong/a/text()" , notNull = true )
+ private String name;
+
+ @ExtractByUrl (" https://github\\ .com/(\\ w+)/.*" )
+ private String author;
+
+ @ExtractBy (" //div[@id='readme']/tidyText()" )
+ private String readme;
+
+ public static void main (String [] args ) {
+ OOSpider . create(Site . me(). setSleepTime(1000 )
+ , new ConsolePageModelPipeline (), GithubRepo . class)
+ .addUrl(" https://github.com/code4craft" ). thread(5 ). run();
+ }
+}
+
+ Docs and samples:
+
+ Documents: http://webmagic.io/docs/
+
+ The architecture of webmagic (refered to Scrapy )
+
+
+
+ Javadocs: http://code4craft.github.io/webmagic/docs/en/
+
+ There are some samples in webmagic-samples package.
+
+ Lisence:
+
+ Lisenced under Apache 2.0 lisence
+
+ Contributors:
+
+ Thanks these people for commiting source code, reporting bugs or suggesting for new feature:
+
+
+
+ Thanks:
+
+ To write webmagic, I refered to the projects below :
+
+
+
+ Mail-list:
+
+ https://groups.google.com/forum/#!forum/webmagic-java
+
+ http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988
+
+ QQ Group: 373225642
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Something went wrong with that request. Please try again.
+
+
+
+
+
+
+
+
+
+
+
You signed in with another tab or window. Reload to refresh your session.
+
You signed out in another tab or window. Reload to refresh your session.
+
+
+
+
+
+
+
+
diff --git a/webmagic-extension/src/test/resources/html/mock-webmagic.html b/webmagic-extension/src/test/resources/html/mock-webmagic.html
new file mode 100644
index 000000000..351ec9762
--- /dev/null
+++ b/webmagic-extension/src/test/resources/html/mock-webmagic.html
@@ -0,0 +1,48 @@
+
+
+
+
+
+
+
+20170603
+12
+
+
+
+
+
+
+
+ 20170601
+ 20170602
+ 20170603
+ 20170604
+
+
+
\ No newline at end of file
diff --git a/webmagic-extension/src/test/resources/json/mock-githubrepo.json b/webmagic-extension/src/test/resources/json/mock-githubrepo.json
new file mode 100644
index 000000000..a5037d3d1
--- /dev/null
+++ b/webmagic-extension/src/test/resources/json/mock-githubrepo.json
@@ -0,0 +1,91 @@
+{
+ "id": 9623064,
+ "name": "webmagic",
+ "full_name": "code4craft/webmagic",
+ "owner": {
+ "login": "code4craft",
+ "id": 1351884,
+ "avatar_url": "https://avatars0.githubusercontent.com/u/1351884?v=3",
+ "gravatar_id": "",
+ "url": "https://api.github.com/users/code4craft",
+ "html_url": "https://github.com/code4craft",
+ "followers_url": "https://api.github.com/users/code4craft/followers",
+ "following_url": "https://api.github.com/users/code4craft/following{/other_user}",
+ "gists_url": "https://api.github.com/users/code4craft/gists{/gist_id}",
+ "starred_url": "https://api.github.com/users/code4craft/starred{/owner}{/repo}",
+ "subscriptions_url": "https://api.github.com/users/code4craft/subscriptions",
+ "organizations_url": "https://api.github.com/users/code4craft/orgs",
+ "repos_url": "https://api.github.com/users/code4craft/repos",
+ "events_url": "https://api.github.com/users/code4craft/events{/privacy}",
+ "received_events_url": "https://api.github.com/users/code4craft/received_events",
+ "type": "User",
+ "site_admin": false
+ },
+ "private": false,
+ "html_url": "https://github.com/code4craft/webmagic",
+ "description": "A scalable web crawler framework for Java.",
+ "fork": false,
+ "url": "https://api.github.com/repos/code4craft/webmagic",
+ "forks_url": "https://api.github.com/repos/code4craft/webmagic/forks",
+ "keys_url": "https://api.github.com/repos/code4craft/webmagic/keys{/key_id}",
+ "collaborators_url": "https://api.github.com/repos/code4craft/webmagic/collaborators{/collaborator}",
+ "teams_url": "https://api.github.com/repos/code4craft/webmagic/teams",
+ "hooks_url": "https://api.github.com/repos/code4craft/webmagic/hooks",
+ "issue_events_url": "https://api.github.com/repos/code4craft/webmagic/issues/events{/number}",
+ "events_url": "https://api.github.com/repos/code4craft/webmagic/events",
+ "assignees_url": "https://api.github.com/repos/code4craft/webmagic/assignees{/user}",
+ "branches_url": "https://api.github.com/repos/code4craft/webmagic/branches{/branch}",
+ "tags_url": "https://api.github.com/repos/code4craft/webmagic/tags",
+ "blobs_url": "https://api.github.com/repos/code4craft/webmagic/git/blobs{/sha}",
+ "git_tags_url": "https://api.github.com/repos/code4craft/webmagic/git/tags{/sha}",
+ "git_refs_url": "https://api.github.com/repos/code4craft/webmagic/git/refs{/sha}",
+ "trees_url": "https://api.github.com/repos/code4craft/webmagic/git/trees{/sha}",
+ "statuses_url": "https://api.github.com/repos/code4craft/webmagic/statuses/{sha}",
+ "languages_url": "https://api.github.com/repos/code4craft/webmagic/languages",
+ "stargazers_url": "https://api.github.com/repos/code4craft/webmagic/stargazers",
+ "contributors_url": "https://api.github.com/repos/code4craft/webmagic/contributors",
+ "subscribers_url": "https://api.github.com/repos/code4craft/webmagic/subscribers",
+ "subscription_url": "https://api.github.com/repos/code4craft/webmagic/subscription",
+ "commits_url": "https://api.github.com/repos/code4craft/webmagic/commits{/sha}",
+ "git_commits_url": "https://api.github.com/repos/code4craft/webmagic/git/commits{/sha}",
+ "comments_url": "https://api.github.com/repos/code4craft/webmagic/comments{/number}",
+ "issue_comment_url": "https://api.github.com/repos/code4craft/webmagic/issues/comments{/number}",
+ "contents_url": "https://api.github.com/repos/code4craft/webmagic/contents/{+path}",
+ "compare_url": "https://api.github.com/repos/code4craft/webmagic/compare/{base}...{head}",
+ "merges_url": "https://api.github.com/repos/code4craft/webmagic/merges",
+ "archive_url": "https://api.github.com/repos/code4craft/webmagic/{archive_format}{/ref}",
+ "downloads_url": "https://api.github.com/repos/code4craft/webmagic/downloads",
+ "issues_url": "https://api.github.com/repos/code4craft/webmagic/issues{/number}",
+ "pulls_url": "https://api.github.com/repos/code4craft/webmagic/pulls{/number}",
+ "milestones_url": "https://api.github.com/repos/code4craft/webmagic/milestones{/number}",
+ "notifications_url": "https://api.github.com/repos/code4craft/webmagic/notifications{?since,all,participating}",
+ "labels_url": "https://api.github.com/repos/code4craft/webmagic/labels{/name}",
+ "releases_url": "https://api.github.com/repos/code4craft/webmagic/releases{/id}",
+ "deployments_url": "https://api.github.com/repos/code4craft/webmagic/deployments",
+ "created_at": "2013-04-23T12:57:36Z",
+ "updated_at": "2017-06-03T03:58:13Z",
+ "pushed_at": "2017-06-03T07:10:15Z",
+ "git_url": "git://github.com/code4craft/webmagic.git",
+ "ssh_url": "git@github.com:code4craft/webmagic.git",
+ "clone_url": "https://github.com/code4craft/webmagic.git",
+ "svn_url": "https://github.com/code4craft/webmagic",
+ "homepage": "http://webmagic.io/",
+ "size": 16982,
+ "stargazers_count": 4566,
+ "watchers_count": 4566,
+ "language": "Java",
+ "has_issues": true,
+ "has_projects": true,
+ "has_downloads": true,
+ "has_wiki": true,
+ "has_pages": true,
+ "forks_count": 2432,
+ "mirror_url": null,
+ "open_issues_count": 96,
+ "forks": 2432,
+ "open_issues": 96,
+ "watchers": 4566,
+ "default_branch": "master",
+ "network_count": 2432,
+ "subscribers_count": 618
+}
diff --git a/webmagic-extension/src/test/resources/log4j2-test.xml b/webmagic-extension/src/test/resources/log4j2-test.xml
new file mode 100644
index 000000000..86aee5f59
--- /dev/null
+++ b/webmagic-extension/src/test/resources/log4j2-test.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/webmagic-lucene/README.md b/webmagic-lucene/README.md
deleted file mode 100644
index 77050ab08..000000000
--- a/webmagic-lucene/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-webmagic-lucene
---------
-尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。
\ No newline at end of file
diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml
deleted file mode 100644
index d8b8bc9f4..000000000
--- a/webmagic-lucene/pom.xml
+++ /dev/null
@@ -1,48 +0,0 @@
-
-
-
- webmagic-parent
- us.codecraft
- 0.5.0-SNAPSHOT
-
- 4.0.0
-
- webmagic-lucene
-
-
-
- org.apache.lucene
- lucene-analyzers-common
- 4.4.0
-
-
- org.apache.lucene
- lucene-queryparser
- 4.4.0
-
-
- us.codecraft
- webmagic-extension
- ${project.version}
-
-
- junit
- junit
-
-
-
-
-
-
- maven-deploy-plugin
-
- true
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java
deleted file mode 100644
index 6fe270210..000000000
--- a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java
+++ /dev/null
@@ -1,92 +0,0 @@
-package us.codecraft.webmagic.pipeline;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.queryparser.classic.ParseException;
-import org.apache.lucene.queryparser.classic.QueryParser;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
-import us.codecraft.webmagic.ResultItems;
-import us.codecraft.webmagic.Task;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-/**
- * @author code4crafter@gmail.com
- * Date: 13-8-5
- * Time: 下午2:11
- */
-public class LucenePipeline implements Pipeline {
-
- private Directory directory;
-
- private Analyzer analyzer;
-
- private IndexWriterConfig config;
-
- private void init() throws IOException {
- analyzer = new StandardAnalyzer(Version.LUCENE_44);
- directory = new RAMDirectory();
- config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
- }
-
- public LucenePipeline() {
- try {
- init();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- public List search(String fieldName, String value) throws IOException, ParseException {
- List documents = new ArrayList();
- DirectoryReader ireader = DirectoryReader.open(directory);
- IndexSearcher isearcher = new IndexSearcher(ireader);
- // Parse a simple query that searches for "text":
- QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
- Query query = parser.parse(value);
- ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
- // Iterate through the results:
- for (int i = 0; i < hits.length; i++) {
- Document hitDoc = isearcher.doc(hits[i].doc);
- documents.add(hitDoc);
- }
- ireader.close();
- return documents;
- }
-
- @Override
- public void process(ResultItems resultItems, Task task) {
- if (resultItems.isSkip()){
- return;
- }
- Document doc = new Document();
- Map all = resultItems.getAll();
- if (all==null){
- return;
- }
- for (Map.Entry objectEntry : all.entrySet()) {
- doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED));
- }
- try {
- IndexWriter indexWriter = new IndexWriter(directory, config);
- indexWriter.addDocument(doc);
- indexWriter.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-}
diff --git a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java
deleted file mode 100644
index b35037024..000000000
--- a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java
+++ /dev/null
@@ -1,61 +0,0 @@
-package us.codecraft.webmagic.lucene;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.queryparser.classic.ParseException;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.model.annotation.ExtractBy;
-import us.codecraft.webmagic.model.OOSpider;
-import us.codecraft.webmagic.model.annotation.TargetUrl;
-import us.codecraft.webmagic.pipeline.LucenePipeline;
-
-import java.io.IOException;
-import java.util.List;
-
-/**
- * @author code4crafter@gmail.com
- * Date: 13-8-2
- * Time: 上午7:52
- */
-@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
-public class OschinaBlog {
-
- @ExtractBy("//title")
- private String title;
-
- @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
- private String content;
-
- @Override
- public String toString() {
- return "OschinaBlog{" +
- "title='" + title + '\'' +
- ", content='" + content + '\'' +
- '}';
- }
-
- public static void main(String[] args) {
- LucenePipeline pipeline = new LucenePipeline();
- OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync();
- while (true) {
- try {
- List search = pipeline.search("title", "webmagic");
- System.out.println(search);
- Thread.sleep(3000);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (ParseException e) {
- e.printStackTrace();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
-
- public String getTitle() {
- return title;
- }
-
- public String getContent() {
- return content;
- }
-}
diff --git a/webmagic-panel/README.md b/webmagic-panel/README.md
deleted file mode 100644
index 30ddd132c..000000000
--- a/webmagic-panel/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
-Worker:
-
-任务执行者,提供Http接口,监控运行状态,终止和开始job
-
-队列:
-
-仍然使用redis
-
-Panel:
-
-提供Web管理后台,管理
-
-
-
-1. 新建任务
- 1. 通过脚本
- 2. 配置
- 3. 分配机器
-2. 已有任务
-3. 任务查看
\ No newline at end of file
diff --git a/webmagic-panel/pom.xml b/webmagic-panel/pom.xml
deleted file mode 100644
index 3b0b6823b..000000000
--- a/webmagic-panel/pom.xml
+++ /dev/null
@@ -1,35 +0,0 @@
-
-
-
- webmagic-parent
- us.codecraft
- 0.5.0-SNAPSHOT
-
- 4.0.0
-
- us.codecraft
- webmagic-panel
-
-
-
- us.codecraft
- webmagic-scripts
- ${project.version}
-
-
-
-
-
-
- maven-deploy-plugin
-
- true
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/webmagic-samples/README.md b/webmagic-samples/README.md
index 7cdad186f..0656ae623 100644
--- a/webmagic-samples/README.md
+++ b/webmagic-samples/README.md
@@ -1,3 +1,3 @@
webmagic-samples
-------
-webmagic的一些示例。包括抓取常见博客、信息类网站等。
\ No newline at end of file
+webmagic的一些示例。包括抓取常见 博客、信息类网站等。
\ No newline at end of file
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index c2b4b939f..50e79c73e 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -1,11 +1,14 @@
-
+
- webmagic-parent
us.codecraft
- 0.5.0-SNAPSHOT
+ webmagic
+ 1.0.3
4.0.0
@@ -13,44 +16,35 @@
- us.codecraft
+ ${project.groupId}
webmagic-core
${project.version}
- us.codecraft
+ ${project.groupId}
webmagic-extension
${project.version}
- junit
- junit
+ org.mapdb
+ mapdb
+ 3.1.0
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ 2.15.2
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ 2.15.2
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ 2.16.0
-
-
-
- maven-deploy-plugin
-
- true
-
-
-
- org.apache.maven.plugins
- maven-jar-plugin
- 2.4
-
-
-
- true
- ./lib/
- us.codecraft.webmagic.main.QuickStarter
-
-
-
-
-
-
-
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
index 074dd0f48..61083d693 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
@@ -38,7 +38,7 @@ public static void main(String[] args) {
key = readKey(key);
System.out.println("The demo started and will last 20 seconds...");
//Start spider
- OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).runAsync();
+ OOSpider.create(Site.me(), clazzMap.get(key)).addUrl(urlMap.get(key)).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).runAsync();
try {
Thread.sleep(20000);
@@ -57,7 +57,7 @@ private static String readKey(String key) {
System.out.println(classEntry.getKey()+"\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey()));
}
while (key == null) {
- key = new String(stdin.nextLine());
+ key = stdin.nextLine();
if (clazzMap.get(key) == null) {
System.out.println("Invalid choice!");
key = null;
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java
new file mode 100644
index 000000000..e83d9442a
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java
@@ -0,0 +1,42 @@
+package us.codecraft.webmagic.model.samples;
+
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.model.OOSpider;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+
+/**
+ * Sample page model for a Baidu News result page: one title and one summary
+ * are extracted through the XPath expressions on the fields.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class BaiduNews {
+
+    /** Text of the link inside the {@code c-title} heading. */
+    @ExtractBy("//h3[@class='c-title']/a/text()")
+    private String name;
+
+    /** Text of the {@code c-summary} block. */
+    @ExtractBy("//div[@class='c-summary']/text()")
+    private String description;
+
+    public String getName() {
+        return name;
+    }
+
+    public String getDescription() {
+        return description;
+    }
+
+    @Override
+    public String toString() {
+        return String.format("BaiduNews{name='%s', description='%s'}", name, description);
+    }
+
+    /**
+     * Downloads one page, prints the model extracted from it and closes the spider.
+     */
+    public static void main(String[] args) {
+        String pageUrl = "http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient";
+        OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class);
+        // single download
+        BaiduNews news = ooSpider.get(pageUrl);
+        System.out.println(news);
+
+        ooSpider.close();
+    }
+}
\ No newline at end of file
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java
new file mode 100644
index 000000000..77def20e4
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java
@@ -0,0 +1,37 @@
+package us.codecraft.webmagic.model.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.model.AfterExtractor;
+import us.codecraft.webmagic.model.OOSpider;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.TargetUrl;
+
+import java.util.List;
+
+/**
+ * Collects every {@code DP.data({...});} snippet on pages matching
+ * {@code http://*.alpha.dp/*} and prints the URL of pages that contain more
+ * than one snippet or whose first snippet is longer than 100 characters.
+ *
+ * @author yihua.huang@dianping.com
+ * Date: 13-8-13
+ * Time: 10:13 AM
+ */
+@TargetUrl("http://*.alpha.dp/*")
+public class DianpingFtlDataScanner implements AfterExtractor {
+
+    /** All regex matches found on the page (declared {@code notNull}). */
+    @ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true)
+    private List<String> data;
+
+    public static void main(String[] args) {
+        OOSpider.create(Site.me().setSleepTime(0), DianpingFtlDataScanner.class)
+                .thread(5).run();
+    }
+
+    /**
+     * Reports the page URL on stderr once for each condition that holds, so a
+     * page can be printed twice.
+     *
+     * @param page the page the model was extracted from
+     */
+    @Override
+    public void afterProcess(Page page) {
+        // More than one DP.data(...) snippet on the same page.
+        if (data.size() > 1) {
+            System.err.println(page.getUrl());
+        }
+        // First snippet is longer than 100 characters.
+        if (!data.isEmpty() && data.get(0).length() > 100) {
+            System.err.println(page.getUrl());
+        }
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
index e8998eca0..136e88d9e 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
@@ -41,14 +41,15 @@ public class GithubRepo implements HasKey {
private String url;
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("https://github.com/explore").setSleepTime(0).setRetryTimes(3),
+ OOSpider.create(Site.me().setSleepTime(0).setRetryTimes(3),
new JsonFilePageModelPipeline(), GithubRepo.class)
- .scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
+ .addUrl("https://github.com/explore")
+ .setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
}
@Override
public String key() {
- return author+":"+name;
+ return author+"_"+name;
}
public String getName() {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java
index 7e3dc516f..6a10f47ba 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java
@@ -28,7 +28,7 @@ public String toString() {
}
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run();
+ OOSpider.create(Site.me(), IteyeBlog.class).addUrl("http://flashsword20.iteye.com/blog").run();
}
public String getTitle() {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
index 936f1329c..a1cc54573 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
@@ -1,14 +1,19 @@
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.OOSpider;
+import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
+import javax.management.JMException;
+import java.io.IOException;
+
/**
* @author code4crafter@gmail.com
*/
@@ -25,14 +30,17 @@ public class Kr36NewsModel {
@ExtractByUrl
private String url;
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException, JMException {
//Just for benchmark
- OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0), new PageModelPipeline() {
+ Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() {
@Override
public void process(Object o, Task task) {
}
- },Kr36NewsModel.class).thread(20).run();
+ }, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/");
+ thread.start();
+ SpiderMonitor spiderMonitor = SpiderMonitor.instance();
+ spiderMonitor.register(thread);
}
public String getTitle() {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
index e9dfb2636..45bee2f4c 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
@@ -3,7 +3,6 @@
import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
-import us.codecraft.webmagic.model.annotation.ComboExtract;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
@@ -26,9 +25,8 @@ public class News163 implements MultiPageModel {
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
private String page;
- @ComboExtract(value = {@ExtractBy("//div[@class=\"ep-pages\"]//a/@href"),
- @ExtractBy(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy.Type.Regex)},
- multi = true, notNull = false)
+ @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/regex('http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html',1)"
+ , multi = true, notNull = false)
private List otherPage;
@ExtractBy("//h1[@id=\"h1title\"]/text()")
@@ -74,8 +72,8 @@ public String toString() {
}
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
- .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run();
+ OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html")
+ .scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java
index 112f86a69..cd93093ec 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java
@@ -22,7 +22,7 @@ public class OschinaAnswer implements AfterExtractor{
private String content;
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run();
+ OOSpider.create(Site.me(), OschinaAnswer.class).addUrl("http://www.oschina.net/question/567527_120597").run();
}
@Override
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
index 468b855be..286e6f5b9 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
@@ -26,7 +26,7 @@ public class OschinaBlog{
public static void main(String[] args) {
OOSpider.create(Site.me()
- .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36").addStartUrl("http://my.oschina.net/flashsword/blog")
+ .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
.setSleepTime(0)
.setRetryTimes(3)
,new PageModelPipeline() {
@@ -34,7 +34,7 @@ public static void main(String[] args) {
public void process(Object o, Task task) {
}
- }, OschinaBlog.class).thread(10).run();
+ }, OschinaBlog.class).thread(10).addUrl("http://my.oschina.net/flashsword/blog").run();
}
public String getTitle() {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java
new file mode 100644
index 000000000..8120e3556
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java
@@ -0,0 +1,26 @@
+package us.codecraft.webmagic.model.samples;
+
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.model.ConsolePageModelPipeline;
+import us.codecraft.webmagic.model.OOSpider;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.TargetUrl;
+
+/**
+ * Sample page model: one instance is extracted per {@code li} of the
+ * {@code promos_list2} list on the target pages.
+ *
+ * @author code4crafter@gmail.com
+ */
+@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*")
+@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li", multi = true)
+public class QQMeishi {
+
+    /** Text of the {@code h4} inside the title link. */
+    @ExtractBy("//div[@class=info]/a[@class=title]/h4/text()")
+    private String shopName;
+
+    /** Text of the title link itself. */
+    @ExtractBy("//div[@class=info]/a[@class=title]/text()")
+    private String promo;
+
+    /** Runs the crawl with four threads and prints each model to the console. */
+    public static void main(String[] args) {
+        OOSpider
+                .create(Site.me(), new ConsolePageModelPipeline(), QQMeishi.class)
+                .addUrl("http://meishi.qq.com/beijing/c/all")
+                .thread(4)
+                .run();
+    }
+
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
new file mode 100644
index 000000000..bee80e775
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
@@ -0,0 +1,78 @@
+package us.codecraft.webmagic.recover;
+
+import com.google.common.base.Charsets;
+import com.google.common.hash.BloomFilter;
+import com.google.common.hash.Funnels;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+import org.mapdb.IndexTreeList;
+import org.mapdb.Serializer;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * {@link DuplicateRemover} that stores every accepted URL in a MapDB file, so
+ * the set of seen URLs can be reloaded when the crawler is started again.
+ * <p>
+ * Membership is answered by a Guava Bloom filter that is rebuilt from the
+ * stored URLs in the constructor. A Bloom filter can report a URL it has never
+ * seen as present, so a small fraction of new URLs may be treated as duplicates.
+ * <p>
+ * The check-then-insert sequence and the reset are synchronized because the
+ * remover is shared by all crawler threads.
+ *
+ * @author linweisen
+ */
+public class DuplicateStorageRemover implements DuplicateRemover {
+
+    /** Name of the MapDB list holding the URLs. */
+    private static final String DATABASE_NAME = "duplicate";
+
+    /** Sizing of the Bloom filter: expected number of URLs and false-positive probability. */
+    private static final int EXPECTED_INSERTIONS = 200000;
+
+    private static final double FALSE_POSITIVE_PROBABILITY = 1E-7;
+
+    private final DB db;
+
+    private final IndexTreeList<String> urlDuplicateQueue;
+
+    private final AtomicInteger counter;
+
+    private BloomFilter<CharSequence> bloomFilter;
+
+    /**
+     * Opens (or creates) the MapDB file and loads the stored URLs into the Bloom filter.
+     *
+     * @param path location of the MapDB file
+     */
+    public DuplicateStorageRemover(String path) {
+        this.db = DBMaker.fileDB(path)
+                .fileMmapEnableIfSupported()
+                .fileMmapPreclearDisable()
+                .cleanerHackEnable()
+                .closeOnJvmShutdown()
+                .transactionEnable()
+                .concurrencyScale(128)
+                .make();
+
+        this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
+
+        this.counter = new AtomicInteger(this.urlDuplicateQueue.size());
+        this.bloomFilter = newBloomFilter();
+        for (String url : this.urlDuplicateQueue) {
+            bloomFilter.put(url);
+        }
+    }
+
+    /**
+     * Records the request URL if it has not been seen and commits it to the store.
+     *
+     * @return {@code true} when the Bloom filter already contains the URL
+     */
+    @Override
+    public synchronized boolean isDuplicate(Request request, Task task) {
+        String url = request.getUrl();
+        if (bloomFilter.mightContain(url)) {
+            return true;
+        }
+        bloomFilter.put(url);
+        urlDuplicateQueue.add(url);
+        db.commit();
+        counter.incrementAndGet();
+        return false;
+    }
+
+    /**
+     * Forgets every URL: the filter is recreated, the stored list is cleared and
+     * committed, and the request counter goes back to zero.
+     */
+    @Override
+    public synchronized void resetDuplicateCheck(Task task) {
+        bloomFilter = newBloomFilter();
+        urlDuplicateQueue.clear();
+        // The DB is transactional: without a commit the cleared list is not persisted.
+        db.commit();
+        counter.set(0);
+    }
+
+    /** @return number of URLs accepted so far, including those loaded at start-up */
+    @Override
+    public int getTotalRequestsCount(Task task) {
+        return counter.get();
+    }
+
+    /** Creates an empty Bloom filter with the configured capacity and error rate. */
+    private static BloomFilter<CharSequence> newBloomFilter() {
+        return BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), EXPECTED_INSERTIONS, FALSE_POSITIVE_PROBABILITY);
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
new file mode 100644
index 000000000..4cee18afd
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
@@ -0,0 +1,85 @@
+package us.codecraft.webmagic.recover;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.commons.lang3.StringUtils;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+import org.mapdb.IndexTreeList;
+import org.mapdb.Serializer;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.io.IOException;
+
+/**
+ * Scheduler whose request queue lives in a MapDB file. Requests are stored as
+ * JSON strings, so requests still queued when the crawler stops can be polled
+ * again after a restart.
+ * <p>
+ * {@link #poll(Task)} and {@link #pushWhenNoDuplicate(Request, Task)} are
+ * synchronized: the "is the queue empty, then remove the head" sequence must
+ * not interleave between crawler threads.
+ *
+ * @author linweisen
+ */
+public class MmapQueueScheduler extends DuplicateRemovedScheduler {
+
+    /** Name of the MapDB list holding the serialized requests. */
+    private static final String DATABASE_NAME = "queue";
+
+    /** Shared JSON mapper, created once for the class rather than once per scheduler. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private final DB db;
+
+    private final IndexTreeList<String> queue;
+
+    /**
+     * @param duplicateRemover remover used by the base class to filter pushed requests
+     * @param path             location of the MapDB file backing the queue
+     */
+    public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) {
+        super.setDuplicateRemover(duplicateRemover);
+
+        this.db = DBMaker.fileDB(path)
+                .fileMmapEnableIfSupported()
+                .fileMmapPreclearDisable()
+                .cleanerHackEnable()
+                .closeOnJvmShutdown()
+                .transactionEnable()
+                .concurrencyScale(128)
+                .make();
+        this.queue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
+    }
+
+    /**
+     * Removes and returns the oldest queued request.
+     *
+     * @return the next request, or {@code null} when the queue is empty or the
+     *         stored JSON cannot be parsed
+     */
+    @Override
+    public synchronized Request poll(Task task) {
+        if (queue.isEmpty()) {
+            return null;
+        }
+        String json = queue.remove(0);
+        // Commit the removal; otherwise it only becomes durable on the next push.
+        db.commit();
+        return fromJson(json, Request.class);
+    }
+
+    /**
+     * Appends the request to the persistent queue and commits it.
+     */
+    @Override
+    public synchronized void pushWhenNoDuplicate(Request request, Task task) {
+        String json = toJson(request);
+        if (json == null) {
+            // Serialization failed and was already logged by toJson; never store null.
+            return;
+        }
+        queue.add(json);
+        db.commit();
+    }
+
+    /**
+     * Serializes an object to JSON.
+     *
+     * @return the JSON text, or {@code null} when serialization fails (a warning is logged)
+     */
+    public String toJson(Object object) {
+        try {
+            return MAPPER.writeValueAsString(object);
+        } catch (IOException e) {
+            logger.warn("write to json string error:" + object, e);
+            return null;
+        }
+    }
+
+    /**
+     * Parses JSON into an instance of {@code clazz}.
+     *
+     * @return the parsed object, or {@code null} when the input is empty or
+     *         cannot be parsed (a warning is logged)
+     */
+    public <T> T fromJson(String jsonString, Class<T> clazz) {
+        if (StringUtils.isEmpty(jsonString)) {
+            return null;
+        }
+        try {
+            return MAPPER.readValue(jsonString, clazz);
+        } catch (IOException e) {
+            logger.warn("parse json string error:" + jsonString, e);
+            return null;
+        }
+    }
+
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
new file mode 100644
index 000000000..4fb91a0d2
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
@@ -0,0 +1,22 @@
+package us.codecraft.webmagic.recover;
+
+
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.samples.SinaBlogProcessor;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+/**
+ * Wires a spider to the MapDB-backed scheduler and duplicate remover of this
+ * package. Both stores are opened under the names "queue" and "duplicate".
+ *
+ * @author code4crafter@gmail.com
+ */
+public class RecoverSample {
+
+    public static void main(String[] args) {
+        Spider spider = new Spider(new SinaBlogProcessor());
+
+        // Persistent stores: seen URLs and pending requests.
+        DuplicateRemover seenUrls = new DuplicateStorageRemover("duplicate");
+        MmapQueueScheduler pendingRequests = new MmapQueueScheduler(seenUrls, "queue");
+        spider.setScheduler(pendingRequests);
+
+        spider
+                .addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html")
+                .run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java
new file mode 100644
index 000000000..af9c01e90
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java
@@ -0,0 +1,65 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.scheduler.PriorityScheduler;
+
+/**
+ * Page processor for alexandermcqueen.cn: product pages ({@link #URL_POST})
+ * yield the goods fields, every other page only contributes links.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class AlexanderMcqueenGoodsProcessor implements PageProcessor {
+
+    public static final String URL_LIST = "http://www\\.alexandermcqueen\\.cn/.*";
+
+    public static final String URL_POST = "http://www\\.alexandermcqueen\\.cn/cn/\\w+/.*\\.html";
+
+    /** Span wrapping the currency and the price value. */
+    private static final String PRICE_BOX = "//div[@id='description']//div[@class='itemBoxPrice']/span";
+
+    /** The "description" tab; shared prefix of several expressions below. */
+    private static final String DESCRIPTION_TAB = "//div[@id='tabbedDescription']"
+            + "//div[@class='tabbedDescription']"
+            + "//ul[@id='tabs']"
+            + "//li[@id='tab_description']";
+
+    /** A property row inside the description tab. */
+    private static final String PROPERTY_ROW = DESCRIPTION_TAB
+            + "//div[@class='productProperty']"
+            + "//div[@class='productPropertyRow']";
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
+
+    @Override
+    public void process(Page page) {
+        if (page.getUrl().regex(URL_POST).match()) {
+            extractGoods(page);
+        } else {
+            queueLinks(page);
+        }
+    }
+
+    /** Puts the goods fields into the page; marks the page skipped when no name was stored. */
+    private static void extractGoods(Page page) {
+        page.putField("goodsName", page.getHtml().xpath("//div[@id='description']/h1/tidyText()"));
+        if (page.getResultItems().get("goodsName") == null) {
+            page.setSkip(true);
+        }
+        page.putField("currency", page.getHtml().xpath(PRICE_BOX + "//span[@class='currency']/tidyText()"));
+        page.putField("goodsPrice", page.getHtml().xpath(PRICE_BOX + "//span[@class='priceValue']/tidyText()"));
+        page.putField("description", page.getHtml().xpath(DESCRIPTION_TAB + "/div[@id='description_pane']/tidyText()"));
+        page.putField("material", page.getHtml().xpath(PROPERTY_ROW + "/span[2]/tidyText()"));
+        page.putField("goodsCode", page.getHtml().xpath(PROPERTY_ROW + "//span[@id='modelFabricColorContainer']/tidyText()"));
+        page.putField("goodsSize", page.getHtml().xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']"));
+        page.putField("goodsColors", page.getHtml().xpath("//div[@id='colors']/ul/html()"));
+    }
+
+    /** Queues product links with priority 1000 and all other site links with priority 1. */
+    private static void queueLinks(Page page) {
+        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000);
+        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new AlexanderMcqueenGoodsProcessor())
+                .setScheduler(new PriorityScheduler())
+                .addUrl("http://www.alexandermcqueen.cn/sitemap.asp?tskay=E2F1A848")
+                .thread(5)
+                .run();
+    }
+}
\ No newline at end of file
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java
new file mode 100644
index 000000000..a980851bf
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java
@@ -0,0 +1,53 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.Html;
+
+import java.util.List;
+
+/**
+ * Prints the rows of the {@code dataGrid} table on an Amazon forum page: for
+ * each data row the title link text, the {@code num} cell and the third cell.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class AmanzonPageProcessor implements PageProcessor {
+
+    @Override
+    public void process(Page page) {
+        Html html = page.getHtml();
+        List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();
+        if (questionList == null) {
+            return;
+        }
+        // Row 0 holds the column names, so the data starts at row 1.
+        for (int i = 1; i < questionList.size(); i++) {
+            String row = questionList.get(i);
+            System.out.println(row);
+            // Parse the row itself (previously an empty document was parsed, so every
+            // value below was null). The <table> wrapper keeps the <tr>/<td> elements
+            // intact when the fragment is parsed as HTML.
+            Html rowHtml = Html.create("<table>" + row + "</table>");
+            String comment = rowHtml.xpath("//td[@class='title']//a/text()").toString();
+            System.out.println(comment);
+            String answerNum = rowHtml.xpath("//td[@class='num']/text()").toString();
+            System.out.println(answerNum);
+            String createTime = rowHtml.xpath("//td[3]/text()").toString();
+            System.out.println(createTime);
+        }
+    }
+
+    @Override
+    public Site getSite() {
+        return Site.me();
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D");
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java
new file mode 100644
index 000000000..46476bbc8
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java
@@ -0,0 +1,48 @@
+package us.codecraft.webmagic.samples;
+
+
+import java.util.List;
+import org.apache.commons.collections4.CollectionUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.JsonPathSelector;
+
+/**
+ * Crawls the angularjs.cn JSON API: list responses contribute one request per
+ * article id, every other response is read as a single article.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.5.0
+ */
+public class AngularJSProcessor implements PageProcessor {
+
+    private Site site = Site.me();
+
+    /** Pattern of a single-article URL; currently not referenced by the processor. */
+    private static final String ARTICLE_URL = "http://angularjs\\.cn/api/article/\\w+";
+
+    private static final String LIST_URL = "http://angularjs\\.cn/api/article/latest.*";
+
+    @Override
+    public void process(Page page) {
+        if (page.getUrl().regex(LIST_URL).match()) {
+            // List response: queue the article endpoint for each id in $.data[*]._id.
+            List<String> ids = new JsonPathSelector("$.data[*]._id").selectList(page.getRawText());
+            if (CollectionUtils.isNotEmpty(ids)) {
+                for (String id : ids) {
+                    page.addTargetRequest("http://angularjs.cn/api/article/" + id);
+                }
+            }
+        } else {
+            // Article response: store title and content.
+            page.putField("title", new JsonPathSelector("$.data.title").select(page.getRawText()));
+            page.putField("content", new JsonPathSelector("$.data.content").select(page.getRawText()));
+        }
+
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new AngularJSProcessor()).addUrl("http://angularjs.cn/api/article/latest?p=1&s=20").run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
index 25baa1fbf..8bd7d5893 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
@@ -35,7 +35,7 @@ public void process(Page page) {
public Site getSite() {
//site定义抽取配置,以及开始url等
if (site == null) {
- site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/").
+ site = Site.me().setDomain("progressdaily.diandian.com").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
new file mode 100644
index 000000000..61458d0f9
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
@@ -0,0 +1,46 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.PlainText;
+
+import java.util.List;
+
+/**
+ * Crawls the thread list of one diaoyuweng.com user and stores title, content,
+ * date and id for pages whose URL contains "thread".
+ *
+ * @author code4crafter@gmail.com
+ * Date: 13-4-21
+ * Time: 8:08 PM
+ */
+public class DiaoyuwengProcessor implements PageProcessor {
+
+    private Site site;
+
+    @Override
+    public void process(Page page) {
+        // Follow the paging links of the user's thread list ...
+        List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
+        page.addTargetRequests(requests);
+        // ... and the links to the threads themselves.
+        requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
+        page.addTargetRequests(requests);
+        // NOTE(review): the list URLs also contain "thread", so this branch runs for
+        // them too and yields the id "1000null" - confirm whether that is intended.
+        if (page.getUrl().toString().contains("thread")) {
+            page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
+            page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
+            page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
+            page.putField("id", new PlainText("1000" + page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html").toString()));
+        }
+    }
+
+    /** Lazily creates the site configuration (GBK charset, 500 ms sleep). */
+    @Override
+    public Site getSite() {
+        if (site == null) {
+            site = Site.me().setDomain("www.diaoyuweng.com").
+                    setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
+        }
+        return site;
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
new file mode 100644
index 000000000..8091b6502
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.scheduler.RedisScheduler;
+
+import java.util.List;
+
+/**
+ * Follows every link containing {@code /yewu/} and stores the page title and
+ * the {@code dd} elements of each page.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 13-4-21
+ * Time: 1:48 PM
+ */
+public class F58PageProcesser implements PageProcessor {
+
+    @Override
+    public void process(Page page) {
+        List<String> links = page.getHtml().links().regex(".*/yewu/.*").all();
+        page.addTargetRequests(links);
+        // Capture the text between the <title> tags of the document.
+        page.putField("title", page.getHtml().regex("<title>(.*)</title>"));
+        page.putField("body", page.getHtml().xpath("//dd"));
+    }
+
+    @Override
+    public Site getSite() {
+        return Site.me().setDomain("sh.58.com").setCycleRetryTimes(2);
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("http://sh1.51a8.com/").run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
new file mode 100644
index 000000000..0aecb7bf5
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
@@ -0,0 +1,37 @@
+package us.codecraft.webmagic.samples;
+
+/**
+ * Plain data holder for a GitHub repository: name, author and readme text.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class GithubRepo {
+
+    private String name;
+
+    private String author;
+
+    private String readme;
+
+    /** @return the repository name, or {@code null} if none was set */
+    public String getName() {
+        return this.name;
+    }
+
+    /** @return the repository author, or {@code null} if none was set */
+    public String getAuthor() {
+        return this.author;
+    }
+
+    /** @return the readme text, or {@code null} if none was set */
+    public String getReadme() {
+        return this.readme;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public void setAuthor(String author) {
+        this.author = author;
+    }
+
+    public void setReadme(String readme) {
+        this.readme = readme;
+    }
+}
\ No newline at end of file
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java
new file mode 100644
index 000000000..3f4e190c6
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java
@@ -0,0 +1,62 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * Sample processor that walks GitHub pages and emits a {@link GithubRepo}
+ * for every page on which a repository name can be extracted.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.5.1
+ */
+public class GithubRepoPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
+
+    /**
+     * Queues the repository and user links found on the page, then builds a
+     * {@link GithubRepo} from it. A page without a repository name is marked
+     * as skipped; otherwise the bean is stored under the "repo" field.
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        // Follow both "owner/repo" links and bare "owner" links.
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
+
+        String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
+        String name = page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString();
+        String readme = page.getHtml().xpath("//div[@id='readme']/tidyText()").toString();
+
+        GithubRepo repo = new GithubRepo();
+        repo.setAuthor(author);
+        repo.setName(name);
+        repo.setReadme(readme);
+
+        if (name == null) {
+            // No repository name on this page: mark it as skipped.
+            page.setSkip(true);
+            return;
+        }
+        page.putField("repo", repo);
+    }
+
+    /** @return the site settings created with this processor (3 retries, no sleep) */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /** Starts a five-thread crawl from the code4craft GitHub profile. */
+    public static void main(String[] args) {
+        Spider.create(new GithubRepoPageProcessor())
+                .addUrl("https://github.com/code4craft")
+                .thread(5)
+                .run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
index 7cb7be2ca..1cc90b081 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
@@ -21,10 +21,11 @@ public void process(Page page) {
@Override
public Site getSite() {
- return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/");
+ return Site.me().setDomain("www.huxiu.com");
}
public static void main(String[] args) {
- Spider.create(new HuxiuProcessor()).run();
+ Spider.create(new HuxiuProcessor()).addUrl("http://www.huxiu.com/").run();
}
+
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
index 3ef395742..33dd6aa35 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
@@ -1,6 +1,6 @@
package us.codecraft.webmagic.samples;
-import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.collections4.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
@@ -29,7 +29,7 @@ public void process(Page page) {
@Override
public Site getSite() {
if (site == null) {
- site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
+ site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;
@@ -38,6 +38,7 @@ public Site getSite() {
public static void main(String[] args) {
Spider.create(new InfoQMiniBookProcessor())
.thread(5)
+ .addUrl("http://www.infoq.com/cn/minibooks")
.run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
index 26b85e878..6dce8075c 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
@@ -22,12 +22,12 @@ public void process(Page page) {
@Override
public Site getSite() {
if (site == null) {
- site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
+ site = Site.me().setDomain("yanghaoli.iteye.com");
}
return site;
}
public static void main(String[] args) {
- Spider.create(new IteyeBlogProcessor()).thread(5).run();
+ Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
new file mode 100644
index 000000000..b373f5204
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
@@ -0,0 +1,51 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * Sample processor that walks kaichiba.com shop pages by incrementing the
+ * numeric shop id found in the current URL.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 13-5-20
+ * Time: 5:31 PM
+ */
+public class KaichibaProcessor implements PageProcessor {
+
+    /**
+     * Queues the shop page with the next id and stores the "title" and "items"
+     * fields. A page whose URL yields no {@code shop/} id is marked as skipped
+     * rather than failing while parsing a missing number.
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        String shopId = page.getUrl().regex("shop/(\\d+)").toString();
+        if (shopId == null) {
+            // No id to increment, so no next shop can be derived from this URL.
+            page.setSkip(true);
+            return;
+        }
+        // long arithmetic so an id at Integer.MAX_VALUE does not wrap around.
+        long nextShopId = Long.parseLong(shopId) + 1;
+        page.addTargetRequest("http://kaichiba.com/shop/" + nextShopId);
+        page.putField("title",page.getHtml().xpath("//Title"));
+        page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*? ", ""));
+    }
+
+    /** @return settings for kaichiba.com: utf-8 charset and a desktop Chrome user agent */
+    @Override
+    public Site getSite() {
+        return Site.me().setDomain("kaichiba.com").setCharset("utf-8").
+                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+    }
+
+    /** Starts the crawl from shop 41725781. */
+    public static void main(String[] args) {
+        Spider.create(new KaichibaProcessor()).addUrl("http://kaichiba.com/shop/41725781").run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java
new file mode 100644
index 000000000..22ae5eb42
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java
@@ -0,0 +1,69 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.samples.pipeline.OneFilePipeline;
+import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
+import us.codecraft.webmagic.selector.Selectable;
+
+import java.io.FileNotFoundException;
+import java.io.UnsupportedEncodingException;
+import java.util.List;
+
+/**
+ * Sample processor that collects the link target and image alt text of each
+ * list item found on www.mama.cn photo pages.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class MamacnPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100);
+
+    /**
+     * Appends one "img:"/"title:" line pair per list item, stores the text
+     * under the empty field name, marks pages that produced no text as
+     * skipped, and queues every photo page link found.
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        // Element type spelled out: a raw List cannot be iterated as Selectable.
+        List<Selectable> nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
+        StringBuilder accum = new StringBuilder();
+        for (Selectable node : nodes) {
+            accum.append("img:").append(node.xpath("//a/@href").get()).append("\n");
+            accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n");
+        }
+        page.putField("",accum.toString());
+        if (accum.length() == 0) {
+            page.setSkip(true);
+        }
+        page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
+    }
+
+    /** @return the site settings created with this processor */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Runs a five-thread crawl with a file-backed scheduler and writes the
+     * results through {@link OneFilePipeline}.
+     *
+     * @throws FileNotFoundException        declared by the pipeline constructor
+     * @throws UnsupportedEncodingException declared by the pipeline constructor
+     */
+    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
+        Spider.create(new MamacnPageProcessor())
+                .setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn"))
+                .addUrl("http://www.mama.cn/photo/t1-p1.html")
+                .addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data"))
+                .thread(5)
+                .run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
new file mode 100644
index 000000000..cb4c498ff
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
@@ -0,0 +1,50 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.List;
+
+/**
+ * Sample processor that follows area and restaurant links on meican.com and
+ * records dish names and prices.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 13-5-20
+ * Time: 5:31 PM
+ */
+public class MeicanProcessor implements PageProcessor {
+
+    /**
+     * Queues at most the first two area links plus every restaurant link,
+     * then stores the dish names as "items" and their prices as "prices".
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        List<String> requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
+        if (requests.size() > 2) {
+            // Only the first two area links are followed.
+            requests = requests.subList(0, 2);
+        }
+        page.addTargetRequests(requests);
+        page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
+        page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
+        page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
+    }
+
+    /** @return settings for meican.com: utf-8 charset and a desktop Chrome user agent */
+    @Override
+    public Site getSite() {
+        return Site.me().setDomain("meican.com").setCharset("utf-8").
+                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+    }
+
+    /** Starts the crawl from the Shanghai districts page. */
+    public static void main(String[] args) {
+        Spider.create(new MeicanProcessor()).addUrl("http://www.meican.com/shanghai/districts").run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
index 16dcb0cb5..ce0f817d7 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
@@ -1,7 +1,8 @@
package us.codecraft.webmagic.samples;
-import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
@@ -22,6 +23,10 @@ public void process(Page page) {
@Override
public Site getSite() {
- return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures");
+ return Site.me().setDomain("bbs.nju.edu.cn");
+ }
+
+ public static void main(String[] args) {
+ Spider.create(new NjuBBSProcessor()).addUrl("http://bbs.nju.edu.cn/board?board=Pictures").run();
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
deleted file mode 100644
index ded1a5f4e..000000000
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package us.codecraft.webmagic.samples;
-
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-import java.util.List;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class OschinaBlogPageProcesser implements PageProcessor {
-
- private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
-
- @Override
- public void process(Page page) {
- List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
- page.addTargetRequests(links);
- page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
- page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
- page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
- }
-
- @Override
- public Site getSite() {
- return site;
-
- }
-
- public static void main(String[] args) {
- Spider.create(new OschinaBlogPageProcesser()).run();
- }
-}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java
deleted file mode 100644
index b75cc8320..000000000
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package us.codecraft.webmagic.samples;
-
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-import java.util.List;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class OschinaPageProcesser implements PageProcessor {
-
- @Override
- public void process(Page page) {
- List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
- page.addTargetRequests(strings);
- page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
- page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
- }
-
- @Override
- public Site getSite() {
- return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/").
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
- }
-}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
new file mode 100644
index 000000000..ab5314073
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
@@ -0,0 +1,75 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.downloader.PhantomJSDownloader;
+import us.codecraft.webmagic.pipeline.CollectorPipeline;
+import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.List;
+
+/**
+ * Created by dolphineor on 2014-11-21.
+ *
+ * Uses Taobao as the example: fetches the search results for winter clothing.
+ */
+public class PhantomJSPageProcessor implements PageProcessor {
+
+    private Site site = Site.me()
+            .setDomain("s.taobao.com")
+            .setCharset("GBK")
+            .addHeader("Referer", "http://www.taobao.com/")
+            .setRetryTimes(3).setSleepTime(1000);
+
+    /**
+     * Stores the page's raw text under the "html" field when there is any.
+     *
+     * @param page the downloaded page
+     */
+    @Override
+    public void process(Page page) {
+        if (page.getRawText() != null) {
+            page.putField("html", page.getRawText());
+        }
+    }
+
+    /** @return the site settings created with this processor */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Crawls one search page through PhantomJS and prints the "html" field of
+     * the first collected result, or a notice when nothing was collected.
+     */
+    public static void main(String[] args) throws Exception {
+        PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();
+
+        CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
+
+        // Two threads per spare core; clamp to 1 because a single-core host
+        // would otherwise compute (1 - 1) << 1 == 0 threads.
+        int threadNum = Math.max(1, (Runtime.getRuntime().availableProcessors() - 1) << 1);
+
+        Spider.create(new PhantomJSPageProcessor())
+                .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc") // %B6%AC%D7%B0 is the GBK-encoded search term "winter clothing"
+                .setDownloader(phantomDownloader)
+                .addPipeline(collectorPipeline)
+                .thread(threadNum)
+                .run();
+
+        List<ResultItems> resultItemsList = collectorPipeline.getCollected();
+        if (resultItemsList.isEmpty()) {
+            System.out.println("No result was collected.");
+            return;
+        }
+        // The field is absent when the page had no raw text; print "null" rather than throw.
+        Object html = resultItemsList.get(0).get("html");
+        System.out.println(html);
+    }
+
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
index d9cee2beb..037b333c8 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
@@ -24,7 +24,7 @@ public void process(Page page) {
@Override
public Site getSite() {
- return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/").
+ return Site.me().setDomain("www.diandian.com").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
deleted file mode 100644
index dcb6eff92..000000000
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java
+++ /dev/null
@@ -1,37 +0,0 @@
-package us.codecraft.webmagic.samples;
-
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class SinaBlogProcesser implements PageProcessor {
-
- private Site site;
-
- @Override
- public void process(Page page) {
- page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all());
- page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
- page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
- page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)"));
- page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
-// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a"));
- }
-
- @Override
- public Site getSite() {
- if (site==null){
- site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000).
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
- }
- return site;
- }
-
- public static void main(String[] args) {
- Spider.create(new SinaBlogProcesser()).run();
- }
-}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java
new file mode 100644
index 000000000..2872e02b5
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java
@@ -0,0 +1,64 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * Sample processor for one Sina blog: list pages feed the crawl, every other
+ * page is treated as an article and has its fields extracted.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class SinaBlogProcessor implements PageProcessor {
+
+    /** Regex matching the paginated article-list URLs of blog 1487828712. */
+    public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html";
+
+    /** Regex matching a single blog post URL. */
+    public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html";
+
+    private Site site = Site
+            .me()
+            .setDomain("blog.sina.com.cn")
+            .setSleepTime(3000)
+            .setUserAgent(
+                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+
+    /**
+     * On a list page, queues the post links inside the article list and the
+     * links to further list pages. On any other page, stores the "title",
+     * "content" and "date" fields.
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        boolean isListPage = page.getUrl().regex(URL_LIST).match();
+        if (!isListPage) {
+            // Article page
+            page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
+            page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
+            page.putField("date",
+                    page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
+            return;
+        }
+        // List page
+        page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all());
+        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
+    }
+
+    /** @return the site settings created with this processor */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /** Starts the crawl from the first article-list page. */
+    public static void main(String[] args) {
+        Spider.create(new SinaBlogProcessor())
+                .addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html")
+                .run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
index d14b44206..6cc8f99a9 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
@@ -21,6 +21,6 @@ public void process(Page page) {
@Override
public Site getSite() {
- return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
+ return Site.me().setDomain("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
}
}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/ZhihuPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/ZhihuPageProcessor.java
new file mode 100644
index 000000000..c21946064
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/ZhihuPageProcessor.java
@@ -0,0 +1,96 @@
+package us.codecraft.webmagic.samples;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.pipeline.FilePipeline;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.Html;
+
+import java.util.List;
+
+/**
+ * Sample processor that follows Zhihu question links and keeps answers whose
+ * vote count reaches a fixed threshold.
+ *
+ * @author 410775541@qq.com
+ * @since 0.5.1
+ */
+public class ZhihuPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000)
+            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
+            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
+            .setCharset("UTF-8");
+
+    /** Minimum vote count an answer needs in order to be kept. */
+    private static final int voteNum = 1000;
+
+    /**
+     * Queues search-result and related-question links, then inspects each
+     * answer block. For every answer with at least {@code voteNum} votes the
+     * "vote", "content" and "userid" fields are written; the same keys are
+     * reused per answer, so presumably the last qualifying answer wins.
+     * Pages without a qualifying answer are marked as skipped.
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
+        page.addTargetRequests(relativeUrl);
+        relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
+        page.addTargetRequests(relativeUrl);
+        List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
+        boolean exist = false;
+        for (String answer : answers) {
+            // Parse the fragment once and reuse it for all three lookups.
+            Html answerHtml = new Html(answer);
+            String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
+            if (parseVoteCount(vote) >= voteNum) {
+                page.putField("vote",vote);
+                page.putField("content",answerHtml.xpath("//div[@class='zm-editable-content']"));
+                page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
+                exist = true;
+            }
+        }
+        if (!exist) {
+            page.setSkip(true);
+        }
+    }
+
+    /**
+     * Parses a vote label, treating a missing or non-numeric label as -1 so
+     * that one unparsable answer does not abort processing of the whole page.
+     *
+     * @param vote text of the vote counter, may be {@code null}
+     * @return the parsed count, or -1 when it cannot be parsed
+     */
+    private static int parseVoteCount(String vote) {
+        if (vote == null) {
+            return -1;
+        }
+        try {
+            return Integer.parseInt(vote.trim());
+        } catch (NumberFormatException ignored) {
+            // Not a plain integer: treat the answer as below the threshold.
+            return -1;
+        }
+    }
+
+    /** @return the site settings created with this processor */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /** Starts a five-thread crawl from a Zhihu question search, writing results through a FilePipeline. */
+    public static void main(String[] args) {
+        Spider.create(new ZhihuPageProcessor()).
+                addUrl("http://www.zhihu.com/search?type=question&q=java").
+                addPipeline(new FilePipeline("D:\\webmagic\\")).
+                thread(5).
+                run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java
new file mode 100644
index 000000000..7b38125cd
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java
@@ -0,0 +1,45 @@
+package us.codecraft.webmagic.samples.formatter;
+
+import us.codecraft.webmagic.model.formatter.ObjectFormatter;
+
+/**
+ * Formatter that inserts the extracted text into a {@code String.format}
+ * template supplied as the first extra parameter.
+ *
+ * @author yihua.huang@dianping.com
+ */
+public class StringTemplateFormatter implements ObjectFormatter<String> {
+
+    private String template;
+
+    /**
+     * @param raw the extracted text, passed as the template's single argument
+     * @return the template with {@code raw} substituted
+     * @throws Exception declared by the interface; {@code String.format}
+     *                   itself fails on a malformed or missing template
+     */
+    @Override
+    public String format(String raw) throws Exception {
+        return String.format(template, raw);
+    }
+
+    /** @return {@code String.class}, the type this formatter produces */
+    @Override
+    public Class<String> clazz() {
+        return String.class;
+    }
+
+    /**
+     * Takes the format template from the first extra parameter.
+     *
+     * @param extra formatter parameters; element 0 is the template
+     * @throws IllegalArgumentException if no template is supplied
+     */
+    @Override
+    public void initParam(String[] extra) {
+        if (extra == null || extra.length == 0) {
+            throw new IllegalArgumentException("StringTemplateFormatter requires a format template as its first parameter");
+        }
+        template = extra[0];
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java
new file mode 100644
index 000000000..4f38ecb1f
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java
@@ -0,0 +1,70 @@
+package us.codecraft.webmagic.samples.pipeline;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.pipeline.Pipeline;
+import us.codecraft.webmagic.utils.FilePersistentBase;
+
+import java.io.*;
+import java.util.Map;
+
+/**
+ * Pipeline that writes every result into one UTF-8 text file.
+ * The writer is opened in the constructor and released by {@link #close()}.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class OneFilePipeline extends FilePersistentBase implements Pipeline, Closeable {
+
+    private Logger logger = LoggerFactory.getLogger(getClass());
+
+    private PrintWriter printWriter;
+
+    /** Creates a pipeline using the default location "/data/webmagic/". */
+    public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
+        this("/data/webmagic/");
+    }
+
+    /**
+     * @param path location handed to both {@code setPath} and {@code getFile}
+     * @throws FileNotFoundException        if the output file cannot be opened
+     * @throws UnsupportedEncodingException if UTF-8 is not supported
+     */
+    public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException {
+        setPath(path);
+        printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8"));
+    }
+
+    /**
+     * Writes the request URL followed by one entry per result field; iterable
+     * values are written one element per line. Flushes after each result.
+     */
+    @Override
+    public synchronized void process(ResultItems resultItems, Task task) {
+        printWriter.println("url:\t" + resultItems.getRequest().getUrl());
+        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
+            if (entry.getValue() instanceof Iterable) {
+                Iterable<?> value = (Iterable<?>) entry.getValue();
+                printWriter.println(entry.getKey() + ":");
+                for (Object o : value) {
+                    printWriter.println(o);
+                }
+            } else {
+                printWriter.println(entry.getKey() + ":\t" + entry.getValue());
+            }
+        }
+        printWriter.flush();
+    }
+
+    /**
+     * Closes the writer so the underlying file handle is not leaked.
+     * NOTE(review): presumably invoked by the spider on shutdown for
+     * Closeable components - confirm against Spider.close().
+     */
+    @Override
+    public synchronized void close() {
+        printWriter.close();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java
new file mode 100644
index 000000000..2458c8a76
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java
@@ -0,0 +1,7 @@
+package us.codecraft.webmagic.samples.pipeline;
+
+/** Placeholder class: it declares no fields or methods.
+ * @author code4crafter@gmail.com
+ */
+public class ReplacePipeline {
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java
index ddbaa088b..3f2de70c5 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java
@@ -9,8 +9,9 @@
import us.codecraft.webmagic.scheduler.PriorityScheduler;
import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
-import static us.codecraft.webmagic.selector.Selectors.regex;
import static us.codecraft.webmagic.selector.Selectors.xpath;
/**
@@ -19,16 +20,16 @@
public class ZipCodePageProcessor implements PageProcessor {
private Site site = Site.me().setCharset("gb2312")
- .setSleepTime(100).addStartUrl("http://www.ip138.com/post/");
+ .setSleepTime(100);
@Override
public void process(Page page) {
if (page.getUrl().toString().equals("http://www.ip138.com/post/")) {
processCountry(page);
- } else if (page.getUrl().regex("http://www\\.ip138\\.com/post/\\w+[/]?$").toString() != null) {
- processProvince(page);
- } else {
+ } else if (page.getUrl().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").toString() != null) {
processDistrict(page);
+ } else {
+ processProvince(page);
}
}
@@ -45,28 +46,26 @@ private void processCountry(Page page) {
private void processProvince(Page page) {
//这里仅靠xpath没法精准定位,所以使用正则作为筛选,不符合正则的会被过滤掉
- List districts = page.getHtml().xpath("//body/table/tbody/tr/td").regex(".*http://www\\.ip138\\.com/post/\\w+/\\w+.*").all();
+ List districts = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();
+ Pattern pattern = Pattern.compile("([^<>]+) .*?href=\"(.*?)\"",Pattern.DOTALL);
for (String district : districts) {
- String link = xpath("//@href").select(district);
- String title = xpath("/text()").select(district);
- Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title);
- page.addTargetRequest(request);
+ Matcher matcher = pattern.matcher(district);
+ while (matcher.find()) {
+ String title = matcher.group(1);
+ String link = matcher.group(2);
+ Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title);
+ page.addTargetRequest(request);
+ }
}
}
private void processDistrict(Page page) {
String province = page.getRequest().getExtra("province").toString();
String district = page.getRequest().getExtra("district").toString();
- List counties = page.getHtml().xpath("//body/table/tbody/tr").regex(".*\\d+ .*").all();
- String regex = "]*>([^<>]+) ]*>([^<>]+) ]*>([^<>]+) ]*>([^<>]+) ";
- for (String county : counties) {
- String county0 = regex(regex, 1).select(county);
- String county1 = regex(regex, 2).select(county);
- String zipCode = regex(regex, 3).select(county);
- page.putField("result", StringUtils.join(new String[]{province, district,
- county0, county1, zipCode}, "\t"));
- }
- List links = page.getHtml().links().regex("http://www\\.ip138\\.com/post/\\w+/\\w+").all();
+ String zipCode = page.getHtml().regex("邮编:(\\d+) ").toString();
+ page.putField("result", StringUtils.join(new String[]{province, district,
+ zipCode}, "\t"));
+ List links = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
for (String link : links) {
page.addTargetRequest(new Request(link).setPriority(2).putExtra("province", province).putExtra("district", district));
}
@@ -79,11 +78,8 @@ public Site getSite() {
}
public static void main(String[] args) {
- Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).run();
+ Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).addUrl("http://www.ip138.com/post/");
- PriorityScheduler scheduler = new PriorityScheduler();
- Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(scheduler);
- scheduler.push(new Request("http://www.baidu.com/s?wd=webmagic&f=12&rsp=0&oq=webmagix&tn=baiduhome_pg&ie=utf-8"),spider);
spider.run();
}
}
diff --git a/webmagic-samples/src/main/resources/crawl.js b/webmagic-samples/src/main/resources/crawl.js
new file mode 100644
index 000000000..c9cf01cd1
--- /dev/null
+++ b/webmagic-samples/src/main/resources/crawl.js
@@ -0,0 +1,21 @@
+// PhantomJS script: loads the URL given as the first argument and prints
+// the page content, or a failure notice, to standard output.
+var system = require('system');
+var webpage = require('webpage');
+
+var targetUrl = system.args[1];
+
+var page = webpage.create();
+page.settings.loadImages = false;
+page.settings.resourceTimeout = 5000;
+
+function report(status) {
+    // Print the content on success; otherwise the fixed failure notice.
+    var output = status === 'success' ? page.content : "HTTP request failed!";
+    console.log(output);
+
+    page.close();
+    phantom.exit();
+}
+
+page.open(targetUrl, report);
\ No newline at end of file
diff --git a/webmagic-samples/src/main/resources/log4j.xml b/webmagic-samples/src/main/resources/log4j.xml
deleted file mode 100644
index a6630f813..000000000
--- a/webmagic-samples/src/main/resources/log4j.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-samples/src/main/resources/log4j2.xml b/webmagic-samples/src/main/resources/log4j2.xml
new file mode 100644
index 000000000..f3bad53d8
--- /dev/null
+++ b/webmagic-samples/src/main/resources/log4j2.xml
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
index dbfa81548..f8dfb9793 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
@@ -18,7 +18,7 @@ public class SpiderTest {
@Ignore
@Test
public void testSpider() throws InterruptedException {
- Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline());
+ Spider me = Spider.create(new HuxiuProcessor()).addPipeline(new FilePipeline());
me.run();
}
@@ -28,10 +28,10 @@ public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
- SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
+ SimplePageProcessor pageProcessor2 = new SimplePageProcessor( "http://www.diaoyuweng.com/thread-*-1-1.html");
System.out.println(pageProcessor2.getSite().getCharset());
pageProcessor2.getSite().setSleepTime(500);
- Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
+ Spider.create(pageProcessor2).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
index 5513305d7..7c6192692 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
@@ -17,7 +17,7 @@ public class ProcessorBenchmark {
@Ignore
@Test
public void test() {
- ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class);
+ ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);
Page page = new Page();
page.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
@@ -150,7 +150,7 @@ public void test() {
"#MyResume textarea {width:170px;height:60px;font-size:9pt;}\n" +
"\n" +
"\n" +
- "码农一枚 实用主义者 抵制重复造轮子,却造了不少轮子 http://codecraft.us
\n" +
+ "码农一枚 实用主义者 抵制重复造轮子,却造了不少轮子 http://codecraft.us\n" +
"\n" +
"\n" + "\n" + " \n" + " \n" + " \n" + "