// Code
package my.webmagic;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
public class GetComments implements PageProcessor {
// 对爬取站点的一些属性进行设置,例如:设置域名,设置代理等;
private Site site = Site.me().setDomain("10.0.47.153:32400").setSleepTime(2000);
public Site getSite() {
return site;
}
public void process(Page page) {
page.putField("ten", page.getJson().toString());
}
public static void main(String[] args) {
String url_init = "http://10.0.47.153:32400/jdjson?callback=fetchJSON_comment98vv111&productId=39215375204&score=0&sortType=5&pageSize=10&isShadowSku=0&fold=1&page=1";
String url_pattern = "http://10.0.47.153:32400/jdjson?callback=fetchJSON_comment98vv111&productId=39215375204&score=0&sortType=5&pageSize=10&isShadowSku=0&fold=1&page=";
String output = "/data/edu1/tmp/";
QueueScheduler scheduler = new QueueScheduler();
Spider spider = Spider.create(new GetComments()).addUrl(url_init)
.setScheduler(scheduler)
.addPipeline(new JsonFilePipeline(output))
.addPipeline(new ConsolePipeline());
for (int i = 0; i < 100; i++) {
Request request = new Request();
request.setUrl(url_pattern + i);
scheduler.push(request, spider);
}
spider.thread(5).run();
}
}
// Run screenshot (omitted from source)