package alp.starcode.example;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Component
public class NewsProcessor implements PageProcessor {

    // Part 1: crawler configuration — desktop Chrome user agent, 3 retries on
    // failure, and a 1-second delay between requests (politeness throttle).
    private Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
            .setRetryTimes(3)
            .setSleepTime(1000);

    // NOTE(review): these mutable fields are shared by every page this processor
    // handles. With thread(5) in main() that is a data race, and because the
    // listing loop overwrites `news` per item, only the LAST item's image URL
    // survives until the detail page is parsed. Consider carrying per-request
    // state (e.g. via Request extras) instead of instance fields.
    private News news = new News();
    private List<Map<String, String>> mapList = new ArrayList<>();

    /**
     * Core extraction logic (the {@link PageProcessor} contract).
     * <p>
     * A page containing ".list > .news-item" nodes is treated as a listing page:
     * each item's title/abstract is collected into {@code mapList} and the item's
     * detail link is queued for crawling. Any other page is treated as a detail
     * page and handed to {@link #parseInfo(Page, News)}.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        // Listing pages are recognized by the presence of news-item nodes.
        List<Selectable> nodes = html.css(".list > .news-item").nodes();
        if (!nodes.isEmpty()) {
            for (Selectable node : nodes) {
                // Summary fields shown on the listing page.
                String abstractContent = node.css(".news-item > .item-content > p", "text").get();
                String title = node.css(".news-item > .item-content > .text-hidden> a", "text").get();
                // Thumbnail URL of this list item. Fix: the original referenced an
                // undefined `imgUrl` variable (compile error); extract it from the
                // item's <img> tag — TODO confirm the selector against the live markup.
                String imgUrl = node.xpath("//img/@src").get();

                Map<String, String> map = new HashMap<>();
                map.put("title", title);
                map.put("abstract", abstractContent);
                mapList.add(map);

                news = new News();
                news.setImgUrl(imgUrl);

                // Queue the item's detail page; it will be processed by the
                // else branch on a later crawl round.
                String detailUrl = node.xpath("//a/@href").get();
                if (detailUrl != null) {
                    List<String> urls = new ArrayList<>();
                    urls.add(detailUrl);
                    page.addTargetRequests(urls);
                }
            }
        } else {
            // Detail page: fill in the remaining fields and publish to the pipeline.
            parseInfo(page, news);
        }
    }

    /**
     * Parses a news detail page and publishes the result to the pipeline.
     *
     * @param page the detail page being processed
     * @param news the News object to populate (carries the image URL that was
     *             set while the listing page was processed)
     */
    public void parseInfo(Page page, News news) {
        Html html = page.getHtml();
        String content = html.xpath("//div[@class='article-content']").toString();
        String time = html.css("article > .article-header >.clearfix > .article-from > span", "text").get();
        String name = html.css("article > .article-header >.clearfix > .article-from > .info-detail", "text").get();
        String title = html.css("article > .article-header > h1", "text").get();
        news.setAuthor(name);
        news.setTitle(title);
        news.setContent(content);
        news.setPublishTime(time);
        // Hand the results to the pipeline (consumed by NewsPipeline).
        page.putField("news", news);
        page.putField("mapList", mapList);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Starts a single-threaded crawl of the news listing (one thread to avoid hammering the site). */
    public void crawling() {
        Spider.create(new NewsProcessor())
                // custom result pipeline
                .addPipeline(new NewsPipeline())
                .addUrl("http://www.echinagov.com/news/")
                .thread(1)
                .run();
    }

    /** Standalone entry point: crawls one listing page with 5 worker threads. */
    public static void main(String[] args) {
        Spider.create(new NewsProcessor())
                // custom result pipeline
                .addPipeline(new NewsPipeline())
                .addUrl("http://www.echinagov.com/node/117_3/")
                .thread(5)
                .run();
    }
}
如果想把这个爬虫改成抓取另一个页面布局不同的网站,除了已经会改的 addUrl 和已经改好的 UserAgent 之外,还需要修改哪些地方?主要是:process 中区分列表页/详情页的 CSS 选择器(".list > .news-item" 等)、列表项里标题/摘要/图片/详情链接的选择器,以及 parseInfo 中详情页正文、时间、作者、标题对应的 XPath/CSS 选择器——这些都要按新网站的 HTML 结构重写。