需要对这个页面(http://liuyan.people.com.cn/threads/list?fid=565)进行设计采集程序,由于“加载更多”就是当页最后一个ID,如:
http://liuyan.people.com.cn/threads/list?fid=565
的JSON数据页面为
http://liuyan.people.com.cn/threads/queryThreadsList?fid=565&lastItem=0
JSON数据最后一个TID是7127843,那第二页即是
http://liuyan.people.com.cn/threads/queryThreadsList?fid=565&lastItem=7127843
JSON数据最后一个TID是7125108,那第三页即是
http://liuyan.people.com.cn/threads/queryThreadsList?fid=565&lastItem=7125108
依此类推,当responseData中数据条目小于10条或空时,即结束。
以下代码会采集切换5~6页后,在执行DealjsonData()的response = httpClient.execute(httpPost);后报错org.apache.http.NoHttpResponseException: liuyan.people.com.cn:80 failed to respond
有什么好的解决办法,如测试通过就全部给分,谢谢。
以下是部分代码借鉴。
public static void gettitle(String tablename,String province,String cityname,String district,String leaderstype,String leadersname,String queryurl,String url,String proxyname,int proxyport)
{
String[] Pagecountsp = null;
boolean tmptable=true;
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
HttpClient httpClient = null;
HttpPost httpPost = null;
HttpResponse response = null;
try{
//WebClient webClient = new WebClient(BrowserVersion.CHROME,"45.63.121.157",80);
DataBaseHelper dbh = new DataBaseHelper(tablename);
dbh.onCreate(tablename);
String tmp_tablename=dbh.onCreatetmp(tablename); //建立临时库和清空临时库
System.out.println(df.format(System.currentTimeMillis())+" "+"queryurl:"+queryurl+" tablename:"+tablename+" leadersname:"+leadersname);
HtmlPage page=null;
int pagenum=20;
WebClient webClient = new WebClient(BrowserVersion.CHROME);
ProxyConfig proxyConfig = new ProxyConfig(proxyname,proxyport);
webClient.getOptions().setTimeout(3000000);
//webClient.getOptions().setProxyConfig(proxyConfig);
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);
httpClient = HttpClients.createDefault();
httpPost = new HttpPost(queryurl);// 传入URL地址
httpPost.addHeader("Accept",
"pplication/json, text/javascript, */*; q=0.01");
httpPost.addHeader("Accept-Encoding", "gzip, deflate");// 设置请求头
httpPost.addHeader("Accept-Language",
"zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");// 设置请求头
//httpPost.addHeader("Connection", "keep-alive");// 设置请求头
// httpPost.addHeader("Content-Length", "19");//设置请求头
//httpPost.addHeader("Content-Type",
// "application/x-www-form-urlencoded; charset=UTF-8");// 设置请求头
httpPost.addHeader("Referer",
"http://liuyan.people.com.cn/threads/list?fid=1079");// 设置请求头
//httpPost.addHeader("User-Agent",
// "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/64.0");// 设置请求头
//httpPost.addHeader("X-Requested-With", "XMLHttpRequest");// 设置请求头
response = httpClient.execute(httpPost);// 获取响应
int statusCode = response.getStatusLine().getStatusCode();
//System.out.println("HTTP Status Code:" + statusCode);
if (statusCode != HttpStatus.SC_OK) {
//System.out.println("HTTP请求未成功!HTTP Status Code:");//+response.getStatusLine())
}
HttpEntity httpEntity = response.getEntity();
String reponseContent = EntityUtils.toString(httpEntity);
EntityUtils.consume(httpEntity);
reponseContent=reponseContent.replaceAll("null", "0");
//System.out.println("响应内容:" + reponseContent);
if(reponseContent.length()>20){
JSONObject dealList_findjson = JSONObject.fromObject(reponseContent);
//System.out.println(dealList_findjson.toString());
//System.out.println("页数:"+dealList_findjson.getInt("ttlpage")+"记录数:"+dealList_findjson.getInt("ttlrow"));
//int RecordCou = dealList_findjson.getInt("ttlrow"); //记录数
//int PageCou = dealList_findjson.getInt("ttlpage"); //页数
//int RecordCou = PageCou*pagenum; //预计最大记录总数,最后一页可能不是20
int DBcout=dbh.DBcount(tablename,leadersname); //
//System.out.println(" 总页数:"+PageCou+" 记录数:"+RecordCou+" 数据库最早时间:"+DBnewstime+" 近1月内数据库条目:"+DB3cout+" 数据库总条目:"+DBcout);
if(dealList_findjson.size()>0){ //在今天截止到表中最新时间的URL筛选如有新数据,由于可以直接调整WEB显示时间,不用增量模式,改用选择模式
System.out.println("开始采集:");
//System.out.println(doc.toString());
int continues=getListData(dbh,queryurl,url,dealList_findjson,tmp_tablename,tablename,province,cityname,district,leaderstype,leadersname);
for (int i=0;continues>0;i++) //PageCount-&&i<2
{ //continues代表只有不到10条数据,意味着结尾了,默认是0,否则tid=continues;
//System.out.println(continues);
reponseContent=DealjsonData(queryurl,continues); //调用读取XHR的JSON数据
if(reponseContent.indexOf("success")>1){
reponseContent=reponseContent.replaceAll("null", "0");
//System.out.println("响应内容:" + reponseContent);
if(reponseContent.length()>20){
dealList_findjson = JSONObject.fromObject(reponseContent); //转化为json
continues=getListData(dbh,queryurl,url,dealList_findjson,tmp_tablename,tablename,province,cityname,district,leaderstype,leadersname); //getListData对获取的JSON数据进行返回最后一条TID值,用于翻页,否则为0结束,插入数据。
//System.out.println("continues:" + continues+" i:" + i);
}
else{System.out.println("获取JSON数据异常l!");}
}
else{System.out.println("获取XHR的JSON数据失败!");
break;}
}
}
dbh.Dedup_Add_table(tablename,tmp_tablename,province,leaderstype,leadersname,cityname,district,tmptable); //增加了临时的数据库,进行重删方式新增
}
//20180114改为不删除临时tmp库,即无论tmp_zfcg0或tmpzfcg3;本来每次在onCreatetmp头部就会清空库truncate
//dbh.TBdel(tmp_tablename); //删除临时库
webClient.close();
} catch(Exception e){
// TODO Auto-generated catch block
e.printStackTrace();
StringWriter sw = new StringWriter();
e.printStackTrace(new PrintWriter(sw, true));
String strs = sw.toString();
log.error("url:"+url+" gettitle error:"+ strs);
}
}
//获取XHR中的JSON数据
static String DealjsonData(String url,int pageNo) {
String url1=url.substring(0,url.lastIndexOf("lastItem=")+9)+pageNo;
System.out.println("url:"+url1);
//ConnectionConfig connectionConfig = ConnectionConfig.custom().setBufferSize(4128).build();
//HttpClient httpClient = HttpClients.custom().setDefaultConnectionConfig(connectionConfig).build();
HttpClient httpClient = null;
HttpPost httpPost = null;
HttpResponse response = null;
String reponseContent=null;
try{
httpClient = HttpClients.createDefault();
httpPost = new HttpPost(url1);// 传入URL地址
httpPost.addHeader("Accept","pplication/json, text/javascript, */*; q=0.01");
httpPost.addHeader("Accept-Encoding", "gzip, deflate");// 设置请求头
httpPost.addHeader("Accept-Language","zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");// 设置请求头
httpPost.addHeader("Connection", "keep-alive");// 设置请求头
// httpPost.addHeader("Content-Length", "19");//设置请求头
httpPost.addHeader("Content-Type","application/x-www-form-urlencoded; charset=utf-8");// 设置请求头
httpPost.addHeader("Referer",
"http://liuyan.people.com.cn/threads/list?fid=1079");// 设置请求头
//httpPost.addHeader("User-Agent",
// "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/64.0");// 设置请求头
//httpPost.addHeader("X-Requested-With", "XMLHttpRequest");// 设置请求头
response = httpClient.execute(httpPost);// 获取响应
int statusCode = response.getStatusLine().getStatusCode();
//System.out.println("HTTP Status Code:" + statusCode);
if (statusCode != HttpStatus.SC_OK) {
//System.out.println("HTTP请求未成功!HTTP Status Code:"+ response.getStatusLine());
}
HttpEntity httpEntity = response.getEntity();
reponseContent = EntityUtils.toString(httpEntity);
EntityUtils.consume(httpEntity);
//System.out.println("响应内容:" + reponseContent);
return reponseContent;
} catch(Exception e){
// TODO Auto-generated catch block
e.printStackTrace();
StringWriter sw = new StringWriter();
e.printStackTrace(new PrintWriter(sw, true));
String strs = sw.toString();
log.error("table_name:"+url+"url1:"+url1+" ListData error:"+ strs);
return reponseContent;
}
}