lma8811
Lomon----
2018-09-07 15:58
采纳率: 18.2%
浏览 2.1k
已采纳

java采集页面显示202状态

最近测试采集公共资源交易的页面,出来202问题,无法采集最终页面内容,希望给予页面采集的完整JAVA代码,谢谢!

附测试代码:
package asptest;

import java.io.IOException;
import java.net.URI;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie2;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
/*
 * author: 合肥工业大学 管院学院 钱洋
 * 1563178220@qq.com
 */
    /**
     * Small command-line crawler test that downloads one page with Apache
     * HttpClient and prints the HTTP status code and, on success, the page body.
     */
    public class Testggzy {

        /** Browser-like User-Agent sent as a real request header. */
        private static final String USER_AGENT =
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36";

        /** Maximum time to wait for the TCP connection, in milliseconds. */
        private static final int CONNECT_TIMEOUT_MS = 6000;

        /** Maximum time to wait for data on an open connection, in milliseconds. */
        private static final int SOCKET_TIMEOUT_MS = 6000 * 20;

        /** Number of automatic retries performed by the retry handler. */
        private static final int MAX_RETRIES = 5;

        /** Pause after a non-200 response: 20 minutes, in milliseconds. */
        private static final long BACKOFF_MS = 20L * 60 * 1000;

        /**
         * Fetches one hard-coded sample page.
         *
         * @param args unused
         */
        public static void main(String[] args) throws ClientProtocolException, IOException, InterruptedException {
            getRawHTML("http://www.ggzy.gov.cn/information/html/b/500000/0201/201808/30/005073ad2bc4036b4335a46cf421674b341f.shtml");
        }

        /**
         * Performs an HTTP GET on {@code url} and returns the response body.
         *
         * <p>The status code is always printed. Only status 200 is treated as
         * success; the body is then printed and returned. For any other status
         * (including 202) the response is discarded, the method sleeps for
         * {@link #BACKOFF_MS} and returns an empty string.
         *
         * @param url absolute URL of the page to download
         * @return the page body on status 200, otherwise {@code ""}
         * @throws ClientProtocolException on an HTTP protocol error
         * @throws IOException on a connection or read failure
         * @throws InterruptedException if the back-off sleep is interrupted
         */
        public static String getRawHTML(String url) throws ClientProtocolException, IOException, InterruptedException {
            String rawHTML = "";
            int statusCode;

            DefaultHttpClient httpclient = new DefaultHttpClient();
            try {
                // Client configuration: cookie policy, timeouts and retry handler.
                HttpParams params = httpclient.getParams();
                params.setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY);
                HttpConnectionParams.setConnectionTimeout(params, CONNECT_TIMEOUT_MS);
                HttpConnectionParams.setSoTimeout(params, SOCKET_TIMEOUT_MS);
                httpclient.setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(MAX_RETRIES, true));

                // Per-request context with its own (initially empty) cookie store.
                HttpContext localContext = new BasicHttpContext();
                localContext.setAttribute(ClientContext.COOKIE_STORE, new BasicCookieStore());

                HttpGet request = new HttpGet();
                request.setURI(URI.create(url));
                // These values were previously stored as cookies, which never reach
                // the server as headers; send them as actual request headers.
                request.setHeader("User-Agent", USER_AGENT);
                request.setHeader("Content-Type", "text/html;charset=UTF-8");

                HttpResponse response = httpclient.execute(request, localContext);
                statusCode = response.getStatusLine().getStatusCode();
                System.out.println(statusCode);

                if (statusCode == 200) {
                    // UTF-8 is only the fallback used when the response does not
                    // declare a charset; toString also fully consumes the entity.
                    rawHTML = EntityUtils.toString(response.getEntity(), "UTF-8");
                    System.out.println(rawHTML);
                } else {
                    // Drain the body so the connection can be released cleanly.
                    EntityUtils.consume(response.getEntity());
                }
            } finally {
                // Always release the client, even when the request throws.
                httpclient.close();
            }

            if (statusCode != 200) {
                // Back off only after the connection has been closed.
                Thread.sleep(BACKOFF_MS);
            }
            return rawHTML;
        }
    }

  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 邀请回答

3条回答 默认 最新

  • lma8811
    Lomon---- 2019-01-01 15:07
    已采纳

    http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp
    改为从这个地址找到请求数据的格式,读取返回的JSON,即可解决链接的问题

    点赞 评论
  • zqbnqsdsmd
    zqbnqsdsmd 2018-09-07 16:00

    1563178220@qq.com,既然有邮箱,你问问这个人

    点赞 评论
  • weixin_39416561
    lyhsdy 2018-09-10 01:45

    访问的网址 getRawHTML("http://www.ggzy.gov.cn/information/html/b/500000/0201/201808/30/005073ad2bc4036b4335a46cf421674b341f.shtml") 打开的网址就是打不开的,修改下这里你所需要的网址就可以了

    点赞 评论

相关推荐