out-println 2017-08-17 06:39 采纳率: 0%
浏览 2540
已结题

jsoup 获取数据出错。换了代理ip也不行

如题。jsoup获取数据出错。用了代理去访问也不行。
先贴爬取页面的代码

 @Slf4j
@Component
public class SpiderUtil {

    @Resource
    private DynamicIpUtil dynamicIpUtil;

    /**
     * 根据url爬取页面信息
     *
     * @param url url
     * @return 页面信息
     */
    public Document spiderDocument(String url) {
        Document pageDoc = null;
        try {
            Connection con= Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)")
                    .timeout(5000);
            /*.ignoreHttpErrors(true)
            .followRedirects(true)*/
            Connection.Response resp = con.execute();
            if (resp.statusCode() == 200){
                pageDoc = con.get();
            } else {
                log.error("http status error");
                dynamicIpUtil.changeMyIp();
                spiderDocument(url);
            }
            if(pageDoc == null || pageDoc.toString().trim().equals("")) {// 表示ip被拦截或者其他情况
                log.error("ip被拦截 无内容");
                dynamicIpUtil.changeMyIp();
                spiderDocument(url);
            }

        } catch (Exception e) {
            log.error("ip被拦截 异常: {}", e);
            dynamicIpUtil.getMyIpInfo();
            dynamicIpUtil.changeMyIp();
            spiderDocument(url);
        }
        if (ipDefensed(url, pageDoc)) {
            // 如果被ip限制了,更换动态ip
            dynamicIpUtil.changeMyIp();
            spiderDocument(url);
        }
        return pageDoc;
    }

    /**
     * 判断ip是否被封
     *
     * @param pageDoc 页面信息
     * @return ip
     */
    private boolean ipDefensed(String url, Document pageDoc) {
        boolean ipDefensed = false;
        if (url.contains("anjuke.com")) {
            ipDefensed = AJKIpDefense(pageDoc);
        }
        return ipDefensed;
    }


    /**
     * 安居客判断ip是否被封
     *
     * @param pageDoc 页面信息
     */
    private boolean AJKIpDefense(Document pageDoc) {
        log.error("ip 被拦截 安居客");
        boolean ajkppDefensed = false;
        String title = pageDoc.title();
        if (title.equals("访问验证-安居客")) {
            ajkppDefensed = true;
        }
        return ajkppDefensed;
    }
}

再贴换动态ip的代码

 @Slf4j
@Component
public class DynamicIpUtil {

    private static List<String[]> ipAndPorts = new ArrayList<String[]>();

    private static Integer ipPageNum = 1;

    /**
     * 更换动态ip
     */
    public void changeMyIp() {
        String [] ipAndPort = getDynamicIpAndPort();
        String ip = ipAndPort[0];
        String port = ipAndPort[1];
        System.setProperty("http.maxRedirects", "50");
        System.setProperty("https.maxRedirects", "50");
        System.getProperties().setProperty("proxySet", "true");
        System.getProperties().setProperty("http.proxyHost", ip);
        System.getProperties().setProperty("http.proxyPort", port);
        System.getProperties().setProperty("https.proxyHost", ip);
        System.getProperties().setProperty("https.proxyPort", port);
    }

    /**
     * 获取ip信息
     */
    public void getMyIpInfo(){
        try {
            Document ipDoc = Jsoup.connect("http://www.ip.cn")
                    .userAgent("Mozilla")
                    .timeout(3000)
                    .get();
            if(ipDoc != null){
                String ipInfo = ipDoc.select(".well").first().text();
                log.info("更换ip 成功: {}", ipInfo);
            }
        } catch (Exception e) {
            log.info("暂不能获取ip 信息");
        }
    }

    /**
     * 获取动态ip
     *
     * @return 动态ip
     */
    private String[] getDynamicIpAndPort() {
        String[] ipAndPort = null;
        if (ipAndPorts != null && ipAndPorts.size() > 0) {
            ipAndPort = ipAndPorts.get(0);
            ipAndPorts.remove(0);
        } else {
            try {
                Document pageDoc = Jsoup.connect("http://www.xicidaili.com/wn/" + ipPageNum)
                        .userAgent("Mozilla")
                        .timeout(5000)
                        .get();
                Elements elements = pageDoc.select("tr.odd");
                ipPageNum ++;
                if(ipPageNum > 400){
                    ipPageNum = 1;
                }
                for(Element element : elements){
                    String[] ipPort = new String[2];
                    String ip = element.child(1).text();
                    String port = element.child(2).text();
                    String noName = element.child(4).text();
//                    if(!noName.equals("高匿")){
//                        continue;
//                    }
                    String speedStr = element.child(6).select(".bar").first().attr("title");
                    double speed = Double.valueOf(speedStr.substring(0, speedStr.indexOf("秒")));
                    String timeStr = element.child(7).select(".bar").first().attr("title");
                    double time = Double.valueOf(timeStr.substring(0, timeStr.indexOf("秒")));
                    if(speed <= 1 && time <= 1){
                        ipPort[0] = ip;
                        ipPort[1] = port;
                        ipAndPorts.add(ipPort);
                    }
                }
                return getDynamicIpAndPort();
            } catch (IOException e) {
                log.error("get DynamicIpError error info :\n {}", e);
            }
        }
        return ipAndPort;
    }
}

如上。在获取这个网页上的数据时会出现问题:https://cd.zu.anjuke.com/fangyuan/p1/

具体的错误有几种。
java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.read(SocketInputStream.java:150)
at java.net.SocketInputStream.read(SocketInputStream.java:121)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:703)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:647)
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2000)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)

org.jsoup.HttpStatusException: HTTP error fetching URL
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:590)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:587)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)

java.io.IOException: Unable to tunnel through proxy. Proxy returns "HTTP/1.1 503 Too many open connections"
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2084)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)

java.net.SocketException: Unexpected end of file from server
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:790)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:647)
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2000)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:587)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)

请各位高手帮我看下。谢谢了。在线等。急急急急急急!!!!!

  • 写回答

2条回答 默认 最新

  • threenewbee 2017-08-17 15:58
    关注

    代理服务器是匿名代理么?如果不是,还是可以被追踪到ip的。

    评论

报告相同问题?

悬赏问题

  • ¥20 java-OJ-健康体检
  • ¥15 rs485的上拉下拉,不会对a-b<-200mv有影响吗,就是接受时,对判断逻辑0有影响吗
  • ¥15 使用phpstudy在云服务器上搭建个人网站
  • ¥15 应该如何判断含间隙的曲柄摇杆机构,轴与轴承是否发生了碰撞?
  • ¥15 vue3+express部署到nginx
  • ¥20 搭建pt1000三线制高精度测温电路
  • ¥15 使用Jdk8自带的算法,和Jdk11自带的加密结果会一样吗,不一样的话有什么解决方案,Jdk不能升级的情况
  • ¥15 画两个图 python或R
  • ¥15 在线请求openmv与pixhawk 实现实时目标跟踪的具体通讯方法
  • ¥15 八路抢答器设计出现故障