攻城小弟 2019-08-11 11:32 采纳率: 0%
浏览 1136

JAVA爬虫 爬取 https://www.zhipin.com/?ka=header-home 出现问题

JAVA爬虫 爬取 https://www.zhipin.com/?ka=header-home 出现问题 ,自己使用了ip代理

public static Elements getJsoupDocGet(String url) throws  IOException {
        Document doc = null;
        Random random = new Random();
        int i = 0;
        while (i < 10 && doc == null) {
            try {
                trustEveryone();
                IpProxy ipProxy = getRandomIP();
                System.out.println(ipProxy.getIp()+"   "+ipProxy.getPort());
                System.setProperty("http.maxRedirects", "50");
                System.getProperties().setProperty("proxySet", "true");
                System.getProperties().setProperty("http.proxyHost", ipProxy.getIp());
                System.getProperties().setProperty("http.proxyPort", ipProxy.getPort());
                doc =  
                        Jsoup.connect(url)
                        .userAgent(getUserAgent(ua))
                        .header("Accept-Encoding", "gzip, deflate,br")
                        .header("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
                        .header("User-Agent", "Mozilla/31.0 (compatible; MSIE 10.0; Windows NT; DigExt)")
                        .header("Content-Type", "application/x-www-form-urlencoded")
                        .header("Cache-Control","no-cache" )
                        .header("Pragma", "no-cache")
                        .header("Upgrade-Insecure-Requests", "1")
                        .header("Connection", "Keep-alive")
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .ignoreContentType(true)
                        .ignoreHttpErrors(true)
                        .timeout(1000 * 60)
                        .cookie("auth", "token")
                        .post();
                if (doc != null) {
                    System.out.println("代理成功");
                    //System.out.println(doc.toString());
                    return doc.select("body");
                }
            } catch (Exception e) {

            }
            i++;
        }

        if (doc == null) {
                doc = Jsoup.connect(url).header("Accept", "*/*").header("Accept-Encoding", "gzip, deflate")
                        .header("Accept-Encoding", "gzip, deflate")
                        .header("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
                        .header("User-Agent", "Mozilla/31.0 (compatible; MSIE 10.0; Windows NT; DigExt)")
                        .header("Content-Type", "application/x-www-form-urlencoded")
                        .header("Cache-Control","no-cache" )
                        .header("Pragma", "no-cache")
                        .header("Upgrade-Insecure-Requests", "1")
                        .header("Connection", "Keep-alive")
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .timeout(1000 * 60)
                        .userAgent(getUserAgent(ua))
                        .cookie("auth", "token")
                        .post();
        }
            return doc.select("body");
    }

爬取网站时出现了下面的问题
但是有浏览器可以正常看,且代码也是完整的

代理成功
<body> 
 <div style="font-size: 12px;">
  请稍后...
 </div> 
 <input type="hidden" id="page_key_name" value="security_bridge"> 
 <script>
        var _T=_T||[];!function(){var b,a=document.createElement("script");a.src="https://static.zhipin.com/library/js/analytics/ka.js",b=document.getElementsByTagName("script")[0],b.parentNode.insertBefore(a,b)}(),function(){var e,f,g,h,a=function(a,b){var c,d,e,f;"object"!=typeof a&&(a=[a]),c=document.getElementsByTagName("head").item(0)||document.documentElement,d=new Array,e=a.length-1,f=function(g){d[g]=document.createElement("script"),d[g].setAttribute("type","text/javascript"),d[g].setAttribute("charset","UTF-8"),d[g].onload=d[g].onreadystatechange=function(){this.onload=this.onreadystatechange=null,this.parentNode.removeChild(this),g!=e?f(g+1):"function"==typeof b&&b()},d[g].setAttribute("src",a[g]),c.appendChild(d[g])},f(0)},b=function(a){var b=new RegExp("(^|&)"+a+"=([^&]*)(&|$)"),c=window.location.search.substr(1).match(b);return null!=c?unescape(c[2]):null},c={get:function(a){var b,c=new RegExp("(^| )"+a+"=([^;]*)(;|$)");return(b=document.cookie.match(c))?unescape(b[2]):null},set:function(a,b,c,d,e){var g,f=a+"="+encodeURIComponent(b);c&&(g=new Date(c).toGMTString(),f+=";expires="+g),f=d?f+";domain="+d:f,f=e?f+";path="+e:f,document.cookie=f}};window.location.href,e=decodeURIComponent(b("seed"))||"",f=b("ts"),g=b("name"),h=decodeURIComponent(b("callbackUrl")),e&&f&&g&&a("security-js/"+g+".js",function(){var a=(new Date).getTime()+2304e5,b=".zhipin.com",d=(new ABC).z(e,parseInt(f));window.location.host.indexOf(".weizhipin.com")>-1&&(b=".weizhipin.com"),c.set("__zp_stoken__",d,a,b,"/");try{_T.sendEvent("security_bridge_"+d)}catch(g){}h?window.location.href=h:window.history.back()})}();
    </script>  
</body>

求大佬给出解答方式,求救救,自己在网上搜的设置ip代理代码,应该没错,别的网页可以,只有自己爬取的网页有问题

  • 写回答

2条回答

  • threenewbee 2019-08-11 11:37
    关注

    抓包看下你的http请求对不对,还有你的代理是不是匿名代理,如果不是,你的本机ip还是会通过x-forward-for传给服务器

    评论

报告相同问题?

悬赏问题

  • ¥15 Jenkins+k8s部署slave节点offline
  • ¥15 微信小游戏反编译后,出现找不到分包的情况
  • ¥15 如何实现从tello无人机上获取实时传输的视频流,然后将获取的视频通过yolov5进行检测
  • ¥15 WPF使用Canvas绘制矢量图问题
  • ¥15 用三极管设计一个单管共射放大电路
  • ¥15 孟德尔随机化r语言运行问题
  • ¥15 pyinstaller编译的时候出现No module named 'imp'
  • ¥15 nirs_kit中打码怎么看(打码文件是csv格式)
  • ¥15 怎么把多于硬盘空间放到根目录下
  • ¥15 Matlab问题解答有两个问题