qq_35156372 2016-08-27 08:23 采纳率: 28.6%
浏览 2062

java爬虫求助,为什么报异常了,怎么解决

package org;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * Minimal page downloader built on Apache Commons HttpClient 3.x.
 *
 * <p>Issues a POST request for a URL through a fixed HTTP proxy and, when the
 * server answers {@code 200 OK}, stores the response body in a file in the
 * current working directory.
 */
public class RetrivePage {
    /** Size of the buffer used when copying the response body to disk. */
    private static final int BUFFER_SIZE = 8192;

    /** File name used when the URL does not end in a usable last path segment. */
    private static final String DEFAULT_FILE_NAME = "index.html";

    private static final HttpClient httpClient = new HttpClient();

    // Route every request through the proxy server.
    static {
        // IP address and port of the proxy server.
        // NOTE(review): every connection is opened to this address; if it is not
        // reachable from the machine running the program, executeMethod() will
        // presumably fail with a connect timeout - verify the proxy is correct.
        httpClient.getHostConfiguration().setProxy("172.17.18.84", 8080);
    }

    /**
     * Downloads the resource at {@code path} with a POST request and saves the
     * response body to a local file named after the last path segment of the URL.
     *
     * @param path absolute URL of the page to fetch
     * @return {@code true} if the server answered 200 and the body was written to
     *         disk; {@code false} for any other status (redirects included) or
     *         when the response carries no body
     * @throws HttpException on an HTTP protocol violation
     * @throws IOException   on a transport error or a failure writing the file
     */
    public static boolean downloadPage(String path) throws HttpException,
            IOException {
        // Build the POST request (no form parameters are sent).
        PostMethod postMethod = new PostMethod(path);
        try {
            // Execute the request and obtain the status code.
            int statusCode = httpClient.executeMethod(postMethod);

            // For simplicity only a 200 response is treated as success.
            if (statusCode == HttpStatus.SC_OK) {
                InputStream input = postMethod.getResponseBodyAsStream();
                if (input == null) {
                    // No body available: nothing to save.
                    return false;
                }
                saveBody(input, fileNameFor(path));
                return true;
            }

            // Redirect responses (301/302/303/307) are not followed; like any
            // other non-200 status they are reported as a failed download.
            return false;
        } finally {
            // Always hand the connection back to the connection manager,
            // whether the request succeeded, failed or threw.
            postMethod.releaseConnection();
        }
    }

    /**
     * Derives the local file name from the text after the last {@code '/'} of
     * the URL, falling back to {@link #DEFAULT_FILE_NAME} when that text is empty
     * (for example when the URL ends with a slash).
     */
    private static String fileNameFor(String path) {
        String name = path.substring(path.lastIndexOf('/') + 1);
        return name.isEmpty() ? DEFAULT_FILE_NAME : name;
    }

    /**
     * Copies {@code input} to the file {@code filename} and closes both streams,
     * even when the copy fails part-way.
     */
    private static void saveBody(InputStream input, String filename)
            throws IOException {
        try {
            OutputStream output = new FileOutputStream(filename);
            try {
                byte[] buffer = new byte[BUFFER_SIZE];
                int read;
                // read() signals end of stream with -1; a count of 0 or a zero
                // byte in the data must not terminate the copy.
                while ((read = input.read(buffer)) != -1) {
                    output.write(buffer, 0, read);
                }
            } finally {
                output.close();
            }
        } finally {
            input.close();
        }
    }

    /**
     * Test driver: fetches the lietu home page and saves it to disk.
     */
    public static void main(String[] args) {
        try {
            RetrivePage.downloadPage("http://www.lietu.com");
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

下面是异常
八月 27, 2016 4:16:28 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
八月 27, 2016 4:16:28 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request
八月 27, 2016 4:16:49 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
八月 27, 2016 4:16:49 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request
八月 27, 2016 4:17:10 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
八月 27, 2016 4:17:10 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request
java.net.ConnectException: Connection timed out: connect
at java.net.DualStackPlainSocketImpl.connect0(Native Method)
at java.net.DualStackPlainSocketImpl.socketConnect(Unknown Source)
at java.net.AbstractPlainSocketImpl.doConnect(Unknown Source)
at java.net.AbstractPlainSocketImpl.connectToAddress(Unknown Source)
at java.net.AbstractPlainSocketImpl.connect(Unknown Source)
at java.net.PlainSocketImpl.connect(Unknown Source)
at java.net.SocksSocketImpl.connect(Unknown Source)
at java.net.Socket.connect(Unknown Source)
at java.net.Socket.connect(Unknown Source)
at java.net.Socket.<init>(Unknown Source)
at java.net.Socket.<init>(Unknown Source)
at org.apache.commons.httpclient.protocol.DefaultProtocolSocketFactory.createSocket(DefaultProtocolSocketFactory.java:79)
at org.apache.commons.httpclient.protocol.DefaultProtocolSocketFactory.createSocket(DefaultProtocolSocketFactory.java:121)
at org.apache.commons.httpclient.HttpConnection.open(HttpConnection.java:706)
at org.apache.commons.httpclient.HttpMethodDirector.executeWithRetry(HttpMethodDirector.java:386)
at org.apache.commons.httpclient.HttpMethodDirector.executeMethod(HttpMethodDirector.java:170)
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:396)
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:324)
at org.RetrivePage.downloadPage(RetrivePage.java:37)
at org.RetrivePage.main(RetrivePage.java:82)

  • 写回答

2条回答 默认 最新

  • 泰 戈 尔 博客专家认证 2016-08-27 14:51
    关注
     信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
    八月 27, 2016 4:17:10 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
    信息: Retrying request
    java.net.ConnectException: Connection timed out: connect
    

    很明显是连接超时了。而且你的“爬虫”程序里面没有header信息,这对于一个爬虫程序而言很容易就会被服务器所识别而干掉。

    建议加一下header再尝试一下哈

    评论

报告相同问题?

悬赏问题

  • ¥15 运筹学排序问题中的在线排序
  • ¥15 关于docker部署flink集成hadoop的yarn,请教个问题 flink启动yarn-session.sh连不上hadoop,这个整了好几天一直不行,求帮忙看一下怎么解决
  • ¥30 求一段fortran代码用IVF编译运行的结果
  • ¥15 深度学习根据CNN网络模型,搭建BP模型并训练MNIST数据集
  • ¥15 lammps拉伸应力应变曲线分析
  • ¥15 C++ 头文件/宏冲突问题解决
  • ¥15 用comsol模拟大气湍流通过底部加热(温度不同)的腔体
  • ¥50 安卓adb backup备份子用户应用数据失败
  • ¥20 有人能用聚类分析帮我分析一下文本内容嘛
  • ¥15 请问Lammps做复合材料拉伸模拟,应力应变曲线问题