java爬虫求助,为什么报异常了,怎么解决

package org;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.PostMethod;

public class RetrivePage {
private static HttpClient httpClient = new HttpClient();
// 设置代理服务器
static {
// 设置代理服务器的IP地址和端口
httpClient.getHostConfiguration().setProxy("172.17.18.84", 8080);
}

public static boolean downloadPage(String path) throws HttpException,
        IOException {
    InputStream input = null;
    OutputStream output = null;
    // 得到post方法
    PostMethod postMethod = new PostMethod(path);
    // 设置post方法的参数

    /*NameValuePair[] postData = new NameValuePair[2]; postData[0] = new
      NameValuePair("name","lietu"); postData[1] = new
      NameValuePair("password","*****");
      postMethod.addParameters(postData);
     */
    // 执行,返回状态码
    int statusCode = httpClient.executeMethod(postMethod);
    // 针对状态码进行处理 (简单起见,只处理返回值为200的状态码)
    if (statusCode == HttpStatus.SC_OK) {
        input = postMethod.getResponseBodyAsStream();
        //得到文件名
        String filename = path.substring(path.lastIndexOf('/')+1);
        //获得文件输出流
        output = new FileOutputStream(filename);
        //输出到文件
        int tempByte = -1;
        while((tempByte=input.read())>0){
            output.write(tempByte);
        }
        //关闭输入输出流
        if(input!=null){
            input.close();
        }
        if(output!=null){
            output.close();
        }
        return true;
    }
    //若需要转向,则进行转向操作
    if ((statusCode == HttpStatus.SC_MOVED_TEMPORARILY) || (statusCode == HttpStatus.SC_MOVED_PERMANENTLY) || (statusCode == HttpStatus.SC_SEE_OTHER) || (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) {
        //读取新的URL地址
        Header header = postMethod.getResponseHeader("location");
        if(header!=null){
            String newUrl = header.getValue();
            if(newUrl==null||newUrl.equals("")){
                newUrl="/";
                //使用post转向
                PostMethod redirect = new PostMethod(newUrl);
                //发送请求,做进一步处理。。。。。
            }
        }
    }
    return false;
}

/**
 * 测试代码
 */
public static void main(String[] args) {
    // 抓取lietu首页,输出
    try {
        RetrivePage.downloadPage("http://www.lietu.com");
    } catch (HttpException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}

}

下面是异常
八月 27, 2016 4:16:28 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
八月 27, 2016 4:16:28 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request
八月 27, 2016 4:16:49 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
八月 27, 2016 4:16:49 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request
八月 27, 2016 4:17:10 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
八月 27, 2016 4:17:10 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request
java.net.ConnectException: Connection timed out: connect
at java.net.DualStackPlainSocketImpl.connect0(Native Method)
at java.net.DualStackPlainSocketImpl.socketConnect(Unknown Source)
at java.net.AbstractPlainSocketImpl.doConnect(Unknown Source)
at java.net.AbstractPlainSocketImpl.connectToAddress(Unknown Source)
at java.net.AbstractPlainSocketImpl.connect(Unknown Source)
at java.net.PlainSocketImpl.connect(Unknown Source)
at java.net.SocksSocketImpl.connect(Unknown Source)
at java.net.Socket.connect(Unknown Source)
at java.net.Socket.connect(Unknown Source)
at java.net.Socket.(Unknown Source)
at java.net.Socket.(Unknown Source)
at org.apache.commons.httpclient.protocol.DefaultProtocolSocketFactory.createSocket(DefaultProtocolSocketFactory.java:79)
at org.apache.commons.httpclient.protocol.DefaultProtocolSocketFactory.createSocket(DefaultProtocolSocketFactory.java:121)
at org.apache.commons.httpclient.HttpConnection.open(HttpConnection.java:706)
at org.apache.commons.httpclient.HttpMethodDirector.executeWithRetry(HttpMethodDirector.java:386)
at org.apache.commons.httpclient.HttpMethodDirector.executeMethod(HttpMethodDirector.java:170)
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:396)
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:324)
at org.RetrivePage.downloadPage(RetrivePage.java:37)
at org.RetrivePage.main(RetrivePage.java:82)

2个回答

 信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
八月 27, 2016 4:17:10 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request
java.net.ConnectException: Connection timed out: connect

很明显是连接超时了。而且你的“爬虫”程序里面没有header信息,这对于一个爬虫程序而言很容易就会被服务器所识别而干掉。

建议加一下header再尝试一下哈

楼上说得对可以参考,加上header信息,另外,如果对数据要求不是特别严格的话,可以设置下timeout的时间

Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问
相关内容推荐