我使用 HttpClient 来爬取一些固定网站的数据信息,由于使用了线程池,在抓取的时候经常碰到下面的异常:
Exception in thread "pool-226-thread-200" java.lang.IllegalStateException: Invalid use of SingleClientConnManager: connection still allocated.
Make sure to release the connection before allocating another one.
at org.apache.http.impl.conn.SingleClientConnManager.getConnection(SingleClientConnManager.java:199)
at org.apache.http.impl.conn.SingleClientConnManager$1.getConnection(SingleClientConnManager.java:173)
at org.apache.http.impl.client.DefaultRequestDirector.execute(DefaultRequestDirector.java:390)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:641)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:576)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:554)
at com.hc.HttpClientFactory.httpGet(HttpClientFactory.java:127)
at com.hc.Fetcher.fetchType(Fetcher.java:449)
at com.hc.Fetcher.fetchType(Fetcher.java:466)
at com.hc.Fetcher.fetchType(Fetcher.java:466)
at com.hc.Fetcher.httpByCode(Fetcher.java:286)
at com.hc.Fetcher.FetchHtml(Fetcher.java:72)
at com.thread.FetchHtml.fetchStart(FetchHtml.java:64)
at com.thread.FetchHtml.run(FetchHtml.java:209)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:619)
这里有我的线程池的实现:
logger.debug("启动");
// Load the crawl configuration: the ring-back-tone (song) records and the
// per-province crawl settings.
List songsList = songsService.getSongs();
List provList = provService.getProvs();
if (!songsList.isEmpty() && !provList.isEmpty()) {
    // Build the worker pool. The queue is sized to hold one task per
    // (song, province) pair; CallerRunsPolicy makes the submitting thread run
    // a task itself if the pool ever rejects it.
    ThreadPoolExecutor producerPool = new ThreadPoolExecutor(
            40, 80, 1, TimeUnit.SECONDS,
            new ArrayBlockingQueue<Runnable>(songsList.size() * provList.size()),
            new ThreadPoolExecutor.CallerRunsPolicy());
    // Submit one fetch task for every song/province pair with the same TELTYPE.
    for (Object songEntry : songsList) {
        // One ring-back-tone record.
        Map sMap = (Map) songEntry;
        for (Object provEntry : provList) {
            // Crawl settings of one province.
            Map pMap = (Map) provEntry;
            if (!sMap.get("TELTYPE").equals(pMap.get("TELTYPE"))) {
                continue;
            }
            producerPool.execute(new FetchHtml(sMap, pMap));
        }
    }
    // Stop accepting new tasks; tasks already submitted still run to completion.
    producerPool.shutdown();
}
logger.debug("结束");
这里是我httpclient的配置信息:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.net.ssl.SSLHandshakeException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.NameValuePair;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import com.anal.UtilComm;public class HttpClientFactory {
private static final String CHARSET_GBK = "GBK"; /** * 异常自动恢复处理 * 使用HttpRequestRetryHandler接口实现请求的异常恢复 */ private static HttpRequestRetryHandler requestRetryHandler = new HttpRequestRetryHandler() { // 自定义的恢复策略 public synchronized boolean retryRequest(IOException exception, int executionCount, HttpContext context) { // 设置恢复策略,在发生异常时候将自动重试3次 if (executionCount > 3) { // 超过最大次数则不需要重试 return false; } if (exception instanceof NoHttpResponseException) { // 服务停掉则重新尝试连接 return true; } if (exception instanceof SSLHandshakeException) { // SSL异常不需要重试 return false; } HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST); boolean idempotent = (request instanceof HttpEntityEnclosingRequest); if (!idempotent) { // 请求内容相同则重试 return true; } return false; } }; /** * 使用ResponseHandler接口处理响应 * HttpClient使用ResponseHandler会自动管理连接的释放 * 解决了对连接的释放管理 */ private static ResponseHandler<String> responseHandler = new ResponseHandler<String>() { // 自定义响应处理 public synchronized String handleResponse(HttpResponse response) throws ClientProtocolException, IOException { HttpEntity entity = response.getEntity(); if (entity != null) { String charset = EntityUtils.getContentCharSet(entity) == null ? 
CHARSET_GBK : EntityUtils.getContentCharSet(entity); return new String(EntityUtils.toByteArray(entity), charset); } else { return null; } } }; /** * 获取DefaultHttpClient实例 * * @param charset * 参数编码集, 可空 * @return DefaultHttpClient 对象 */ public static DefaultHttpClient getDefaultHttpClient(final String charset){ DefaultHttpClient httpclient = new DefaultHttpClient(); ArrayList headers = new ArrayList(); headers.add(new BasicHeader("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*")); headers.add(new BasicHeader("Accept-Language", "zh-cn,en-us,zh-tw,en-gb,en;")); headers.add(new BasicHeader("Accept-Charset","gbk,gb2312,utf-8,BIG5,ISO-8859-1;")); headers.add(new BasicHeader("Connection","Close")); headers.add(new BasicHeader("Cache-Control","no-cache")); headers.add(new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; CIBA)")); httpclient.getParams().setParameter("http.default-headers", headers); //设置http头信息 httpclient.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1); //模拟浏览器,解决一些服务器程序只允许浏览器访问的问题 httpclient.getParams().setParameter(CoreProtocolPNames.HTTP_CONTENT_CHARSET, charset == null ? 
HTTP.UTF_8 : charset); httpclient.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,30000); httpclient.getParams().setIntParameter(CoreConnectionPNames.SO_TIMEOUT,60000); httpclient.setHttpRequestRetryHandler(requestRetryHandler); return httpclient; } /** * get方式提交抓取网页 * * @param url * @param charset * @throws IOException * @throws ClientProtocolException * @throws IOException */ public static String httpGet(HttpClient httpClient, String url, String charset) throws ClientProtocolException, IOException { HttpGet httpget = new HttpGet(url); String content = null; // 发送请求,得到响应 HttpResponse response = httpClient.execute(httpget); HttpEntity entity = response.getEntity(); if (entity != null && HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) { charset = EntityUtils.getContentCharSet(entity) == null ? CHARSET_GBK : EntityUtils.getContentCharSet(entity); content = UtilComm.getString(entity.getContent(),charset); } abortRequest(httpget); return content; } /** * post方式提交抓取网页 * * @param url * @param charset * @throws IOException * @throws ClientProtocolException */ public static String httpPost(HttpClient httpClient, String url, String charset) throws ClientProtocolException, IOException { HttpPost httppost = new HttpPost(url); // 得到提交的POST值 List<NameValuePair> nvpsList = UtilComm.getNameValuePairs(url); httppost.setEntity(new UrlEncodedFormEntity(nvpsList, charset)); // 得到返回值 String content = null; HttpResponse response = httpClient.execute(httppost); HttpEntity entity = response.getEntity(); if (entity != null && HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) { charset = EntityUtils.getContentCharSet(entity) == null ? 
CHARSET_GBK : EntityUtils.getContentCharSet(entity); content = UtilComm.getString(entity.getContent(),charset); } abortRequest(httppost); return content; } /** * 释放HttpClient连接 * * @param hrb * 请求对象 * @param httpclient * client对象 */ public static void abortRequest(final HttpRequestBase hrb){ if (hrb != null && hrb.isAborted()) { hrb.abort(); } } public static void shutdown(final HttpClient httpclient) { if (httpclient != null) { httpclient.getConnectionManager().shutdown(); } }
}
经常碰到这类异常,帮忙看看,什么原因造成的呢?