我用软件自动下载网页源码,用于网站监控。发现当页面字数超过三万字时,整个文本中的汉字都会变成乱码,无法分析,拿去转换编码也无法纠正。以下是代码:
con = (HttpURLConnection) url.openConnection();
con.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");// IE代理进行下载
con.setConnectTimeout(120000);
con.setReadTimeout(120000);
// 获得网页返回信息码
responseCode = con.getResponseCode();
if (responseCode == -1) {
System.out.println(url.toString() + " : connection is failure...");
con.disconnect();
return null;
}
if (responseCode >= 400) //请求失败
{
System.out.println("请求失败:get response code: " + responseCode);
con.disconnect();
return null;
}
InputStream inStr = con.getInputStream();
InputStreamReader istreamReader = new InputStreamReader(inStr, encode);
BufferedReader buffStr = new BufferedReader(istreamReader);
String str = null;
while ((str = buffStr.readLine()) != null)
contentBuffer.append(str);
inStr.close();
} catch (IOException e) {
e.printStackTrace();
contentBuffer = null;
System.out.println("error: " + url.toString());
} finally {
con.disconnect();
}