我用 Java 下载 Yahoo 上的日文搜索结果网页。我已经把编码设置为 UTF-8,但是下载到的网页字符串和通过浏览器得到的网页不同:所有的日文字符都变成了空格。不知道哪里有问题?
下面是我使用的下载网页的代码:
[code="java"]
/**
 * Demo entry point: downloads one Yahoo search result page (the query string carries
 * {@code vl=lang_ja}) as UTF-8 and prints first the charset name, then the page text.
 *
 * @param args unused
 */
public static void main(String[] args) throws UnsupportedEncodingException {
    final String charsetName = "UTF-8";
    final int timeoutMillis = 30 * 1000;
    final String searchUrl = "http://search.yahoo.com/search?ei=UTF-8&&fr=yfp-t-501&fp_ip=CN&vm=p&b=1&n=10&va_vt=any&vo_vt=any&ve_vt=any&vp_vt=any&vd=m3&vf=pdf&fl=1&vl=lang_ja&vs=&p=123+";

    System.out.println(charsetName);

    // The two trailing nulls mean: send no Cookie header, connect without a proxy.
    final String pageText = getHtmlText(searchUrl, timeoutMillis, charsetName, null, null);
    System.out.println(pageText);
}
/**
 * Downloads the document at {@code strUrl} over HTTP and returns its text.
 *
 * <p>The response body is decoded with {@code strEnCoding}. When that is {@code null}, the
 * literal string {@code "null"}, or a charset name the JVM does not support, the platform
 * default charset is used instead. Every line read is re-terminated with {@code "\r\n"}, and
 * the assembled text is passed through {@code fromNCR} (defined elsewhere in this class;
 * presumably it decodes numeric character references - confirm) before being returned.
 *
 * <p>Failures are reported on standard output / standard error and signalled to the caller by
 * a {@code null} return; no exception is propagated for ordinary I/O problems.
 *
 * @param strUrl      address to fetch; {@code null} or empty yields {@code null}
 * @param timeout     connect timeout and read timeout, in milliseconds
 * @param strEnCoding charset name used to decode the body (see above for the fallback rules)
 * @param cookies     value for the {@code Cookie} request header, or {@code null} to send none
 * @param proxy       proxy to connect through, or {@code null} for a direct connection
 * @return the downloaded text, or {@code null} if the URL is empty or the download failed
 */
public static String getHtmlText(String strUrl, int timeout, String strEnCoding, String cookies, Proxy proxy) {
    if (strUrl == null || strUrl.length() == 0) {
        return null;
    }
    // Local, single-threaded buffer: StringBuilder avoids StringBuffer's needless locking.
    StringBuilder html = null;
    HttpURLConnection httpConnection = null;
    boolean isError = false;
    try {
        // Connect and request the page source.
        URL url = new URL(strUrl);
        if (proxy != null) {
            httpConnection = (HttpURLConnection) url.openConnection(proxy);
        } else {
            httpConnection = (HttpURLConnection) url.openConnection();
        }
        httpConnection.addRequestProperty("User-Agent", "IcewolfHttp/1.0");
        // NOTE(review): this Accept value separates media types with ';' where HTTP expects ',',
        // and Accept-Language is sent empty. Both are kept byte-identical because servers may
        // answer differently if they change - confirm before correcting.
        httpConnection.addRequestProperty("Accept",
                "www/source; text/html; image/gif; */*");
        httpConnection.addRequestProperty("Accept-Language", "");
        if (cookies != null) {
            httpConnection.setRequestProperty("Cookie", cookies);
        }
        httpConnection.setConnectTimeout(timeout);
        httpConnection.setReadTimeout(timeout);

        // try-with-resources closes the reader and the raw stream on every path, even when
        // one close() throws, and never overrides the method's outcome the way the former
        // "return null" inside finally did.
        try (InputStream urlStream = httpConnection.getInputStream();
             BufferedReader br = new BufferedReader(newHtmlReader(urlStream, strEnCoding))) {
            html = new StringBuilder("");
            String line;
            while ((line = br.readLine()) != null) {
                html.append(line).append("\r\n");
            }
        }
    } catch (java.lang.OutOfMemoryError out) {
        // The buffer may not exist yet if memory ran out before it was allocated.
        System.out.println("内存占用:" + (html == null ? 0 : html.capacity()));
        out.printStackTrace();
        // Best effort, as before: whatever was read so far is still returned below.
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println(e.getClass() + "下载网页" + strUrl + "失败");
        isError = true;
    } finally {
        // Disconnect only after the streams have been closed by the inner try.
        if (httpConnection != null) {
            httpConnection.disconnect();
        }
    }
    if (html == null || isError) {
        return null;
    }
    return fromNCR(html.toString());
}

/**
 * Wraps {@code in} in a character reader for the requested charset, falling back to the
 * platform default charset when no usable charset name is supplied.
 */
private static Reader newHtmlReader(InputStream in, String strEnCoding) {
    if (strEnCoding == null || "null".equals(strEnCoding)) {
        return new InputStreamReader(in);
    }
    try {
        return new InputStreamReader(in, strEnCoding);
    } catch (UnsupportedEncodingException e) {
        // Unknown charset name: decode with the platform default rather than fail the download.
        return new InputStreamReader(in);
    }
}
[/code]