我想获取网页的编码,在网上查到可以用chardet.jar中的方法实现,于是下载了它并导入我的工程中,然后用如下代码进行测试:
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.HTMLCodepageDetector;
import info.monitorenter.cpdetector.io.JChardetFacade;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class WebEncoding {
/**
 * Shared detector proxy for this class. It is obtained from
 * {@code CodepageDetectorProxy.getInstance()} and has two delegate detectors
 * registered on it while the class is being initialized.
 */
private static CodepageDetectorProxy detector;

static {
    // Obtain the proxy, then register the delegates: the HTML detector first,
    // followed by the JChardet facade.
    // NOTE(review): the meaning of the 'false' constructor argument and the way the
    // proxy consults its delegates are defined by the cpdetector library - confirm there.
    detector = CodepageDetectorProxy.getInstance();
    detector.add(new HTMLCodepageDetector(false));
    detector.add(JChardetFacade.getInstance());
}
/**
 * Manual test driver: asks {@link #getCharset(String)} for the charset of a fixed
 * URL and prints the result to standard output.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final String targetUrl = "http://www.baidu.com/";
    final WebEncoding probe = new WebEncoding();
    // Printing the separator cannot raise IOException, so it sits outside the try block.
    System.out.println("*******************");
    try {
        final String charset = probe.getCharset(targetUrl);
        System.out.println(charset);
    } catch (IOException ex) {
        // Best-effort driver: dump the failure and let main return normally.
        ex.printStackTrace();
    }
}
/**
- @param strurl
- 页面url地址,需要以 http://开始,例:http://www.pujia.com
- @return
- @throws IOException
*/
public String getCharset(String strurl) throws IOException {
// 定义URL对象
URL url = new URL(strurl);
// 获取http连接对象
HttpURLConnection urlConnection = (HttpURLConnection) url
.openConnection();
;
urlConnection.connect();
// 网页编码
String strencoding = null;
/**
- 首先根据header信息,判断页面编码
*/
// map存放的是header信息(url页面的头信息)
Map> map = urlConnection.getHeaderFields();
Set keys = map.keySet();
Iterator iterator = keys.iterator();
// 遍历,查找字符编码
String key = null;
String tmp = null;
while (iterator.hasNext()) {
key = iterator.next();
tmp = map.get(key).toString().toLowerCase();
// 获取content-type charset
if (key != null && key.equals("Content-Type")) {
int m = tmp.indexOf("charset=");
if (m != -1) {
strencoding = tmp.substring(m + 8).replace("]", "");
return strencoding;
}
}
}
/**
- 通过解析meta得到网页编码
*/
// 获取网页源码(英文字符和数字不会乱码,所以可以得到正确区域)
StringBuffer sb = new StringBuffer();
String line;
try {
BufferedReader in = new BufferedReader(new InputStreamReader(url
.openStream()));
while ((line = in.readLine()) != null) {
sb.append(line);
}
in.close();
} catch (Exception e) { // Report any errors that arise
System.err.println(e);
System.err
.println("Usage: java HttpClient []");
}
String htmlcode = sb.toString();
// 解析html源码,取出区域,并取出charset
String strbegin = "