package com;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
- 网页数据抓取工具类 * */
public class Damo {
/**
* 获取网页源代码
* @throws IOException
* @throws IOException
*
*/
public static String getHtmlResouceByUrl(String url,String encoding) {
URL urlObj=null;
URLConnection uc=null;
InputStreamReader isr=null;
BufferedReader reader=null;
StringBuffer buffer=null;
try {
//建立网络连接
urlObj=new URL(url);
//打开网络连接
try {
uc=urlObj.openConnection();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
//建立文件输入流
try {
isr=new InputStreamReader(uc.getInputStream(),encoding);
} catch (UnsupportedEncodingException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
//建立缓存写入
reader = new BufferedReader(isr);
//临时
String temp=null;
try {
while ((temp=reader.readLine())!=null) {
// buffer.append(temp+"\n");
System.out.println(temp+"\n");
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally{
if(isr!=null){
//关闭流
try {
isr.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
return buffer.toString();
}
/*
-
爬取互动值
*/
public static List> getInFo(String url,String encoding){
//拿到源代码
String html = getHtmlResouceByUrl(url, encoding);
//解析代码文件
Document document = Jsoup.parse(html);
//获取代码的具体内容
Element element = document.getElementById("1");
//获取结果
Elements elements = document.getElementsByClass("f_card_lricon");List<HashMap<String, String>> maps =new ArrayList<HashMap<String,String>>(); HashMap<String, String> map =null; for (Element el : elements) { map = new HashMap<String, String>(); //获取评论数 String comment=el.getElementsByClass("comment").text(); //获取标题 String comment1=el.getElementsByClass("f_card_h4").text(); map.put(comment, "f_card_lricon"); map.put(comment1,"f_card_h4"); maps.add(map); } return maps;
}
public static void main(String[] args) {
// String webString=getHtmlResouceByUrl("http://news.baidu.com/","utf-8");
// String webString=getHtmlResouceByUrl("http://www.163.com/","gbk");
// System.out.println(webString);
String urlString = "http://k.sina.cn/article_1707602817_65c7f381001002v4u.html?kfrome=auto&local=&subch=3&vt=4";
List> list =getInFo(urlString, "utf-8");
System.out.println(list);}
}