
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class WebCrawler {
    public static void main(String[] args) {
        String url = "https://news.163.com/";
        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)";
        try {
            // Fetch and parse the seed page: connect(url) builds the request,
            // userAgent(...) sets the User-Agent header, and get() performs
            // the HTTP GET and parses the response into a Document.
            Document doc = Jsoup.connect(url).userAgent(userAgent).get();

            // select("a[href]") returns every <a> element that has an href attribute.
            Elements links = doc.select("a[href]");
            Set<String> allLinks = new HashSet<String>(); // explicitly specify the generic type as String
            for (Element link : links) {
                // attr("abs:href") resolves relative hrefs against the page's
                // base URL, so relative links become absolute URLs.
                allLinks.add(link.attr("abs:href"));
            }

            // Crawl the collected links concurrently on a pool of 3 worker threads.
            ExecutorService executor = Executors.newFixedThreadPool(3);
            for (final String link : allLinks) {
                executor.execute(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            Document innerDoc = Jsoup.connect(link).userAgent(userAgent).get();
                            Elements innerLinks = innerDoc.select("a[href]");
                            for (Element innerLink : innerLinks) {
                                String innerUrl = innerLink.attr("abs:href");
                                System.out.println("Inner link: " + innerUrl);
                            }
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                });
            }
            // shutdown() stops accepting new tasks; already-submitted tasks still finish.
            executor.shutdown();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
I'm teaching myself web scraping with jsoup and would like to know which methods this code calls. I've only just started with jsoup and don't know where to begin, so any pointers would be appreciated.
Thanks!
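
For reference, here is a minimal sketch that isolates the jsoup calls the crawler above relies on, without the thread pool. The URL https://example.com/ is just a placeholder, and the 5-second timeout is an arbitrary choice, not something the original code sets:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.IOException;

public class JsoupBasics {
    public static void main(String[] args) throws IOException {
        // Jsoup.connect(url) builds a Connection; userAgent(...) sets the
        // User-Agent request header; get() executes an HTTP GET and parses
        // the response into a Document (it throws IOException on failure).
        Document doc = Jsoup.connect("https://example.com/") // placeholder URL
                .userAgent("Mozilla/5.0")
                .timeout(5000) // optional: give up if no response within 5 s
                .get();

        // select(...) takes a CSS selector; "a[href]" matches every <a>
        // element that has an href attribute and returns them as Elements.
        for (Element link : doc.select("a[href]")) {
            // attr("abs:href") resolves the href against the page's base URL,
            // turning relative links like "/news" into absolute ones;
            // text() returns the element's visible link text.
            System.out.println(link.attr("abs:href") + " -> " + link.text());
        }
    }
}

Once these calls are clear, the original program is just this sequence applied twice: once to the seed page to collect links, and once per collected link inside a Runnable handed to the thread pool.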