刚开始接触爬虫,已先将网页爬虫至tizi.txt,要从中获取作者和其发表的内容,一开始用的缓存,代码如下:
import java.io.*;
public class SpiderTest01 {
public static void main(String[] args) throws IOException {
FileReader fr = null;
FileWriter fw = null;
try {
fr = new FileReader("d:/tiezi.txt");
fw = new FileWriter("d:/replace.txt");
BufferedReader bufr = new BufferedReader(fr);
BufferedWriter bufw = new BufferedWriter(fw);
String line = null;
String name = null;
String text = null;
String userNameBegin = "=utf-8\" target=\"_blank\">";
String userNameEnd = "</a>";
String userTextBegin = "class=\"d_post_content j_d_post_content clearfix\">";
String userTextEnd = "</div><br>";
while ((line = bufr.readLine()) != null) {
String userstr = null;
String textstr = null;
if (line.contains(userNameBegin) & line.contains(userNameEnd)) {
// 若一行中包含开头和结尾
name = line.substring(line.indexOf(userNameBegin) + 24,
line.indexOf(userNameEnd));
} else if (line.contains(userNameBegin)
& (line.contains(userNameEnd) == false)) {
userstr = line.substring(line.indexOf(userNameBegin));
} else if (line.contains(userNameEnd)
& (line.contains(userNameBegin) == false)) {
name = userstr
+ line.substring(0, line.indexOf(userNameEnd));
}
if(name!=null){
bufw.write(name + '#');
bufw.flush();
}
// 文本匹配
if (line.contains(userTextBegin) & line.contains(userTextEnd)) {
// 若一行中包含开头和结尾
text = line.substring(line.indexOf(userTextBegin) + 36,
line.indexOf(userTextEnd));
} else if (line.contains(userTextBegin)
& (line.contains(userTextEnd) == false)) {
textstr = line.substring(line.indexOf(userTextBegin));
} else if (line.contains(userTextEnd)
& (line.contains(userTextBegin) == false)) {
text = userstr
+ line.substring(0, line.indexOf(userTextEnd));
}
if(text!=null){
bufw.write(text);
bufw.flush();
} else {
continue;
}
}
bufr.close();
bufw.close();
} catch (IOException e) {
throw new RuntimeException("读写失败");
}
}
}
在replace文件中发现并不是想要获取的内容,在想,是不是因为readline方法的局限问题,因为标识符有可能出现在一行的结尾和下一行的开始处,故采用数组方式获取,代码如下:
import java.io.*;
public class SpiderTestDemo02 {
public static void main(String[] args) throws IOException{
FileWriter fw =null;
FileReader fr =null;
String name = null;
String text = null;
//定义作者开头及结尾标识,以及正文开始和结尾标识
String userNameBegin = "=utf-8\" target=\"_blank\">";
String userNameEnd = "</a>";
String userTextBegin = "class=\"d_post_content j_d_post_content clearfix\">";
String userTextEnd = "</div><br>";
try
{
//指定读写路径
fw = new FileWriter("d:/tizi.txt");
fr = new FileReader("d:/replace.txt");
char[] ch = new char[1024];
int len = 0;
while((len=fr.read())!=-1)//读取原文件内容至数组
{
String line = new String(ch,0,len);
if (line.contains(userNameBegin))
{
if(line.contains(userNameEnd))
{
// 若一行中包含开头和结尾
name = line.substring(line.indexOf(userNameBegin) + 24,
line.indexOf(userNameEnd));
fw.write(name+"#");
fw.flush();
}
}
if (line.contains(userTextBegin))
{
if(line.contains(userTextEnd))
{
// 若一行中包含开头和结尾
text = line.substring(line.indexOf(userTextBegin) + 36,
line.indexOf(userTextEnd));
fw.write(text);
fw.write("\r\n");
fw.flush();
}
}
if(line.contains(userNameBegin)==false&line.contains(userTextBegin)==false)
{
continue;
}
}
fr.close();
fw.close();
}catch(IOException e){
System.out.print(e);
}
}
}
结果这次运行的结果是replace文件中无任何数据,求大婶指点