import java.io.*;
import java.util.regex.*;
/*
代码功能为实现将网页中的用户名及其发表内容摘取出来
*/
/**
 * Extracts user names and post contents from the saved HTML source of a web page.
 *
 * <p>Every input line is scanned twice: first for user names, each written followed by
 * {@code '#'}, then for post contents, each written followed by a line separator. All user
 * names found on a line are therefore written before any post content found on that line.
 *
 * <p>Usage: {@code java fileReader [inputPath [outputPath]]}. Missing arguments fall back to
 * {@code d:/tiezi.txt} and {@code d:/repalce}.
 */
public class fileReader {

    /** Default input file: the page source to process. */
    private static final String DEFAULT_INPUT = "d:/tiezi.txt";

    /** Default output file that receives the extracted text. */
    private static final String DEFAULT_OUTPUT = "d:/repalce";

    /** The user name sits between this marker and {@link #USERNAME_END}. */
    private static final String USERNAME_START = "<img username=\"";

    /** Marker that terminates a user name. */
    private static final String USERNAME_END = "\" class=\"\" src=\"http";

    /** The post content sits between this marker and {@link #CONTENT_END}. */
    private static final String CONTENT_START =
            "class=\"d_post_content j_d_post_content clearfix\">";

    // NOTE(review): the original end-of-content marker was lost when this file was pasted
    // (the string literal was left holding a raw line break). "</div>" is assumed because
    // CONTENT_START looks like the tail of an opening <div> tag -- confirm against the page.
    private static final String CONTENT_END = "</div>";

    /**
     * Reads the input file line by line and writes the extracted values to the output file.
     *
     * @param args optional: {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        // try-with-resources closes (and thereby flushes) both streams even when
        // reading or writing fails part-way through.
        // The input is decoded with the platform default charset, as FileReader does;
        // the output is always written as UTF-8.
        try (BufferedReader br = new BufferedReader(new FileReader(inputPath));
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(outputPath), "utf-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                writeMatches(bw, line, USERNAME_START, USERNAME_END, "#");
                writeMatches(bw, line, CONTENT_START, CONTENT_END, System.lineSeparator());
            }
        }
    }

    /**
     * Writes every value enclosed by {@code startMarker} and {@code endMarker} in
     * {@code line}, each followed by {@code terminator}. The markers are not written.
     *
     * @param out         destination writer
     * @param line        the text to scan
     * @param startMarker text that immediately precedes a value
     * @param endMarker   text that immediately follows a value
     * @param terminator  text appended after each extracted value
     * @throws IOException if writing to {@code out} fails
     */
    private static void writeMatches(BufferedWriter out, String line, String startMarker,
            String endMarker, String terminator) throws IOException {
        // The cursor moves past each match. Re-testing the unchanged line (as the previous
        // version did) never terminates once a marker is present and fills the disk.
        int searchFrom = 0;
        while (true) {
            int markerAt = line.indexOf(startMarker, searchFrom);
            if (markerAt < 0) {
                return;
            }
            int valueStart = markerAt + startMarker.length();
            // Look for the end marker only after the start marker, so the range is valid.
            int valueEnd = line.indexOf(endMarker, valueStart);
            if (valueEnd < 0) {
                // Value is not terminated on this line: skip it rather than throw.
                return;
            }
            out.write(line, valueStart, valueEnd - valueStart);
            out.write(terminator);
            searchFrom = valueEnd + endMarker.length();
        }
    }
}