sinat_36387539
渲杭爸
采纳率0%
2016-12-15 07:11

使用 ICTCLAS(Java)对指定目录下的 .txt 文件进行分词预处理,同时去除停用词并统计词频

之前我也在网上查过一些资料,自己也写了一些代码,但是不知道是哪里出错了,一直没有理想的效果,请大神帮我看看,代码如下:

package com.hygenomics.tqa.util;

public class WordSeg {
private static String testDataFolder = System.getProperty("user.dir")+"\testData";
private static String nlpirLib = System.getProperty("user.dir")+"\ictclas\WIN64\NLPIR";
private static String nlpirDataFolder = System.getProperty("user.dir")+"\ictclas";
private static String stopWordTableFile = System.getProperty("user.dir")+"\dic\StopWordTable.txt";
private static String keyWordTableFile = System.getProperty("user.dir")+"\dic\国务院公文主题词表.txt";
private static String segSrcFolder = System.getProperty("user.dir")+"\segSrc\ ";
private static String segDestFolder = System.getProperty("user.dir")+"\segDest\";

interface CLibrary extends Library {

    WordSeg.CLibrary Instance = (WordSeg.CLibrary) Native.loadLibrary("D:\program\CWordSeg[maven]\file\win64\NLPIR", WordSeg.CLibrary.class);
    public int NLPIR_Init(String sDataPath,int encoding,String sLicenceCode);
    //获取字符串分词
    public String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);
    // 对文本进行分词:读入文本,输出文本,是否标注词性(0为不标注,1为标注)
    public boolean NLPIR_FileProcess(String txt_input, String txt_output, int i);
    // 添加用户词汇
    public int NLPIR_AddUserWord(String sWord);
    // 删除用户词汇
    public int NLPIR_DelUsrWord(String sWord);
    // 保存用户词汇到用户词典
    public int NLPIR_SaveTheUsrDic();
    // 导入用户自定义词典:自定义词典路径,bOverwrite=true表示替代当前的自定义词典,false表示添加到当前自定义词典后
    public int NLPIR_ImportUserDict(String sFilename, boolean bOverwrite);
    //获取关键字
    public String NLPIR_GetKeyWords(String sLine,int nMaxKeyLimit,boolean bWeightOut);
    // 词频统计功能,sText为字符串文本
    public String NLPIR_WordFreqStat(String sText);
    public String NLPIR_GetLastErrorMsg();
    public void NLPIR_Exit();
}
// Stop words, one entry per line of the stop-word file; empty until loadStop() runs.
private static String[] stopWords=new String[0];

/**
 * Loads the stop-word list into {@code stopWords} so it is read only once.
 *
 * <p>The file is read as UTF-8, one stop word per line. The array is sized to the
 * file content, so a list longer than the previously hard-coded 769 entries no longer
 * overflows, and the reader is closed even when reading fails.
 *
 * <p>NOTE(review): the path is still the hard-coded absolute location from the
 * original code rather than {@code stopWordTableFile} - confirm which one is wanted.
 *
 * @throws IOException if the file cannot be opened or read
 */
private static void loadStop() throws IOException {
    java.util.List<String> loaded = new java.util.ArrayList<String>();
    try (BufferedReader fr = new BufferedReader(new InputStreamReader(new FileInputStream("E:\\自己工作用\\中文算法\\哈工大停用词表\\StopWordTable.txt"),"utf-8"))) {
        String word;
        while ((word = fr.readLine()) != null) {
            loaded.add(word);
        }
    }
    stopWords = loaded.toArray(new String[loaded.size()]);
}
/**
 * Strips the part-of-speech tag from every {@code word/tag} token of a line.
 *
 * <p>Tokens are separated by single spaces. For each token the text before the last
 * "/" is kept; tokens without a "/" (or starting with it) and tokens whose word part
 * is blank are dropped. Despite the original comment, no stop-word filtering happens
 * here.
 *
 * <p>NOTE(review): the original also compared the trimmed word against two
 * single-space literals, which can never match after {@code trim()}; one of them may
 * have been a full-width space before the code was pasted - confirm.
 *
 * @param str a segmented, tagged line
 * @return the bare words, each followed by one space ("" when nothing is kept)
 */
private static String removeAll(String str){
    // StringBuilder avoids re-copying the whole result for every appended word.
    StringBuilder plain = new StringBuilder();
    for (String token : str.split(" ")) {
        int slash = token.lastIndexOf("/");
        if (slash <= 0) {
            continue; // no tag separator, or nothing in front of it
        }
        String word = token.substring(0, slash).trim();
        if (!word.isEmpty()) {
            plain.append(word).append(' ');
        }
    }
    return plain.toString();
}
/**
 * Removes punctuation tokens from a segmented, tagged line.
 *
 * <p>Tokens are separated by single spaces and have the form {@code word/tag}. A
 * token is kept, tag included, unless its tag starts with "w" (punctuation). Tokens
 * without a tag separator or with an empty tag are dropped.
 *
 * @param str a segmented, tagged line
 * @return the kept tokens, each followed by one space ("" when nothing is kept)
 */
private static String removeW(String str){
    StringBuilder kept = new StringBuilder();
    for (String token : str.split(" ")) {
        int slash = token.lastIndexOf("/");
        // A token ending in "/" has no tag character to read; taking
        // substring(slash + 1, slash + 2) on it used to throw
        // StringIndexOutOfBoundsException, so it is skipped up front.
        if (slash <= 0 || slash + 1 >= token.length()) {
            continue;
        }
        if (token.charAt(slash + 1) != 'w') {
            kept.append(token).append(' ');
        }
    }
    return kept.toString();
}
/**
 * Removes stop words and function-word categories from a segmented, tagged line,
 * keeping the part-of-speech tags on the tokens that remain.
 *
 * <p>Tokens are separated by single spaces and have the form {@code word/tag}. A
 * token is dropped when it has no tag, when the first character of its tag is one of
 * the filtered categories below, or when its word is listed in {@code stopWords}.
 *
 * @param str a segmented, tagged line
 * @return the kept tokens, each followed by one space ("" when nothing is kept)
 */
private static String removeStop(String str){
    // First tag character of the categories to drop: t time word, s place word,
    // z status word, d adverb, p preposition, c conjunction, u auxiliary,
    // e interjection, y modal particle, o onomatopoeia, h prefix, k suffix,
    // m numeral, x English and other strings, q measure word.
    final String droppedTags = "tszdpcueyohkmxq";
    // Hash lookup instead of scanning the whole stop-word array for every token.
    java.util.Set<String> stopSet = new java.util.HashSet<String>(java.util.Arrays.asList(stopWords));
    StringBuilder kept = new StringBuilder();
    for (String token : str.split(" ")) {
        int slash = token.lastIndexOf("/");
        // A token ending in "/" has no tag character; reading it with
        // substring(slash + 1, slash + 2) used to throw, so skip such tokens.
        if (slash <= 0 || slash + 1 >= token.length()) {
            continue;
        }
        String word = token.substring(0, slash); // text without the tag
        char tag = token.charAt(slash + 1);      // first character of the tag
        if (droppedTags.indexOf(tag) >= 0 || stopSet.contains(word)) {
            continue;
        }
        kept.append(token).append(' ');
    }
    return kept.toString();
}
/**
 * Segments every file found under {@code inDirectory} (sub-directories included) with
 * NLPIR, with part-of-speech tagging on, and writes each result into
 * {@code OutDirectory} under the input file's own name.
 *
 * <p>The engine is initialised once, the user dictionary is imported, all files are
 * processed, the user dictionary is saved, and the engine is shut down. Failures are
 * reported on the console; nothing is thrown to the caller.
 *
 * @param inDirectory  directory holding the files to segment
 * @param OutDirectory directory receiving the segmented files; created when missing
 */
public static void testICTCLAS_FileProcess(String inDirectory,String OutDirectory)
{
    boolean initialized = false;
    try
    {
        // Encoding code handed to NLPIR_Init.
        // NOTE(review): presumably 1 means UTF-8 in NLPIR's constants - confirm.
        int charset_type = 1;
        // All calls go through the CLibrary binding declared in this class; the
        // previous mix of "CWordSeg.CLibrary" and "CLibrary" named a class that
        // is not defined here.
        if (0 == CLibrary.Instance.NLPIR_Init(nlpirDataFolder, charset_type, "0")) {
            System.err.println("初始化失败!原因:"+CLibrary.Instance.NLPIR_GetLastErrorMsg());
            return; // nothing can be segmented without an initialised engine
        }
        initialized = true;
        int nCount = CLibrary.Instance.NLPIR_ImportUserDict("E:\\自己工作用\\国务院公文主题词表txt\\国务院公文主题词表.txt",true);
        System.out.println(nCount+"个自定义词…………");
        File dirOut = new File(OutDirectory);
        // Create the output directory when it is MISSING (the check was inverted).
        if(!dirOut.exists()){
            dirOut.mkdirs();
        }
        segmentDirectory(new File(inDirectory), dirOut);
        // Save the user dictionary.
        CLibrary.Instance.NLPIR_SaveTheUsrDic();
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    finally
    {
        // Shut the engine down exactly once, also when processing failed.
        if (initialized) {
            CLibrary.Instance.NLPIR_Exit();
        }
    }
}

// Segments the files under dirIn into dirOut with the already initialised engine,
// descending into sub-directories without re-initialising or shutting it down.
private static void segmentDirectory(File dirIn, File dirOut)
{
    File[] entries = dirIn.listFiles();
    if (entries == null) {
        // listFiles() returns null when dirIn is not a directory or cannot be read.
        System.err.println("Cannot list directory: " + dirIn.getPath());
        return;
    }
    for (int i = 0; i < entries.length; i++) {
        if (entries[i].isDirectory()){
            segmentDirectory(entries[i], dirOut);
            continue; // a directory itself is not a file to segment
        }
        String Inputfilename = entries[i].getPath();
        // Output file name; joined through File so a missing trailing separator
        // on the output directory no longer produces a mangled path.
        String Outputfilename = new File(dirOut, entries[i].getName()).getPath();
        // Arguments: input file, output file, 1 = tag parts of speech.
        if (!CLibrary.Instance.NLPIR_FileProcess(Inputfilename, Outputfilename, 1)) {
            System.out.println(entries[i].getPath()+"分词失败");
        } else {
            System.out.println(entries[i].getPath() + "分词成功,这是第" + i + "个文档");
        }
    }
}
/**
 * Counts how often each word occurs in a space-separated string.
 *
 * <p>Empty tokens (produced by consecutive spaces) are ignored. The number of raw
 * tokens is printed to standard output, as the original implementation did.
 *
 * @param str words separated by single spaces
 * @return a map from each distinct word to its number of occurrences
 */
private static HashMap<String, Integer> termUnitFreq(String str){
    String[] words = str.split(" ");
    System.out.println(words.length);
    // Count directly in the map instead of scanning parallel arrays for every
    // word, which was quadratic in the number of tokens.
    HashMap<String, Integer> wordMap = new HashMap<String, Integer>();
    for (String word : words) {
        if (word.isEmpty()) {
            continue;
        }
        Integer seen = wordMap.get(word);
        wordMap.put(word, seen == null ? 1 : seen + 1);
    }
    return wordMap;
}
public static void computeTermFrequency(String InDirectory,String OutDirectory) throws IOException{
    loadStop();
    BufferedWriter bw = null;
    File dirIn= new File(InDirectory);
    File fileIn[] = dirIn.listFiles();
    for (File aFileIn : fileIn) {
        bw = new BufferedWriter(new FileWriter(new File(OutDirectory + aFileIn.getName())));//文件名称
        String str = "";
        BufferedReader reader = new BufferedReader(new FileReader(InDirectory + aFileIn.getName()));//读取页数大于1的文件内容
        String line;
        line = reader.readLine();
        while (line != null) {
            line = removeW(line);
            line = removeStop(line);
            String temp = removeAll(line);
            str = str + " " + temp;
            line = reader.readLine();
        }
        reader.close();//

}

  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享
  • 邀请回答

1条回答

相关推荐