2 hellojack888 hellojack888 于 2013.12.12 16:00 提问

跪求各位大侠帮忙解决LDA代码问题,小妹感激不尽!

我在网上找到了yangliuy提供的LDA Gibbs Sampling 的JAVA实现源码,但它是针对英文文档的。我在他的基础上,把英文文档换成已经分词的中文文档,并把停用词换成中文的,但运行时老是有问题。
LdaGibbsSampling代码如下:
package liuyang.nlp.lda.main;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;

import liuyang.nlp.lda.com.FileUtil;
import liuyang.nlp.lda.conf.ConstantConfig;
import liuyang.nlp.lda.conf.PathConfig;

/**Liu Yang's implementation of Gibbs Sampling of LDA

public class LdaGibbsSampling {

public static class modelparameters {
    float alpha = 0.5f; //usual value is 50 / K
    float beta = 0.1f;//usual value is 0.1
    int topicNum = 100;
    int iteration = 100;
    int saveStep = 10;
    int beginSaveIters = 50;
}

/**Get parameters from configuring file. If the 
 * configuring file has value in it, use the value.
 * Else the default value in program will be used
 * @param ldaparameters
 * @param parameterFile
 * @return void
 */
private static void getParametersFromFile(modelparameters ldaparameters,
        String parameterFile) {
    // TODO Auto-generated method stub
    ArrayList<String> paramLines = new ArrayList<String>();
    FileUtil.readLines(parameterFile, paramLines);
    for(String line : paramLines){
        String[] lineParts = line.split("\t");
        switch(parameters.valueOf(lineParts[0])){
        case alpha:
            ldaparameters.alpha = Float.valueOf(lineParts[1]);
            break;
        case beta:
            ldaparameters.beta = Float.valueOf(lineParts[1]);
            break;
        case topicNum:
            ldaparameters.topicNum = Integer.valueOf(lineParts[1]);
            break;
        case iteration:
            ldaparameters.iteration = Integer.valueOf(lineParts[1]);
            break;
        case saveStep:
            ldaparameters.saveStep = Integer.valueOf(lineParts[1]);
            break;
        case beginSaveIters:
            ldaparameters.beginSaveIters = Integer.valueOf(lineParts[1]);
            break;
        }
    }
}

public enum parameters{
    alpha, beta, topicNum, iteration, saveStep, beginSaveIters;
}

/**
 * @param args
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
    // TODO Auto-generated method stub
    String originalDocsPath = PathConfig.ldaDocsPath;
    String resultPath = PathConfig.LdaResultsPath;
    String parameterFile= ConstantConfig.LDAPARAMETERFILE;

    modelparameters ldaparameters = new modelparameters();
    getParametersFromFile(ldaparameters, parameterFile);
    Documents docSet = new Documents();
    docSet.readDocs(originalDocsPath);
    System.out.println("wordMap size " + docSet.termToIndexMap.size());
    FileUtil.mkdir(new File(resultPath));
    LdaModel model = new LdaModel(ldaparameters);
    System.out.println("1 Initialize the model ...");
    model.initializeModel(docSet);
    System.out.println("2 Learning and Saving the model ...");
    model.inferenceModel(docSet);
    System.out.println("3 Output the final model ...");
    model.saveIteratedModel(ldaparameters.iteration, docSet);
    System.out.println("Done!");
}

}
Documents代码如下:
package liuyang.nlp.lda.main;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import liuyang.nlp.lda.com.FileUtil;
import liuyang.nlp.lda.com.Stopwords;

/**Class for corpus which consists of M documents

public class Documents {

ArrayList<Document> docs; 
Map<String, Integer> termToIndexMap;
ArrayList<String> indexToTermMap;
Map<String,Integer> termCountMap;

public Documents(){
    docs = new ArrayList<Document>();
    termToIndexMap = new HashMap<String, Integer>();
    indexToTermMap = new ArrayList<String>();
    termCountMap = new HashMap<String, Integer>();
}

public void readDocs(String docsPath){
    for(File docFile : new File(docsPath).listFiles()){
        Document doc = new Document(docFile.getAbsolutePath(), termToIndexMap, indexToTermMap, termCountMap);
        docs.add(doc);
    }
}

public static class Document {  
    private String docName;
    int[] docWords;

    public Document(String docName, Map<String, Integer> termToIndexMap, ArrayList<String> indexToTermMap, Map<String, Integer> termCountMap){
        this.docName = docName;
        //Read file and initialize word index array
        ArrayList<String> docLines = new ArrayList<String>();
        ArrayList<String> words = new ArrayList<String>();
        FileUtil.readLines(docName, docLines);
        for(String line : docLines){
            FileUtil.tokenizeAndLowerCase(line, words);
        }
        //Remove stop words and noise words
        for(int i = 0; i < words.size(); i++){
            if(Stopwords.isStopword(words.get(i)) || isNoiseWord(words.get(i))){
                words.remove(i);
                i--;
            }
        }
        //Transfer word to index
        this.docWords = new int[words.size()];
        for(int i = 0; i < words.size(); i++){
            String word = words.get(i);
            if(!termToIndexMap.containsKey(word)){
                int newIndex = termToIndexMap.size();
                termToIndexMap.put(word, newIndex);
                indexToTermMap.add(word);
                termCountMap.put(word, new Integer(1));
                docWords[i] = newIndex;
            } else {
                docWords[i] = termToIndexMap.get(word);
                termCountMap.put(word, termCountMap.get(word) + 1);
            }
        }
        words.clear();
    }

    public boolean isNoiseWord(String string) {
        // TODO Auto-generated method stub
        string = string.toLowerCase().trim();
        Pattern MY_PATTERN = Pattern.compile(".*[a-zA-Z]+.*");
        Matcher m = MY_PATTERN.matcher(string);
        // filter @xxx and URL
        if(string.matches(".*www\\..*") || string.matches(".*\\.com.*") || 
                string.matches(".*http:.*") )
            return true;
        if (!m.matches()) {
            return true;
        } else
            return false;
    }

}

}
FileUtil代码如下:
package liuyang.nlp.lda.com;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;

public class FileUtil {

public static void readLines(String file, ArrayList<String> lines) {
    BufferedReader reader = null;

    try {

        reader = new BufferedReader(new FileReader(new File(file)));

        String line = null;
        while ((line = reader.readLine()) != null) {
            lines.add(line);
        }

    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

public static void writeLines(String file, HashMap<?, ?> hashMap) {
    BufferedWriter writer = null;
    try {
        writer = new BufferedWriter(new FileWriter(new File(file)));

        Set<?> s = hashMap.entrySet();
        Iterator<?> it = s.iterator();
        while (it.hasNext()) {
            Map.Entry m = (Map.Entry) it.next();
            writer.write(m.getKey() + "\t" + m.getValue() + "\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

public static void writeLines(String file, ArrayList<?> counts) {
    BufferedWriter writer = null;

    try {

        writer = new BufferedWriter(new FileWriter(new File(file)));

        for (int i = 0; i < counts.size(); i++) {
            writer.write(counts.get(i) + "\n");
        }

    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

public static void writeLines(String file, ArrayList<String> uniWordMap,
        ArrayList<Integer> uniWordMapCounts) {
    BufferedWriter writer = null;

    try {

        writer = new BufferedWriter(new FileWriter(new File(file)));

        for (int i = 0; i < uniWordMap.size()
                || i < uniWordMapCounts.size(); i++) {
            writer.write(uniWordMap.get(i) + "\t" + uniWordMapCounts.get(i)
                    + "\n");
        }

    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

@SuppressWarnings("unchecked")
public static void writeLinesSorted(String file, ArrayList<?> uniWordMap,
        ArrayList<?> uniWordMapCounts, int flag) {
    // flag = 0 decreasing order otherwise increasing
    HashMap map = new HashMap();
    if (uniWordMap.size() != uniWordMapCounts.size()) {
        System.err.println("Array sizes are not equal!!! Function returned.");
    } else {
        for (int i = 0; i < uniWordMap.size(); i++) {
            map.put(uniWordMap.get(i), uniWordMapCounts.get(i));
        }
        map = (HashMap<String, Integer>) ComUtil.sortByValue(map, flag);
        writeLines(file, map);
        map.clear();
    }
}

public static void tokenize(String line, ArrayList<String> tokens) {
    StringTokenizer strTok = new StringTokenizer(line);
    while (strTok.hasMoreTokens()) {
        String token = strTok.nextToken();
        tokens.add(token);
    }
}

public static void print(ArrayList<?> tokens) {
    for (int i = 0; i < tokens.size(); i++) {
        System.out.print(tokens.get(i) + " ");
    }
    System.out.print("\n");
}

// HashMap Operations
public static void printHash(HashMap<String, Integer> hashMap) {
    Set<?> s = hashMap.entrySet();
    Iterator<?> it = s.iterator();
    while (it.hasNext()) {
        Map.Entry m = (Map.Entry) it.next();
        System.out.println(m.getKey() + "\t" + m.getValue());
    }
}

public static ArrayList<String> getHashMap(HashMap<?, ?> hm) {
    ArrayList<String> a = new ArrayList<String>();
    Set<?> s = hm.entrySet();
    Iterator<?> it = s.iterator();
    while (it.hasNext()) {
        Map.Entry m = (Map.Entry) it.next();
        a.add(m.getKey() + "\t" + m.getValue());
    }
    return a;
}

public static String getKeysFromValue(HashMap<Integer, String> hm,
        String value) {
    Set<?> s = hm.entrySet();
    // Move next key and value of HashMap by iterator
    Iterator<?> it = s.iterator();
    while (it.hasNext()) {
        // key=value separator this by Map.Entry to get key and value
        Map.Entry m = (Map.Entry) it.next();
        if (m.getValue().equals(value))
            return m.getKey() + "";
    }
    System.err.println("Error, can't find the data in Hashmap!");
    return null;
}

public static void readHash(String type_map, HashMap<String, String> typeMap) {

    ArrayList<String> types = new ArrayList<String>();
    ArrayList<String> tokens = new ArrayList<String>();

    if (type_map != null) {
        readLines(type_map, types);
        for (int i = 0; i < types.size(); i++) {
            if (!types.get(i).isEmpty()) {
                FileUtil.tokenize(types.get(i), tokens);
                if (tokens.size() != 0) {
                    if (tokens.size() != 2) {
                        for (int j = 0; j < tokens.size(); j++) {
                            System.out.print(tokens.get(j) + " ");
                        }
                        System.err
                                .println(type_map
                                        + " Error ! Not two elements in one line !");
                        return;
                    }
                    if (!typeMap.containsKey(tokens.get(0)))
                        typeMap.put(tokens.get(0), tokens.get(1));
                    else {
                        System.out.println(tokens.get(0) + " "
                                + tokens.get(1));
                        System.err.println(type_map
                                + " Error ! Same type in first column !");
                        return;
                    }
                }
                tokens.clear();
            }
        }
    }
}

public static void readHash2(String type_map,
        HashMap<String, Integer> hashMap) {

    ArrayList<String> types = new ArrayList<String>();
    ArrayList<String> tokens = new ArrayList<String>();

    if (type_map != null) {
        readLines(type_map, types);
        for (int i = 0; i < types.size(); i++) {
            if (!types.get(i).isEmpty()) {
                FileUtil.tokenize(types.get(i), tokens);
                if (tokens.size() != 0) {
                    if (tokens.size() != 2) {
                        for (int j = 0; j < tokens.size(); j++) {
                            System.out.print(tokens.get(j) + " ");
                        }
                        System.err
                                .println(type_map
                                        + " Error ! Not two elements in one line !");
                        return;
                    }
                    if (!hashMap.containsKey(tokens.get(0)))
                        hashMap.put(tokens.get(0),
                                new Integer(tokens.get(1)));
                    else {
                        System.out.println(tokens.get(0) + " "
                                + tokens.get(1));
                        System.err.println(type_map
                                + " Error ! Same type in first column !");
                        return;
                    }
                }
                tokens.clear();
            }
        }
    }
}

public static void readHash3(String type_map,
        HashMap<String, Integer> hashMap) {

    ArrayList<String> types = new ArrayList<String>();
    ArrayList<String> tokens = new ArrayList<String>();

    if (type_map != null) {
        readLines(type_map, types);
        for (int i = 0; i < types.size(); i++) {
            if (!types.get(i).isEmpty()) {
                FileUtil.tokenize(types.get(i), tokens);
                if (tokens.size() != 0) {
                    if (tokens.size() < 2) {
                        for (int j = 0; j < tokens.size(); j++) {
                            System.out.print(tokens.get(j) + " ");
                        }
                        System.err
                                .println(type_map
                                        + " Error ! Not two elements in one line !");
                        return;
                    }
                    String key = tokens.get(0);
                    String value = tokens.get(tokens.size() - 1);
                    for (int no = 1; no < tokens.size() - 1; no++) {
                        key += " " + tokens.get(no);
                    }
                    if (!hashMap.containsKey(key))
                        hashMap.put(key, new Integer(value));
                    else {
                        System.out.println(key + " " + value);
                        System.err.println(type_map
                                + " Error ! Same type in first column !");
                        return;
                    }
                }
                tokens.clear();
            }
        }
    }
}

/**
 * Create a directory by calling mkdir();
 * 
 * @param dirFile
 */
public static void mkdir(File dirFile) {
    try {
        // File dirFile = new File(mkdirName);
        boolean bFile = dirFile.exists();
        if (bFile == true) {
            System.err.println("The folder exists.");
        } else {
            System.err
                    .println("The folder do not exist,now trying to create a one...");
            bFile = dirFile.mkdir();
            if (bFile == true) {
                System.out.println("Create successfully!");
            } else {
                System.err
                        .println("Disable to make the folder,please check the disk is full or not.");
            }
        }
    } catch (Exception err) {
        System.err.println("ELS - Chart : unexpected error");
        err.printStackTrace();
    }
}

public static void mkdir(File file, boolean b) {
    if(b) {// true delete first
        deleteDirectory(file);
        mkdir(file);
    } else {
        mkdir(file);
    }
}

/**
 * 
 * @param path
 * @return
 */
static public boolean deleteDirectory(File path) {
    if (path.exists()) {
        File[] files = path.listFiles();
        for (int i = 0; i < files.length; i++) {
            if (files[i].isDirectory()) {
                deleteDirectory(files[i]);
            } else {
                files[i].delete();
            }
        }
    }
    return (path.delete());
}

/**
 * List files in a given directory
 * 
 * */
static public String[] listFiles(String inputdir) {
    File dir = new File(inputdir);

    String[] children = dir.list();
    if (children == null) {
        // Either dir does not exist or is not a directory
    } else {
        for (int i = 0; i < children.length; i++) {
            // Get filename of file or directory
            String filename = children[i];
        }
    }

    return children;
}

/**
 * List files in a given directory
 * 
 * */
static public String[] listFilteredFiles(String inputdir,
        final String filterCondition) {
    File dir = new File(inputdir);

    String[] children = dir.list();
    // It is also possible to filter the list of returned files.
    // This example does not return any files that start with `.'.
    FilenameFilter filter = new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(filterCondition);
        }
    };
    children = dir.list(filter);

    return children;
}

/**
 * List files recursively in a given directory
 * 
 * */
static public void listFilesR() {
    File dir = new File("directoryName");

    String[] children = dir.list();

    // The list of files can also be retrieved as File objects
    File[] files = dir.listFiles();

    // This filter only returns directories
    FileFilter fileFilter = new FileFilter() {
        public boolean accept(File file) {
            return file.isDirectory();
        }
    };
    files = dir.listFiles(fileFilter);

}

/**
 * Frequently used functions
 * */
static public int count(String a, String contains) {
    int i = 0;
    int count = 0;
    while (a.contains(contains)) {
        i = a.indexOf(contains);
        a = a.substring(0, i)
                + a.substring(i + contains.length(), a.length());
        count++;
    }
    return count;
}

public static void print(String[] files) {

    for (int i = 0; i < files.length; i++) {
        System.out.print(files[i] + " ");
    }
    System.out.print("\n");
}

public static void print(int[] c1) {
    for (int i = 0; i < c1.length; i++) {
        System.out.print(c1[i] + " ");
    }
    System.out.println();
}

public static void test() {
    String a = "fdsfdsaf";
    a += "\nfdsaf fd fd";
    a += "\nfd sf fd fd\n";
    System.out.println(a);
    a = a.replaceAll("\n+", " ");
    System.out.println(a);
    System.exit(0);
}

public static void readHash(String type_map, HashMap<String, String> typeMap, boolean flag) {

    ArrayList<String> types = new ArrayList<String>();
    ArrayList<String> tokens = new ArrayList<String>();

    if(type_map != null) {
        readLines(type_map, types);
        for (int i = 0; i < types.size(); i++) {
            if (!types.get(i).isEmpty()) {
                FileUtil.tokenize(types.get(i), tokens);
                if(tokens.size() != 0) {
                    if (tokens.size() != 2) {
                        for(int j = 0; j < tokens.size(); j++) {
                            System.out.print(tokens.get(j)+" ");
                        }
                        System.err
                                .println(type_map + " Error ! Not two elements in one line !");
                        return;
                    }
                    String tokens0 = "";
                    String tokens1 = "";
                    if(flag) {
                        tokens0 = tokens.get(0).trim();
                        tokens1 = tokens.get(1).trim();
                    } else {
                        tokens0 = tokens.get(1).trim();
                        tokens1 = tokens.get(0).trim();
                    }
                    if (!typeMap.containsKey(tokens0))
                        typeMap.put(tokens0, tokens1);
                    else {
                        System.err.println(tokens0 + " " + tokens1);
                        System.err
                                .println(type_map + " Ignore this one ! Same type in first column !");
                    }
                }
                tokens.clear();
            }
        }
    }
}

public static String filter4tokenization(String inputstring) {

// inputstring = "fds fds Won't won't can't Can't ain't";
// aggregate common tokenization error
inputstring = inputstring.replaceAll("(?i)won't", "will not");
inputstring = inputstring.replaceAll("(?i)can't", "can not");
inputstring = inputstring.replaceAll("(?i)shan't", "shall not");
inputstring = inputstring.replaceAll("(?i)ain't", "am not");
return inputstring;
}

public static void tokenizeAndLowerCase(String line, ArrayList<String> tokens) {
    // TODO Auto-generated method stub
    StringTokenizer strTok = new StringTokenizer(line);
    while (strTok.hasMoreTokens()) {
        String token = strTok.nextToken();
        tokens.add(token.toLowerCase().trim());
    }
}

}

运行结果显示:The folder exists.
wordMap size 0
1 Initialize the model ...
2 Learning and Saving the model ...
Iteration 0
Iteration 1
Iteration 2
Saving model at iteration 80 ...
Exception in thread "main" java.lang.IndexOutOfBoundsException: Index: 0, Size: 0
at java.util.ArrayList.rangeCheck(ArrayList.java:604)
at java.util.ArrayList.get(ArrayList.java:382)
at liuyang.nlp.lda.main.LdaModel.saveIteratedModel(LdaModel.java:228)
at liuyang.nlp.lda.main.LdaModel.inferenceModel(LdaModel.java:102)
at liuyang.nlp.lda.main.LdaGibbsSampling.main(LdaGibbsSampling.java:89)。
请问是什么原因呢?

2个回答

u013156531
u013156531   2013.12.14 16:50

重新编辑下,或许软件问题 符号等

ll861227
ll861227   2014.05.06 09:26

您好,我最近在研究LDA,现在只能对英文进行建模,想请问您现在对中文的处理进行的怎么样了

Csdn user default icon
上传中...
上传图片
插入图片
准确详细的回答,更有利于被提问者采纳,从而获得C币。复制、灌水、广告等回答会被删除,是时候展现真正的技术了!