Twitter: How do I find the number of documents (and scores) per topic using LDA?

Tags: twitter, lda, topic-modeling, mallet

I am trying to extract topics from 7 million tweets. I assume each tweet is a document, so I stored all the tweets in one file, where each line (i.e. each tweet) is treated as a document. I use this file as the input file for the MALLET API.
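For context, the CsvIterator call in main() below expects each input line to contain three fields matched by the pattern ^(\S*)[\s,]*(\S*)[\s,]*(.*)$: a name, a label, and then the tweet text as the data field. An illustrative line of cleanTweets.txt might therefore look like the following (the tokens are made up for illustration, not taken from the real file):

tweet_0001 en just landed in new york excited for the conference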

// Imports required by the two methods below (both defined in one class)
import java.io.*;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.CsvIterator;
import cc.mallet.topics.ParallelTopicModel;
import cc.mallet.topics.TopicInferencer;
import cc.mallet.types.*;

public static void LDAModel(int numofK,int numbofIteration,int numberofThread,String outputDir,InstanceList instances) throws Exception
{
   // Create a model with numofK topics, beta_w = 0.01
    //  Note that the second constructor parameter is the alpha sum over all topics
    //  (so alpha_t = 1.0 / numTopics), while the third is the beta parameter for a
    //  single dimension of the Dirichlet prior over words.
    int numTopics = numofK;
    ParallelTopicModel model = new ParallelTopicModel(numTopics, 1.0, 0.01);

    model.addInstances(instances);

    // Use numberofThread parallel samplers, which each look at a slice of the corpus
    //  and combine statistics after every iteration.
    model.setNumThreads(numberofThread);

    // Run the model for numbofIteration iterations and stop (50 is for testing only;
    //  for real applications, use 1000 to 2000 iterations)
    model.setNumIterations(numbofIteration);
    model.estimate();
    // Show the words and topics in the first instance

    // The data alphabet maps word IDs to strings
    Alphabet dataAlphabet = instances.getDataAlphabet();

    FeatureSequence tokens = (FeatureSequence) model.getData().get(0).instance.getData();
    LabelSequence topics = model.getData().get(0).topicSequence;

    Formatter out = new Formatter(new StringBuilder(), Locale.US);
    for (int position = 0; position < tokens.getLength(); position++) {
         out.format("%s-%d ", dataAlphabet.lookupObject(tokens.getIndexAtPosition(position)), topics.getIndexAtPosition(position));

    }
    System.out.println(out);

    // Estimate the topic distribution of the first instance, 
    //  given the current Gibbs state.
    double[] topicDistribution = model.getTopicProbabilities(0);

    // Get an array of sorted sets of word ID/count pairs
    ArrayList<TreeSet<IDSorter>> topicSortedWords = model.getSortedWords();

    // Show top 10 words in topics with proportions for the first document
    String topicsoutput="";
    for (int topic = 0; topic < numTopics; topic++) {
        Iterator<IDSorter> iterator = topicSortedWords.get(topic).iterator();

        out = new Formatter(new StringBuilder(), Locale.US);
        out.format("%d\t%.3f\t", topic, topicDistribution[topic]);
        int rank = 0;
        while (iterator.hasNext() && rank < 10) {
            IDSorter idCountPair = iterator.next();
            out.format("%s (%.0f) ", dataAlphabet.lookupObject(idCountPair.getID()), idCountPair.getWeight());
            //out.format("%s ", dataAlphabet.lookupObject(idCountPair.getID()));
            rank++;
        }
        System.out.println(out);
        topicsoutput += out.toString() + "\n";   // accumulate the per-topic lines so the file written below is not empty

    }


    // Create a new instance with high probability of topic 0
    StringBuilder topicZeroText = new StringBuilder();
    Iterator<IDSorter> iterator = topicSortedWords.get(0).iterator();

    int rank = 0;
    while (iterator.hasNext() && rank < 10) {
        IDSorter idCountPair = iterator.next();
        topicZeroText.append(dataAlphabet.lookupObject(idCountPair.getID()) + " ");
        rank++;
    }

    // Create a new instance named "test instance" with empty target and source fields.
    InstanceList testing = new InstanceList(instances.getPipe());
    testing.addThruPipe(new Instance(topicZeroText.toString(), null, "test instance", null));

    TopicInferencer inferencer = model.getInferencer();
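    // getSampledDistribution(instance, numIterations, thinning, burnIn):
    // sample the topic distribution for the test instance with 10 iterations,
    // keeping every sample (thinning = 1) after a burn-in of 5 iterations.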
    double[] testProbabilities = inferencer.getSampledDistribution(testing.get(0), 10, 1, 5);
    System.out.println("0\t" + testProbabilities[0]);


    File pathDir = new File(outputDir + File.separator + "NumofTopics" + numTopics);   //FIXME replace all strings with constants
    pathDir.mkdir();
    String DirPath = pathDir.getPath();
    String stateFile = DirPath+File.separator+"output_state.gz";
    String outputDocTopicsFile = DirPath+File.separator+"output_doc_topics.txt";
    String topicKeysFile = DirPath+File.separator+"output_topic_keys";
    PrintWriter writer=null;
    String topicKeysFile_fromProgram = DirPath+File.separator+"output_topic";

    try {
        writer = new PrintWriter(topicKeysFile_fromProgram, "UTF-8");
        writer.print(topicsoutput);
        writer.close();
    } catch (Exception e) {
            e.printStackTrace();
    }

    model.printTopWords(new File(topicKeysFile), 11, false);           
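    // printDocumentTopics writes one line per document with its topic proportions;
    // counting, for each topic, the documents whose largest proportion belongs to it
    // gives a per-topic document count (see the sketch after main()).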
    model.printDocumentTopics(new File (outputDocTopicsFile));
    model.printState(new File (stateFile));

}
 public static void main(String[] args) throws Exception{

    // Begin by importing documents from text to feature sequences
    ArrayList<Pipe> pipeList = new ArrayList<Pipe>();

    // Pipes: lowercase, tokenize, remove stopwords, map to features
    pipeList.add( new CharSequenceLowercase() );
    pipeList.add( new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")) );
    pipeList.add( new TokenSequenceRemoveStopwords(new File("H:\\Data\\stoplists\\en.txt"), "UTF-8", false, false, false) );
    pipeList.add( new TokenSequence2FeatureSequence() );
    InstanceList instances = new InstanceList (new SerialPipes(pipeList));

    Reader fileReader = new InputStreamReader(new FileInputStream(new File("E:\\Thesis Data\\DataForLDA\\freshnewData\\cleanTweets.txt")), "UTF-8");
    instances.addThruPipe(new CsvIterator (fileReader, Pattern.compile("^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$"),
                                           3, 2, 1)); // data, label, name fields

    int numberofTopic=5;
    int numberofIteration=50;
    int numberofThread=6;
    String outputDir="J:\\Topics\\";

    //int numberofTopic=5;
     LDAModel(numberofTopic,numberofIteration,numberofThread,outputDir,instances); 
    TimeUnit.SECONDS.sleep(30);
    numberofTopic=10;
}
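To get the number of documents (and a score) per topic, one option is to post-process the trained model directly instead of the output files. The following is only a minimal sketch of that idea: it assigns each document to its highest-probability topic via getTopicProbabilities and tallies the counts. The helper name countDocumentsPerTopic and the hard-assignment rule are assumptions for illustration, not part of MALLET's API.

public static int[] countDocumentsPerTopic(ParallelTopicModel model, int numTopics)
{
    int[] docCounts = new int[numTopics];
    for (int doc = 0; doc < model.getData().size(); doc++) {
        // Topic distribution of this document, given the current Gibbs state
        double[] dist = model.getTopicProbabilities(doc);
        int bestTopic = 0;
        for (int topic = 1; topic < numTopics; topic++) {
            if (dist[topic] > dist[bestTopic]) {
                bestTopic = topic;
            }
        }
        docCounts[bestTopic]++;   // count the document toward its dominant topic
    }
    return docCounts;
}

The count for each topic could then be reported alongside, for example, the average probability that topic receives over its assigned documents, as the "score".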