
Java: computing TF-IDF scores with Lucene


This is my program for calculating the TF-IDF values of the documents in a document collection. It works correctly, but calculating the "IDF" value (finding the number of documents that contain a given term) takes a long time.

Is there a more efficient way to find the number of documents that contain a particular term?

freq = termsFreq.getTermFrequencies();
terms = termsFreq.getTerms();

int noOfTerms = terms.length;
score = new float[noOfTerms];
DefaultSimilarity simi = new DefaultSimilarity();

for (i = 0; i < noOfTerms; i++) {
    int noofDocsContainTerm = noOfDocsContainTerm(terms[i]);
    float tf = simi.tf(freq[i]);
    float idf = simi.idf(noofDocsContainTerm, noOfDocs);
    score[i] = tf * idf;
}

If you have a term and need its document frequency, i.e. the number of documents that contain it: call IndexReader.terms(Term). It gives you a TermEnum object. Then call TermEnum.docFreq(), which gives you the document frequency of that term in the index.
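A rough sketch against the Lucene 3.x API used by the code below ("doccontent" is the field name from that code; the term and index path are placeholders):

IndexReader reader = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true);

// Direct document-frequency lookup:
int df = reader.docFreq(new Term("doccontent", "someterm"));

// Or via a TermEnum positioned at the term:
TermEnum te = reader.terms(new Term("doccontent", "someterm"));
if (te.term() != null && te.term().field().equals("doccontent")
        && te.term().text().equals("someterm")) {
    df = te.docFreq();
}
te.close();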

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/



import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;


/*
 * Date: April 14, 2012   Author: Kasun Perera   Changes: Created
 */

/*
 * Class contains methods for indexing documents with Lucene and calculating
 * TF-IDF weights.
 */
public class DocIndexer {

private String docNames[];
private String docIDS[];
private String pathToIndex;
private String pathToDocumentCollection;
private String fiboTermList[]; //marked up fibo terms
private String taxoTermList[]; // marked up taxonomy terms
private RAMDirectory ramMemDir;
private String fileNames[];
private byte files[][];
private String filesInText[];
int noOfWordsOfDOc[];
int noOfSentencesOfDoc[];
ArrayList<String> ArrLstSentencesOfDoc[];
String removedTermsOfDOc[][];
int freqAfterRemovalOfDoc[][];
//int queryDocIndex ;
private int curDocNo;
private final int maxTerms = 1000000;




/**
 * Constructor used when the index directory is a RAM directory. We need a
 * RAM directory because the Stratoes server doesn't allow access to local
 * files.
 *
 * @param pathToIndex path to the document index
 * @param pathToDocumentCollection path to the document collection
 */
public DocIndexer(String pathToIndex, String pathToDocumentCollection) {
  //  this.docNames = docNames;

    //this.bufPathToIndex= new RandomAccessBuffer() ;
  //  this.ramMemDir = new RAMDirectory();
    this.pathToIndex = pathToIndex;
    this.pathToDocumentCollection= pathToDocumentCollection;
    // this.files = files;
   // this.filesInText = docContent;

}




/**
 * Count the number of words in a given String
 *
 * @param line- Input String
 * @return - number of words in the input String
 */
private int wordCount(String line) {
    int numWords = 0;
    int index = 0;
    boolean prevWhiteSpace = true;
    while (index < line.length()) {
        char c = line.charAt(index++);
        boolean currWhiteSpace = Character.isWhitespace(c);
        if (prevWhiteSpace && !currWhiteSpace) {
            numWords++;
        }
        prevWhiteSpace = currWhiteSpace;
    }
    return numWords;
}

/*
 * Given its path, this method reads a text file into a single String.
 */
public static String fileReader(String filename) throws IOException {

    String filetext = "";   // start empty so the first line is not prefixed with "null"
    BufferedReader reader = null;
    //BufferedReader namesReader; //reader for followers
    //Extractor extractor = new Extractor();
    File inFile = new File(filename);
    //File namesFile = new File(args[1]); //get followers file 
    //File userFile = new File(args[1]);

    //READING FROM USERS FILE
    reader = new BufferedReader(new FileReader(inFile));
    String line = null;

    int numLine = 0;

    while ((line = reader.readLine()) != null) {
        // numLine++;
        filetext = filetext + " " + line;

        // System.out.println(line);
    }

    reader.close();
    return filetext;
}

/**
 * Indexes the documents using only the document content. A "docid" field is
 * stored as well, since Lucene doesn't return documents in the order they
 * were indexed.
 *
 * @throws IOException
 */
 public void indexDocs() throws IOException {
    //String pathToDocumentCollection = "F:\\karsha project\\Term Analysis\\keygraph docs\\selected_section_collection\\compelete_collection_2\\msrb_fibo_stopwords_replaced_term_docs\\";
   // String pathToIndex = "F:\\karsha project\\Term Analysis\\keygraph docs\\selected_section_collection\\compelete_collection_2\\INDEX_msrb_fibo_stopwords_replaced_term_docs";
    File folder = new File(pathToDocumentCollection);
    File[] listOfFiles = folder.listFiles();
    int noOfFiles = listOfFiles.length;
    System.out.println("Number of files : " + noOfFiles);

    IndexWriter iW;
    int indexDocCount = 0;
    try {
        NIOFSDirectory dir = new NIOFSDirectory(new File(pathToIndex));
        iW = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36)));

        for (int i = 0; i < noOfFiles; i++) {
            if (listOfFiles[i].isFile()) {
                String docName = listOfFiles[i].getName();
                System.out.println("doc name: " + docName + "length - " + listOfFiles[i].length());
                if (listOfFiles[i].length() > 1) {
                    String filesInText = fileReader(pathToDocumentCollection + docName);

                    //docIds[i] = docNames[i].substring( 0, docName.length() - 4 );
                    System.out.println("Added to index : " + docName);

                    //  StringReader strRdElt = new StringReader(filesInText[i]);
                    //filesInText = filesInText.replaceAll( "[^A-Za-z_]", " " );
                    //System.out.println( "Added to index : " + docName );
                    StringReader strRdElt = new StringReader(filesInText.replaceAll("\\d+(?:[.,]\\d+)*\\s*", ""));
                    StringReader docId = new StringReader(docName.substring(0, docName.length() - 4)); // give a unique doc Id here

                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

                    doc.add(new Field("doccontent", strRdElt, Field.TermVector.YES));
                    doc.add(new Field("docid", docId, Field.TermVector.YES));
                    iW.addDocument(doc);
                    indexDocCount++;
                }
            }
        }

        System.out.println("no of documents added to index : " + indexDocCount);

        iW.close();
        // dir.close() ;
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}



/**
 * Calculates the TF-IDF score for each term in the indexed documents.
 *
 * @param numberOfDocs number of documents in the index
 * @return map from document id to a (term -> TF-IDF score) map
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException {

    int noOfDocs = numberOfDocs;   // docNames is never populated; use the passed-in document count

    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
    //HashMap<Integer, float[]> scoreMap = new HashMap<Integer, float[]>();


    try {

        IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true) ;
       // IndexReader re = IndexReader.open(ramMemDir);

        int i = 0;
        for (int k = 0; k < numberOfDocs; k++) {
            int freq[];
            TermFreqVector termsFreq;
            TermFreqVector termsFreqDocId;
            //TermFreqVector termsFreq3[];
            HashMap<String, Float> wordMap = new HashMap<String, Float>();
            String terms[];
            float score[] = null;

            //termsFreq3=re.getTermFreqVectors(currentDocID);
            termsFreq = re.getTermFreqVector(k, "doccontent");
            termsFreqDocId = re.getTermFreqVector(k, "docid");

            int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
            freq = termsFreq.getTermFrequencies();

            terms = termsFreq.getTerms();

            int noOfTerms = terms.length;
            score = new float[noOfTerms];
            DefaultSimilarity simi = new DefaultSimilarity();
            for (i = 0; i < noOfTerms; i++) {
                int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
                // System.out.println(terms[i]+"\t"+freq[i]);
                //int noofDocsContainTerm = docsContainTerm(terms[i], "docnames");
                float tf = simi.tf(freq[i]);
                float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                wordMap.put(terms[i], (tf * idf));

            }
            scoreMap.put(aInt, wordMap);
        }

        re.close();
    } catch (IOException e) {
        // score = null;
        e.printStackTrace();
    }



    //Map<Integer,Float[]> scoreMap=new Map<Integer, Float[]>(); 


    return scoreMap;
}


public HashMap<Integer, HashMap> getTFIDF() throws IOException, CorruptIndexException, ParseException, ClassNotFoundException {
    // docNames is never filled in by the constructor, so take the document
    // count from the index itself rather than from docNames.length
    IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true);
    int noOfDocs = re.numDocs();
    re.close();

    return tfIdfScore(noOfDocs);
}

} // end of class DocIndexer
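A minimal usage sketch of the class above (both paths are placeholders; the collection path needs a trailing separator because it is concatenated directly with the file name):

// Hypothetical usage; call from a method that declares the thrown exceptions.
DocIndexer indexer = new DocIndexer("/tmp/tfidf-index", "/tmp/doc-collection/");
indexer.indexDocs();                                    // build the index
HashMap<Integer, HashMap> scores = indexer.getTFIDF();  // docId -> (term -> TF-IDF)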
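For newer Lucene releases the TermFreqVector / DefaultSimilarity APIs above no longer exist. The snippet below appears to target the Lucene 6.x APIs (ClassicSimilarity, TermStatistics, CollectionStatistics) and computes a TF-IDF score for a single term over the whole collection. The getTermFrequencyInCollection call is a helper from the same class that is not shown here.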
// Imports for the snippet (several are needed only by code not shown here,
// such as the getTermFrequencyInCollection helper):
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.util.BytesRef;

public double getTFIDFScoreInCollection(String FIELD, String word, IndexReader reader)
        throws IOException {

    IndexSearcher searcher = new IndexSearcher(reader);
    ClassicSimilarity similarity = new ClassicSimilarity();
    IndexReaderContext context = searcher.getTopReaderContext();
    CollectionStatistics collectionStats = searcher.collectionStatistics(FIELD);

    long totalDocCount = collectionStats.docCount();

    BytesRef ref = new BytesRef(word);

    // Collection-wide term frequency comes from a helper defined elsewhere in the class
    long termFreq = this.getTermFrequencyInCollection(FIELD, word);
    float tf = similarity.tf(termFreq);

    Term term = new Term(FIELD, ref);
    TermContext termContext = TermContext.build(context, term);

    TermStatistics termStats = searcher.termStatistics(term, termContext);
    long docFreq = termStats.docFreq();
    float idf = similarity.idf(docFreq, totalDocCount);

    return tf * idf;
}
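A hedged usage sketch (the index path, field name, term, and the TfIdfCalculator class name are all placeholders for whatever class holds the method above):

// Hypothetical usage; additionally needs org.apache.lucene.index.DirectoryReader,
// org.apache.lucene.store.FSDirectory and java.nio.file.Paths.
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/tfidf-index")));
double score = new TfIdfCalculator().getTFIDFScoreInCollection("doccontent", "bond", reader);
reader.close();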