
Java: computing TF-IDF scores with Lucene


This is my program for calculating the TF-IDF values of the documents in a document collection. It works correctly, but calculating the "IDF" value (finding the number of documents that contain a given term) takes a long time.

Is there a more efficient way to find the number of documents that contain a particular term?

freq = termsFreq.getTermFrequencies();
terms = termsFreq.getTerms();

int noOfTerms = terms.length;
score = new float[noOfTerms];
DefaultSimilarity simi = new DefaultSimilarity();

for (i = 0; i < noOfTerms; i++) {
    int noofDocsContainTerm = noOfDocsContainTerm(terms[i]);
    float tf = simi.tf(freq[i]);
    float idf = simi.idf(noofDocsContainTerm, noOfDocs);
    score[i] = tf * idf;
}

If you have a term and need its document frequency, i.e. the number of documents that contain it: call IndexReader.terms(Term). It gives you a TermEnum object. Then call TermEnum.docFreq(), which gives you the document frequency of that term in the index.
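A rough sketch against the Lucene 3.x API used by the code below ("doccontent" is the field name from that code; the term and index path are placeholders):

IndexReader reader = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true);

// Direct document-frequency lookup:
int df = reader.docFreq(new Term("doccontent", "someterm"));

// Or via a TermEnum positioned at the term:
TermEnum te = reader.terms(new Term("doccontent", "someterm"));
if (te.term() != null && te.term().field().equals("doccontent")
        && te.term().text().equals("someterm")) {
    df = te.docFreq();
}
te.close();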

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/



import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;


/*
 * Date: April 14, 2012   Author: Kasun Perera   Changes: Created
 */

/*
 * Class contains methods for indexing documents with Lucene and calculating
 * TF-IDF weights.
 */
public class DocIndexer {

private String docNames[];
private String docIDS[];
private String pathToIndex;
private String pathToDocumentCollection;
private String fiboTermList[]; //marked up fibo terms
private String taxoTermList[]; // marked up taxonomy terms
private RAMDirectory ramMemDir;
private String fileNames[];
private byte files[][];
private String filesInText[];
int noOfWordsOfDOc[];
int noOfSentencesOfDoc[];
ArrayList<String> ArrLstSentencesOfDoc[];
String removedTermsOfDOc[][];
int freqAfterRemovalOfDoc[][];
//int queryDocIndex ;
private int curDocNo;
private final int maxTerms = 1000000;




/**
 * Constructor used when the index directory is a RAM directory. We need a
 * RAM directory because the Stratoes server doesn't allow access to local
 * files.
 *
 * @param pathToIndex path to the document index
 * @param pathToDocumentCollection path to the document collection
 */
public DocIndexer(String pathToIndex, String pathToDocumentCollection) {
  //  this.docNames = docNames;

    //this.bufPathToIndex= new RandomAccessBuffer() ;
  //  this.ramMemDir = new RAMDirectory();
    this.pathToIndex = pathToIndex;
    this.pathToDocumentCollection= pathToDocumentCollection;
    // this.files = files;
   // this.filesInText = docContent;

}




/**
 * Count the number of words in a given String
 *
 * @param line- Input String
 * @return - number of words in the input String
 */
private int wordCount(String line) {
    int numWords = 0;
    int index = 0;
    boolean prevWhiteSpace = true;
    while (index < line.length()) {
        char c = line.charAt(index++);
        boolean currWhiteSpace = Character.isWhitespace(c);
        if (prevWhiteSpace && !currWhiteSpace) {
            numWords++;
        }
        prevWhiteSpace = currWhiteSpace;
    }
    return numWords;
}

/*
 * Given its path, this method reads a text file into a single String.
 */
public static String fileReader(String filename) throws IOException {

    String filetext = "";   // start empty so the first line is not prefixed with "null"
    BufferedReader reader = null;
    //BufferedReader namesReader; //reader for followers
    //Extractor extractor = new Extractor();
    File inFile = new File(filename);
    //File namesFile = new File(args[1]); //get followers file 
    //File userFile = new File(args[1]);

    //READING FROM USERS FILE
    reader = new BufferedReader(new FileReader(inFile));
    String line = null;

    int numLine = 0;

    while ((line = reader.readLine()) != null) {
        // numLine++;
        filetext = filetext + " " + line;

        // System.out.println(line);
    }

    reader.close();
    return filetext;
}

/**
 * Indexes the documents using only the document content. A "docid" field is
 * stored as well, since Lucene doesn't return documents in the order they
 * were indexed.
 *
 * @throws IOException
 */
 public void indexDocs() throws IOException {
    //String pathToDocumentCollection = "F:\\karsha project\\Term Analysis\\keygraph docs\\selected_section_collection\\compelete_collection_2\\msrb_fibo_stopwords_replaced_term_docs\\";
   // String pathToIndex = "F:\\karsha project\\Term Analysis\\keygraph docs\\selected_section_collection\\compelete_collection_2\\INDEX_msrb_fibo_stopwords_replaced_term_docs";
    File folder = new File(pathToDocumentCollection);
    File[] listOfFiles = folder.listFiles();
    int noOfFiles = listOfFiles.length;
    System.out.println("Number of files : " + noOfFiles);

    IndexWriter iW;
    int indexDocCount = 0;
    try {
        NIOFSDirectory dir = new NIOFSDirectory(new File(pathToIndex));
        iW = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36)));

        for (int i = 0; i < noOfFiles; i++) {
            if (listOfFiles[i].isFile()) {
                String docName = listOfFiles[i].getName();
                System.out.println("doc name: " + docName + "length - " + listOfFiles[i].length());
                if (listOfFiles[i].length() > 1) {
                    String filesInText = fileReader(pathToDocumentCollection + docName);

                    //docIds[i] = docNames[i].substring( 0, docName.length() - 4 );
                    System.out.println("Added to index : " + docName);

                    //  StringReader strRdElt = new StringReader(filesInText[i]);
                    //filesInText = filesInText.replaceAll( "[^A-Za-z_]", " " );
                    //System.out.println( "Added to index : " + docName );
                    StringReader strRdElt = new StringReader(filesInText.replaceAll("\\d+(?:[.,]\\d+)*\\s*", ""));
                    StringReader docId = new StringReader(docName.substring(0, docName.length() - 4)); // give a unique doc Id here

                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

                    doc.add(new Field("doccontent", strRdElt, Field.TermVector.YES));
                    doc.add(new Field("docid", docId, Field.TermVector.YES));
                    iW.addDocument(doc);
                    indexDocCount++;
                }
            }
        }

        System.out.println("no of documents added to index : " + indexDocCount);

        iW.close();
        // dir.close() ;
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}



/**
 * Calculates the TF-IDF score for each term in the indexed documents.
 *
 * @param numberOfDocs number of documents in the index
 * @return map from document id to a (term -> TF-IDF score) map
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException {

    int noOfDocs = numberOfDocs;   // docNames is never populated; use the passed-in document count

    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
    //HashMap<Integer, float[]> scoreMap = new HashMap<Integer, float[]>();


    try {

        IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true) ;
       // IndexReader re = IndexReader.open(ramMemDir);

        int i = 0;
        for (int k = 0; k < numberOfDocs; k++) {
            int freq[];
            TermFreqVector termsFreq;
            TermFreqVector termsFreqDocId;
            //TermFreqVector termsFreq3[];
            HashMap<String, Float> wordMap = new HashMap<String, Float>();
            String terms[];
            float score[] = null;

            //termsFreq3=re.getTermFreqVectors(currentDocID);
            termsFreq = re.getTermFreqVector(k, "doccontent");
            termsFreqDocId = re.getTermFreqVector(k, "docid");

            int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
            freq = termsFreq.getTermFrequencies();

            terms = termsFreq.getTerms();

            int noOfTerms = terms.length;
            score = new float[noOfTerms];
            DefaultSimilarity simi = new DefaultSimilarity();
            for (i = 0; i < noOfTerms; i++) {
                int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
                // System.out.println(terms[i]+"\t"+freq[i]);
                //int noofDocsContainTerm = docsContainTerm(terms[i], "docnames");
                float tf = simi.tf(freq[i]);
                float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                wordMap.put(terms[i], (tf * idf));

            }
            scoreMap.put(aInt, wordMap);
        }

        re.close();
    } catch (IOException e) {
        // score = null;
        e.printStackTrace();
    }



    //Map<Integer,Float[]> scoreMap=new Map<Integer, Float[]>(); 


    return scoreMap;
}


public HashMap<Integer, HashMap> getTFIDF() throws IOException, CorruptIndexException, ParseException, ClassNotFoundException {
    // docNames is never filled in by the constructor, so take the document
    // count from the index itself rather than from docNames.length
    IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true);
    int noOfDocs = re.numDocs();
    re.close();

    return tfIdfScore(noOfDocs);
}

} // end of class DocIndexer
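A minimal usage sketch of the class above (both paths are placeholders; the collection path needs a trailing separator because it is concatenated directly with the file name):

// Hypothetical usage; call from a method that declares the thrown exceptions.
DocIndexer indexer = new DocIndexer("/tmp/tfidf-index", "/tmp/doc-collection/");
indexer.indexDocs();                                    // build the index
HashMap<Integer, HashMap> scores = indexer.getTFIDF();  // docId -> (term -> TF-IDF)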
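For newer Lucene releases the TermFreqVector / DefaultSimilarity APIs above no longer exist. The snippet below appears to target the Lucene 6.x APIs (ClassicSimilarity, TermStatistics, CollectionStatistics) and computes a TF-IDF score for a single term over the whole collection. The getTermFrequencyInCollection call is a helper from the same class that is not shown here.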
// Imports for the snippet (several are needed only by code not shown here,
// such as the getTermFrequencyInCollection helper):
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.util.BytesRef;

public double getTFIDFScoreInCollection(String FIELD, String word, IndexReader reader)
        throws IOException {

    IndexSearcher searcher = new IndexSearcher(reader);
    ClassicSimilarity similarity = new ClassicSimilarity();
    IndexReaderContext context = searcher.getTopReaderContext();
    CollectionStatistics collectionStats = searcher.collectionStatistics(FIELD);

    long totalDocCount = collectionStats.docCount();

    BytesRef ref = new BytesRef(word);

    // Collection-wide term frequency comes from a helper defined elsewhere in the class
    long termFreq = this.getTermFrequencyInCollection(FIELD, word);
    float tf = similarity.tf(termFreq);

    Term term = new Term(FIELD, ref);
    TermContext termContext = TermContext.build(context, term);

    TermStatistics termStats = searcher.termStatistics(term, termContext);
    long docFreq = termStats.docFreq();
    float idf = similarity.idf(docFreq, totalDocCount);

    return tf * idf;
}
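A hedged usage sketch (the index path, field name, term, and the TfIdfCalculator class name are all placeholders for whatever class holds the method above):

// Hypothetical usage; additionally needs org.apache.lucene.index.DirectoryReader,
// org.apache.lucene.store.FSDirectory and java.nio.file.Paths.
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/tfidf-index")));
double score = new TfIdfCalculator().getTFIDFScoreInCollection("doccontent", "bond", reader);
reader.close();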