Java Lucene:-索引和查找唯一术语
我用 Lucene 编写了一段代码,它首先为 XML 文档建立索引,然后统计索引中唯一术语的数量。假设共有 n 个唯一术语,我想生成一个 n×n 的矩阵,其中:(标签:java, lucene, memory-efficient)
m[i][j] = (co_occurrence value of terms (i, j))/ (occurrence value of term i)
术语 (i, j) 的共现值(co_occurrence)= 第 i 个术语和第 j 个术语同时出现的文档数量
术语 j 的出现值(occurrence)= 包含术语 j 的文档数量
我的代码运行良好。但这并不高效。对于大量文件,其中术语数量超过2000个,需要10分钟以上
这是我用来计算共现值(co_occurrence)的代码:
/**
 * Counts the documents in which both terms occur.
 *
 * <p>A document counts as containing a term when the term text appears in
 * any of the fields reported by {@code reader.getFieldNames(FieldOption.ALL)};
 * the two terms do not have to appear in the same field.
 *
 * <p>Document-id sets are sized from {@code reader.maxDoc()} instead of a
 * fixed 6000-slot array, so indexes with more than 6000 documents no longer
 * fail with an {@code ArrayIndexOutOfBoundsException}.
 *
 * @param reader   open index reader to query
 * @param term_one text of the first term
 * @param term_two text of the second term
 * @return number of distinct documents that contain both terms
 * @throws IOException if reading the index fails
 */
int cooccurrence(IndexReader reader, String term_one, String term_two) throws IOException {
    java.util.BitSet docsWithBoth = docsContainingTerm(reader, term_one);
    // Intersect in place: only ids present in both sets survive.
    docsWithBoth.and(docsContainingTerm(reader, term_two));
    return docsWithBoth.cardinality();
}

/**
 * Collects the ids of all documents that contain {@code termText} in at
 * least one field of the index.
 *
 * @param reader   open index reader to query
 * @param termText text of the term to look up in every field
 * @return bit set with one bit set per matching document id
 * @throws IOException if reading the index fails
 */
private java.util.BitSet docsContainingTerm(IndexReader reader, String termText) throws IOException {
    java.util.BitSet docs = new java.util.BitSet(reader.maxDoc());
    for (String fieldName : reader.getFieldNames(FieldOption.ALL)) {
        TermDocs postings = reader.termDocs(new Term(fieldName, termText));
        try {
            while (postings.next()) {
                // Setting a bit twice is harmless, so a document that matches
                // in several fields is still counted once.
                docs.set(postings.doc());
            }
        } finally {
            // The original never released the enumerator.
            postings.close();
        }
    }
    return docs;
}
int-coccurrence(索引读取器读取器、字符串项_-one、字符串项_-two)引发IOException{
int common_doc_no=0,finaldocno_one=0,finaldocno_two=0;
int termdocid_one[]=新int[6000];
int termdocid_two[]=新int[6000];
int first_docids[]=新int[6000];
int second_docids[]=新int[6000];
int k=0;
for(java.util.Iterator)it=reader.getFieldNames(
FieldOption.ALL).iterator();it.hasNext();){
String fieldname=(String)it.next();
TermDocs t=reader.TermDocs(新术语(字段名,术语_one));
while(t.next()){
intx=t.doc();
如果(termdocid_one[x]!=1){
finaldocno__one++;
第一个文档ID[k]=x;
k++;
}
termdocid_one[x]=1;
}
}
/*
*System.out.println(“finaldoc\u one的值-”+finaldoc No\u one);用于
*(inti=0;i最后一次){
最大值=最后一次;
搜索=1;
}否则{
最大值=最终时刻2;
搜索=2;
}
如果(搜索==1){
对于(int i=0;i
知识矩阵计算代码:-
/**
 * Fills {@code matrix} with normalised co-occurrence values for every pair
 * of unique terms in the index, and {@code avg_matrix} with the same values
 * centred on their row mean.
 *
 * <p>Rows and columns are 1-based: for {@code 1 <= i, j <= n} (n = number of
 * unique terms),
 * <ul>
 *   <li>{@code matrix[i][j] = cooccurrence(term_i, term_j) / occurrence(term_j)}</li>
 *   <li>{@code matrix[i][0]} = mean of row {@code i} over columns {@code 1..n}</li>
 *   <li>{@code avg_matrix[i][j] = matrix[i][j] - matrix[i][0]}</li>
 * </ul>
 *
 * <p>NOTE(review): the divisor is the occurrence count of term <em>j</em>,
 * exactly as in the previous implementation; confirm this is intended if the
 * specification calls for the occurrence of term <em>i</em>.
 *
 * @param matrix     output, must be at least {@code (n + 1) x (n + 1)}
 * @param reader     open index reader to query
 * @param avg_matrix output, must be at least {@code (n + 1) x (n + 1)}
 * @throws IOException if reading the index fails
 */
void knowledge_matrix(double matrix[][], IndexReader reader, double avg_matrix[][]) throws IOException {
    ArrayList<String> unique_terms_array = new ArrayList<>();
    int totallength = unique_term_count(reader, unique_terms_array);

    // The occurrence count depends only on the column's term, so look it up
    // once per term instead of once per (row, column) pair.
    // Assumes docno_single_term is a side-effect-free index lookup — TODO confirm.
    int[] occurrence = new int[totallength + 1];
    for (int j = 1; j <= totallength; j++) {
        occurrence[j] = docno_single_term(reader, unique_terms_array.get(j - 1));
    }

    // Co-occurrence is symmetric: compute it for j >= i and remember the
    // mirrored cell so the lower triangle can be read back without re-querying.
    int[][] co_occur_matrix = new int[totallength + 1][totallength + 1];

    for (int i = 1; i <= totallength; i++) {
        double rowsum = 0;
        for (int j = 1; j <= totallength; j++) {
            int co_occurence;
            if (i > j) {
                co_occurence = co_occur_matrix[i][j];
            } else {
                co_occurence = cooccurrence(reader,
                        unique_terms_array.get(i - 1),
                        unique_terms_array.get(j - 1));
                co_occur_matrix[j][i] = co_occurence;
            }
            // Float division is kept deliberately so the stored values are
            // bit-for-bit the same as before this refactoring.
            matrix[i][j] = (float) co_occurence / (float) occurrence[j];
            rowsum += matrix[i][j];
        }
        matrix[i][0] = rowsum / totallength;

        // The row mean is now known, so centre the row immediately rather
        // than deferring it to the next row's iteration.
        for (int j = 1; j <= totallength; j++) {
            avg_matrix[i][j] = matrix[i][j] - matrix[i][0];
        }
    }
}
void knowledge_matrix(双矩阵[][],索引读取器读取器,双平均矩阵[][])引发IOException{
ArrayList unique_terms_array=新ArrayList();
int totalength=唯一项计数(读卡器、唯一项数组);
int co_出现_矩阵[][]=新int[totallength+3][totallength+3];
双行和=0;
对于(int i=1;i 1)
{
avg_矩阵[i-1][j]=矩阵[i-1][j]-矩阵[i-1][0];
}
}
矩阵[i][0]=行和/总长度;
}
对于(int j=1;j
我认为您可以把 term_one 和 term_two 的查找过程合并到同一个 for 循环中。您可以使用两个 HashSet 来保存找到的文档 ID,
然后调用 termOneSet.retainAll(termTwoSet)
来得到同时包含 term_one 和 term_two 的文档。