Java: I tried this code to index, search and parse CSV tweets


I tried this code in order to index, search and parse CSV tweets, but when it runs, the fields of the first tweet are not displayed and the search class does not work either. Can someone please help me?

import java.io.*;
import java.util.StringTokenizer;
import java.util.Scanner;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

class TweetDoc {
    protected static String ID = "id";
    protected static String MAT = "mat";
    protected static String DATE = "date";
    protected static String QUERY = "query";
    protected static String USER = "user";
    protected static String TEXT = "text";

    TweetDoc(String i, String m, String d, String q, String u, String t) {
        TweetDoc.ID = i;
        TweetDoc.MAT = m;
        TweetDoc.DATE = d;
        TweetDoc.QUERY = q;
        TweetDoc.USER = u;
        TweetDoc.TEXT = t;
    }
}
public class Lucenetweet {
    public static final String INDEX_DIR = "D:\\Tweets\\index";
    public static void main(String[] args) throws CorruptIndexException, IOException {

        if (args.length == 0) {
            // READ FROM FILES
            BufferedReader reader = null;
            int count = 0;

            try {
                File file = new File("D:\\Tweets\\collection\\tweets.csv");
                while (file.exists()) {
                    System.out.println("Reading from file '" + file + "'...");
                    reader = new BufferedReader(new FileReader(file));
                    // Read every line in the file, and parse each tweet.
                    for (String line; (line = reader.readLine()) != null; ) {
                        count++; // Count number of tweets
                        System.out.println("Tweets = " + count);
                        Scanner s = new Scanner(line).useDelimiter("\",\"");
                        String ID = s.next();
                        String MAT = s.next();
                        String DATE = s.next();
                        String QUERY = s.next();
                        String USER = s.next();
                        String TEXT = s.next();

                        String i = TweetDoc.ID;
                        System.out.println("l'identificateur est: " + i);
                        String d = TweetDoc.DATE;
                        System.out.println("la date est :" + d);
                        String m = TweetDoc.MAT;
                        System.out.println("la matricule est :" + m);
                        String t = TweetDoc.TEXT;
                        System.out.println("le texte est: " + t);

                        TweetDoc tweet1 = new TweetDoc(ID, MAT, DATE, QUERY, USER, TEXT);
                        index(tweet1);
                    }

                    reader.close();
                    System.out.println("Current number of tweets = " + count);
                    //file_no++;
                    //file = new File("D:\\tweet\\collection\\tweet"+file_no+".csv");
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    reader.close();
                    System.out.println("Total number of tweets = " + count);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
    public static void index(TweetDoc tweet) {
        File index = new File(INDEX_DIR);
        IndexWriter writer = null;

        try {
            IndexWriterConfig indexConfig = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
            writer = new IndexWriter(FSDirectory.open(index), indexConfig);
            Document luceneDoc = new Document();
            //luceneDoc.add(new Field("POLARITY", tweet.POLARITY, Field.Index.NO));
            //luceneDoc.add(new Field("ID", TweetDoc.ID, Field.Index.NO));
            luceneDoc.add(new Field("DATE", tweet.DATE, Field.Store.YES, Field.Index.NO));
            //luceneDoc.add(new Field("QUERY", tweet.QUERY, Field.Index.NO));
            luceneDoc.add(new Field("USER", tweet.USER, Field.Store.YES, Field.Index.NO));
            luceneDoc.add(new Field("TEXT", tweet.TEXT, Field.Store.YES, Field.Index.ANALYZED));
            //luceneDoc.add(new Field("ptitle", tweet.ptitle, Field.Store.YES, Field.Index.ANALYZED));
            luceneDoc.setBoost((float) 2.0);
            writer.addDocument(luceneDoc);
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            if (writer != null)
                try {
                    writer.close();
                } catch (CorruptIndexException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
        }
    }
    public static String[] search(String queryString, int topk) throws CorruptIndexException, IOException {

        IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(INDEX_DIR)));
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        QueryParser queryparser = new QueryParser(Version.LUCENE_36, "TEXT", new StandardAnalyzer(Version.LUCENE_36));
        try {
            StringTokenizer strtok = new StringTokenizer(queryString, " ~`!@#$%^&*()_-+={[}]|:;'<>,./?\"\'\\/\n\t\b\f\r");
            String querytoparse = " ";
            while (strtok.hasMoreElements()) {
                String token = strtok.nextToken();
                querytoparse += "text:" + token;
            }
            Query query = queryparser.parse(querytoparse);
            System.out.println(query.toString());
            TopDocs results = indexSearcher.search(query, topk);
            int num_results = results.scoreDocs.length;
            System.out.println(num_results);
            String[] returnTweets = new String[num_results];
            for (int i = 0; i < num_results; i++) {
                String temp = "@" + indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("USER").stringValue();
                String DATE = indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("DATE").stringValue();
                DATE = DATE.replace("+0000", "");
                temp += ": " + indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("QUERY").stringValue();
                temp += "<br/>" + DATE + "    Score: " + results.scoreDocs[i].score;
                System.out.println(indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("TEXT").stringValue());
                System.out.println("score: " + results.scoreDocs[i].score);
                returnTweets[i] = temp;
            }

            return returnTweets;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            indexSearcher.close();
        }
        return null;
    }
}
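
For context, two details in the code above line up with the symptoms described in the question: the TweetDoc fields are static and are printed before the constructor runs, so only the default values ("id", "date", ...) appear for the first tweet; and the query is built against a lower-case field name ("text:") while the analysed field was indexed as "TEXT", and Lucene field names are case sensitive. (Note also that search() reads a "QUERY" field that index() never stores.) The fragments below are a minimal sketch of how the first two spots could look instead, assuming Lucene 3.6 and keeping the original variable names; they are meant to be read against the code above, not as a complete program.

// Sketch: instance fields instead of static ones, so each TweetDoc keeps its own
// values and index() can keep reading tweet.DATE, tweet.USER, tweet.TEXT unchanged.
class TweetDoc {
    final String ID, MAT, DATE, QUERY, USER, TEXT;

    TweetDoc(String i, String m, String d, String q, String u, String t) {
        this.ID = i;
        this.MAT = m;
        this.DATE = d;
        this.QUERY = q;
        this.USER = u;
        this.TEXT = t;
    }
}

// Inside the read loop in main(): build the tweet first, then print its own values.
TweetDoc tweet1 = new TweetDoc(ID, MAT, DATE, QUERY, USER, TEXT);
System.out.println("l'identificateur est: " + tweet1.ID);
System.out.println("la date est : " + tweet1.DATE);
System.out.println("la matricule est : " + tweet1.MAT);
System.out.println("le texte est: " + tweet1.TEXT);
index(tweet1);

// Inside search(): query the same field name that was indexed ("TEXT") and separate
// the tokens with spaces so the QueryParser sees one clause per term.
StringBuilder querytoparse = new StringBuilder();
while (strtok.hasMoreTokens()) {
    querytoparse.append("TEXT:").append(strtok.nextToken()).append(' ');
}
Query query = queryparser.parse(querytoparse.toString().trim());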