Java 为了索引、搜索和解析csv推文,我尝试了这段代码
我尝试了这段代码,以便索引、搜索和解析csv推文,但当执行时,第一条推文的字段不会显示,搜索类也不起作用,请有人帮我Java 为了索引、搜索和解析csv推文,我尝试了这段代码,java,parsing,search,indexing,lucene,Java,Parsing,Search,Indexing,Lucene,我尝试了这段代码,以便索引、搜索和解析csv推文,但当执行时,第一条推文的字段不会显示,搜索类也不起作用,请有人帮我 import java.io.*; import java.util.StringTokenizer; import java.util.Scanner; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; impo
import java.io.*;
import java.util.StringTokenizer;
import java.util.Scanner;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
class TweetDoc {
protected static String ID = "id";
protected static String MAT="mat";
protected static String DATE = "date";
protected static String QUERY = "query";
protected static String USER = "user";
protected static String TEXT = "text";
TweetDoc( String i,String m, String d, String q, String u, String t) {
TweetDoc.ID=i;
TweetDoc.MAT=m;
TweetDoc.DATE = d;
TweetDoc.QUERY=q;
TweetDoc.USER = u;
TweetDoc.TEXT=t;
}}
public class Lucenetweet {
public static final String INDEX_DIR = "D:\\Tweets\\index";
public static void main(String[] args) throws CorruptIndexException, IOException {
if (args.length == 0) {
//READ FROM FILES
BufferedReader reader = null ;
int count = 0;
try {
File file = new File("D:\\Tweets\\collection\\tweets.csv");
while (file.exists()) {
System.out.println("Reading from file '" + file + "'...");
reader = new BufferedReader(new FileReader(file));
// Read every line in the file, and parse each tweet.
for (String line; (line = reader.readLine()) != null; ) {
count++; //Count number of tweets
System.out.println("Tweets = " + count);
Scanner s = new Scanner(line).useDelimiter("\",\"");
String ID=s.next();
String MAT=s.next();
String DATE = s.next();
String QUERY = s.next();
String USER = s.next();
String TEXT=s.next();
String i= TweetDoc.ID;
System.out.println("l'identificateur est: " +i);
String d=TweetDoc.DATE;
System.out.println("la date est :" +d);
String m=TweetDoc.MAT;
System.out.println("la matricule est :" +m );
String t=TweetDoc.TEXT;
System.out.println("le texte est: " +t);
TweetDoc tweet1 = new TweetDoc( ID,MAT, DATE, QUERY, USER, TEXT);
index(tweet1);
}
reader.close();
System.out.println("Current number of tweets = " + count);
//file_no++;
//file = new File("D:\\tweet\\collection\\tweet"+file_no+".csv");
}
}
catch (IOException e) {
e.printStackTrace();
}
finally {
try {
reader.close();
System.out.println("Total number of tweets = " + count);
}
catch (IOException e) {
e.printStackTrace();
}}
}
}
public static void index (TweetDoc tweet) {
File index = new File(INDEX_DIR);
IndexWriter writer = null;
try {
IndexWriterConfig indexConfig = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
writer = new IndexWriter(FSDirectory.open(index), indexConfig);
Document luceneDoc = new Document();
//luceneDoc.add(new Field("POLARITY", tweet.POLARITY, Field.Index.NO));
//luceneDoc.add(new Field("ID", TweetDoc.ID, Field.Index.NO));
luceneDoc.add(new Field("DATE", tweet.DATE, Field.Store.YES, Field.Index.NO));
//luceneDoc.add(new Field("QUERY", tweet.QUERY, Field.Index.NO));
luceneDoc.add(new Field("USER", tweet.USER, Field.Store.YES, Field.Index.NO));
luceneDoc.add(new Field("TEXT", tweet.TEXT, Field.Store.YES, Field.Index.ANALYZED));
//luceneDoc.add(new Field("ptitle", tweet.ptitle, Field.Store.YES, Field.Index.ANALYZED));
luceneDoc.setBoost((float)2.0);
writer.addDocument(luceneDoc);
} catch (Exception ex) {
ex.printStackTrace();
} finally {
if (writer !=null)
try {
writer.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static String[] search (String queryString, int topk) throws CorruptIndexException, IOException {
IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(INDEX_DIR)));
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
QueryParser queryparser = new QueryParser(Version.LUCENE_36, "TEXT", new StandardAnalyzer(Version.LUCENE_36));
try {
StringTokenizer strtok = new StringTokenizer(queryString, " ~`!@#$%^&*()_-+={[}]|:;'<>,./?\"\'\\/\n\t\b\f\r");
String querytoparse = " ";
while(strtok.hasMoreElements()) {
String token = strtok.nextToken();
querytoparse += "text:" + token;
}
Query query = queryparser.parse(querytoparse);
System.out.println(query.toString());
TopDocs results = indexSearcher.search(query, topk);
int num_results = results.scoreDocs.length;
System.out.println(num_results);
String[] returnTweets = new String[num_results];
for (int i = 0; i < num_results; i++) {
String temp = "@" + indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("USER").stringValue();
String DATE = indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("DATE").stringValue();
DATE = DATE.replace("+0000", "");
temp += ": " + indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("QUERY").stringValue();
temp += "<br/>" + DATE + " Score: " + results.scoreDocs[i].score;;
System.out.println(indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("TEXT").stringValue());
System.out.println("score: " + results.scoreDocs[i].score);
returnTweets[i] = temp;
}
return returnTweets;
} catch (Exception e) {
e.printStackTrace();
} finally {
indexSearcher.close();
}
return null;
}
}
import java.io.*;
导入java.util.StringTokenizer;
导入java.util.Scanner;
导入org.apache.lucene.analysis.standard.StandardAnalyzer;
导入org.apache.lucene.document.document;
导入org.apache.lucene.document.Field;
导入org.apache.lucene.index.corruptindexception;
导入org.apache.lucene.index.IndexReader;
导入org.apache.lucene.index.IndexWriter;
导入org.apache.lucene.index.IndexWriterConfig;
导入org.apache.lucene.index.Term;
导入org.apache.lucene.queryParser.queryParser;
导入org.apache.lucene.search.indexsearch;
导入org.apache.lucene.search.Query;
导入org.apache.lucene.search.TopDocs;
导入org.apache.lucene.store.FSDirectory;
导入org.apache.lucene.util.Version;
类TweetDoc{
受保护的静态字符串ID=“ID”;
受保护的静态字符串MAT=“MAT”;
受保护的静态字符串DATE=“DATE”;
受保护的静态字符串QUERY=“QUERY”;
受保护的静态字符串USER=“USER”;
受保护的静态字符串TEXT=“TEXT”;
TweetDoc(字符串i、字符串m、字符串d、字符串q、字符串u、字符串t){
TweetDoc.ID=i;
TweetDoc.MAT=m;
TweetDoc.DATE=d;
TweetDoc.QUERY=q;
TweetDoc.USER=u;
TweetDoc.TEXT=t;
}}
公共类LuceNet{
公共静态最终字符串索引\u DIR=“D:\\Tweets\\INDEX”;
公共静态void main(字符串[]args)引发CorruptIndexException,IOException{
如果(args.length==0){
//从文件中读取
BufferedReader reader=null;
整数计数=0;
试一试{
File File=新文件(“D:\\Tweets\\collection\\Tweets.csv”);
while(file.exists()){
System.out.println(“从文件读取“+”文件+“…”);
reader=newbufferedreader(newfilereader(file));
//读取文件中的每一行,并解析每条推文。
for(字符串行;(line=reader.readLine())!=null;){
count++;//统计推文的数量
System.out.println(“Tweets=“+count”);
扫描器s=新扫描器(行)。使用分隔符(“\”,\”);
字符串ID=s.next();
字符串MAT=s.next();
字符串日期=s.next();
字符串查询=s.next();
字符串USER=s.next();
字符串TEXT=s.next();
字符串i=TweetDoc.ID;
System.out.println(“l'identicateur est:+i”);
字符串d=TweetDoc.DATE;
系统输出打印项次(“la日期测试:+d);
字符串m=TweetDoc.MAT;
系统输出println(“矩阵估计:+m”);
字符串t=TweetDoc.TEXT;
System.out.println(“文本测试:+t”);
TweetDoc tweet1=新的TweetDoc(ID、MAT、日期、查询、用户、文本);
索引(tweet1);
}
reader.close();
System.out.println(“当前推文数=“+count”);
//文件_no++;
//文件=新文件(“D:\\tweet\\collection\\tweet”+文件号+“.csv”);
}
}
捕获(IOE异常){
e、 printStackTrace();
}
最后{
试一试{
reader.close();
System.out.println(“推文总数=”+计数);
}
捕获(IOE异常){
e、 printStackTrace();
}}
}
}
公共静态无效索引(TweetDoc tweet){
文件索引=新文件(索引\目录);
IndexWriter writer=null;
试试{
IndexWriterConfig indexConfig=新的IndexWriterConfig(Version.LUCENE_36,新的StandardAnalyzer(Version.LUCENE_36));
writer=newindexwriter(FSDirectory.open(index),indexConfig);
Document luceneDoc=新文档();
//添加(新字段(“POLARITY”,tweet.POLARITY,Field.Index.NO));
//luceneDoc.add(新字段(“ID”,TweetDoc.ID,Field.Index.NO));
添加(新字段(“日期”,tweet.DATE,Field.Store.YES,Field.Index.NO));
//添加(新字段(“QUERY”,tweet.QUERY,Field.Index.NO));
添加(新字段(“用户”,tweet.USER,Field.Store.YES,Field.Index.NO));
添加(新字段(“TEXT”,tweet.TEXT,Field.Store.YES,Field.Index.analysis));
//添加(新字段(“ptitle”,tweet.ptitle,Field.Store.YES,Field.Index.analysis));
luceneDoc.setBoost((float)2.0);
writer.addDocument(luceneDoc);
}捕获(例外情况除外){
例如printStackTrace();
}最后{
if(writer!=null)
试一试{
writer.close();
}捕获(腐蚀异常e){
e、 printStackTrace();
}捕获(IOE异常){
e、 printStackTrace();
}
}
}
公共静态字符串[]搜索(字符串queryString,int-topk)抛出corruptindexeception,IOException{
IndexReader IndexReader=IndexReader.open(FSDirectory.open(新文件(INDEX_DIR));
IndexSearcher IndexSearcher=新的IndexSearcher(indexReader);
QueryParser QueryParser=新的QueryParser(Version.LUCENE_36,“TEXT”,新的StandardAnalyzer(Version.LUCENE_36));
试一试{
StringTokenizer strtok=新的StringTokenizer(查询字符串,“~`!@$%^&*()”、./?\“\'\/\n\t\b\f\r”);
字符串querytoparse=“”;
while(strtok.hasMoreElements()){