Java 更新apachelucene索引文件
我正在使用ApacheLucene库为我的网站创建搜索功能。该网站的所有内容都来自Sharepoint RSSFeeds,因此每次我都必须浏览所有RSSFeed URL并阅读内容。为了加快搜索功能,我创建了一个计划任务,每一小时进行一次索引:Java 更新apachelucene索引文件,java,search,lucene,Java,Search,Lucene,我正在使用ApacheLucene库为我的网站创建搜索功能。该网站的所有内容都来自Sharepoint RSSFeeds,因此每次我都必须浏览所有RSSFeed URL并阅读内容。为了加快搜索功能,我创建了一个计划任务,每一小时进行一次索引: <bean id="rssIndexerService" class="com.lloydsbanking.webmi.service.RSSIndexerService" /> <task:scheduled-tasks>
<bean id="rssIndexerService" class="com.lloydsbanking.webmi.service.RSSIndexerService" />
<task:scheduled-tasks> <task scheduled ref="rssIndexerService" method="indexUrls" cron="0 0 * * * MON-FRI" /></task:scheduled-tasks>
问题是,如果我创建了一个新内容,那么在服务器运行时和调用计划任务后,搜索不会显示新内容,而且如果我删除了一个条目,它也不会显示从索引文件中删除的条目。以下是索引代码:
@Service
public class RSSIndexerService extends RSSReader {
@Autowired
private RSSFeedUrl rssFeedUrl;
private IndexWriter indexWriter = null;
private String indexPath = "C:\\MI\\index";
Logger log = Logger.getLogger(RSSIndexerService.class.getName());
public void indexUrls() throws IOException {
Date start = new Date();
IndexWriter writer = getIndexWriter();
log.info("Reading all the Urls in the Sharepoint");
Iterator<Entry<String, String>> entries = rssFeedUrl.getUrlMap().entrySet().iterator();
try {
while (entries.hasNext()) {
Entry<String, String> mapEntry = entries.next();
String url = mapEntry.getValue();
SyndFeed feed = rssReader(url);
for (Object entry : feed.getEntries()) {
SyndEntry syndEntry = (SyndEntry) entry;
SyndContent desc = syndEntry.getDescription();
if (desc != null) {
String text = desc.getValue();
if ("text/html".equals(desc.getType())) {
Document doc = new Document();
text = extractText(text);
Field fieldTitle = new StringField("title", syndEntry.getTitle(), Field.Store.YES);
doc.add(fieldTitle);
Field pathField = new StringField("path", url, Field.Store.YES);
doc.add(pathField);
doc.add(new TextField("contents", text, Field.Store.YES));
// New index, so we just add the document (no old document can be there):
writer.addDocument(doc);
}
}
}
}
} finally {
// closeIndexWriter();
}
Date end = new Date();
log.info(end.getTime() - start.getTime() + " total milliseconds");
}
public IndexWriter getIndexWriter() throws IOException {
if (indexWriter == null) {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
log.info("Indexing to directory '" + indexPath + "'...");
Directory dir = FSDirectory.open(new File(indexPath));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
config.setOpenMode(OpenMode.CREATE_OR_APPEND);
indexWriter = new IndexWriter(dir, config);
}
return indexWriter;
}
@PreDestroy
public void closeIndexWriter() throws IOException {
if (indexWriter != null) {
System.out.println("Done with indexing ...");
indexWriter.close();
}
}
}
@服务
公共类RSSIndexerService扩展了RSSReader{
@自动连线
私有RSSFeedUrl RSSFeedUrl;
私有IndexWriter IndexWriter=null;
私有字符串indexPath=“C:\\MI\\index”;
Logger log=Logger.getLogger(RSSIndexerService.class.getName());
public void indexUrls()引发IOException{
开始日期=新日期();
IndexWriter writer=getIndexWriter();
log.info(“读取Sharepoint中的所有URL”);
迭代器条目=rssFeedUrl.getUrlMap().entrySet().Iterator();
试一试{
while(entries.hasNext()){
Entry mapEntry=entries.next();
字符串url=mapEntry.getValue();
SyndFeed feed=rssReader(url);
for(对象项:feed.getEntries()){
SyndEntry SyndEntry=(SyndEntry)条目;
SyndContent desc=syndEntry.getDescription();
如果(描述!=null){
字符串文本=desc.getValue();
if(“text/html”.equals(desc.getType())){
单据单据=新单据();
text=提取文本(text);
Field fieldTitle=new StringField(“title”,syndEntry.getTitle(),Field.Store.YES);
文件添加(字段标题);
Field pathField=new StringField(“路径”,url,Field.Store.YES);
文件添加(路径域);
doc.add(新文本字段(“contents”,text,Field.Store.YES));
//新索引,因此我们只需添加文档(没有旧文档):
writer.addDocument(doc);
}
}
}
}
}最后{
//closeIndexWriter();
}
日期结束=新日期();
log.info(end.getTime()-start.getTime()+“总毫秒”);
}
公共IndexWriter getIndexWriter()引发IOException{
if(indexWriter==null){
Analyzer Analyzer=新的标准分析仪(LUCENE_47版);
info(“对目录的索引”'+indexath+“…”);
Directory dir=FSDirectory.open(新文件(indexPath));
IndexWriterConfig配置=新的IndexWriterConfig(Version.LUCENE_47,analyzer);
config.setOpenMode(OpenMode.CREATE\u或\u APPEND);
indexWriter=新的indexWriter(目录,配置);
}
返回索引器;
}
@发情前期
public void closeIndexWriter()引发IOException{
if(indexWriter!=null){
System.out.println(“完成索引…”);
indexWriter.close();
}
}
}
我知道问题可能是由config.setOpenMode(OpenMode.CREATE_或_APPEND)引起的;,但是我不知道如何解决它。我想出了一个主意,先检查目录是否为空,如果不是空,然后删除以前的索引,然后每次在OpenMode中进行索引。创建:
File path = new File(System.getProperty("java.io.tmpdir")+"\\index");
Directory dir = FSDirectory.open(path);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
if (path.list() != null) {
log.info("Delete previous indexes ...");
FileUtils.cleanDirectory(path);
}
config.setOpenMode(OpenMode.CREATE);
然后我简单地使用addDocument():
if ("text/html".equals(desc.getType())) {
...
// New index, so we just add the document (no old document can be there):
indexWriter.addDocument(doc);
}