Java Lucene tika索引失败
我使用Tika编写了一个索引示例(大部分取自《Lucene in Action》电子书)。但它根本没有为文档建立索引。编译和运行时都没有报错。我尝试为.pdf、.ppt、.doc甚至.txt文档建立索引,都没有效果:搜索时返回0个命中结果,尽管我搜索的是文档中确实存在的单词。请看一下代码:Java Lucene tika索引失败,java,lucene,indexing,apache-tika,Java,Lucene,Indexing,Apache Tika,我使用Tika编写了一个索引示例(大部分取自《Lucene in Action》电子书)。但它根本没有为文档建立索引。编译和运行时都没有报错。我尝试为.pdf、.ppt、.doc甚至.txt文档建立索引,都没有效果:搜索时返回0个命中结果,尽管我搜索的是文档中确实存在的单词。请看一下代码: public class TikaIndexer extends Indexer { private boolean DEBUG = false; static Set textualMetadataFields = new H
/**
 * An {@code Indexer} that uses Apache Tika to turn arbitrary files into Lucene documents.
 *
 * <p>Every file is accepted. The text extracted by Tika is indexed (but not stored) in the
 * {@code "contents"} field. Each metadata entry is stored under its own name without being
 * indexed, and the entries listed in {@link #textualMetadataFields} are additionally indexed
 * into {@code "contents"} so that they are searchable together with the body text.
 */
public class TikaIndexer extends Indexer {

    /** Flip to {@code true} to print the extracted text and metadata of each file to stdout. */
    private static final boolean DEBUG = false;

    /** Write limit handed to {@code BodyContentHandler} when extracting the body text. */
    private static final int MAX_BODY_CHARS = 10 * 1024 * 1024;

    /** Names of the metadata entries whose values are also indexed into {@code "contents"}. */
    static final Set<String> textualMetadataFields = new HashSet<String>();

    static {
        textualMetadataFields.add(Metadata.TITLE);
        textualMetadataFields.add(Metadata.AUTHOR);
        textualMetadataFields.add(Metadata.COMMENTS);
        textualMetadataFields.add(Metadata.KEYWORDS);
        textualMetadataFields.add(Metadata.DESCRIPTION);
        textualMetadataFields.add(Metadata.SUBJECT);
    }

    /**
     * Creates an indexer that delegates index handling to the superclass.
     *
     * @param indexDir index location, passed unchanged to the {@code Indexer} constructor
     * @throws IOException if the superclass constructor fails
     */
    public TikaIndexer(String indexDir) throws IOException {
        super(indexDir);
    }

    /**
     * Accepts every file; no filtering is applied before a file is handed to Tika.
     *
     * @param f candidate file
     * @return always {@code true}
     */
    @Override
    protected boolean acceptFile(File f) {
        return true;
    }

    /**
     * Parses {@code f} with Tika's auto-detecting parser and builds the Lucene document for it.
     *
     * @param f file to parse
     * @return document holding the analyzed body text and the stored metadata entries
     * @throws Exception if the file cannot be read or Tika fails to parse it
     */
    @Override
    protected Document getDocument(File f) throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, f.getCanonicalPath());

        // Build the parser and handler before opening the file so that a failure here
        // cannot leave an open stream behind.
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler(MAX_BODY_CHARS);

        InputStream is = new FileInputStream(f);
        try {
            parser.parse(is, handler, metadata, new ParseContext());
        } finally {
            is.close();
        }

        String bodyText = handler.toString();

        Document doc = new Document();
        doc.add(new Field("contents", bodyText, Field.Store.NO, Field.Index.ANALYZED));
        if (DEBUG) {
            System.out.println(" intregul textt: " + bodyText);
        }

        for (String name : metadata.names()) {
            String value = metadata.get(name);
            if (value == null) {
                // Field rejects null values; skip the entry instead of failing the whole file.
                continue;
            }
            if (textualMetadataFields.contains(name)) {
                // Make the textual metadata searchable alongside the body text.
                doc.add(new Field("contents", value, Field.Store.NO, Field.Index.ANALYZED));
            }
            doc.add(new Field(name, value, Field.Store.YES, Field.Index.NO));
            if (DEBUG) {
                System.out.println(" " + name + ": " + value);
            }
        }
        if (DEBUG) {
            System.out.println();
        }
        return doc;
    }
}
以及主类:
/**
 * Prints the media types supported by the default Tika configuration, then indexes every
 * file under {@code src/filesDirectory} into {@code src/indexDirectory} and reports how many
 * files were indexed and how long it took.
 *
 * <p>Any failure is reported on stdout and logged at {@code SEVERE} level with its stack trace.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args)
{
    String indexDir = "src/indexDirectory";
    String dataDir = "src/filesDirectory";
    try
    {
        TikaConfig config = TikaConfig.getDefaultConfig();
        List<MediaType> parsers =
                new ArrayList<MediaType>(config.getParser().getSupportedTypes(new ParseContext()));
        Collections.sort(parsers);
        System.out.println(parsers.size());
        System.out.println("Tipuri de parsere:");
        for (MediaType type : parsers) {
            System.out.println(" " + type);
        }
        System.out.println();

        long start = System.currentTimeMillis();
        TikaIndexer indexer = new TikaIndexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir);
        } finally {
            // Without closing the indexer the added documents are presumably never committed,
            // which leaves an index directory that searches (and Luke) cannot open.
            // NOTE(review): assumes Indexer exposes close(), as the Lucene in Action Indexer
            // does -- confirm against the local Indexer class.
            indexer.close();
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexarea a " + numIndexed + " fisiere a durat "
                + (end - start) + " milisecunde.");
        System.out.println();
        System.out.println("--------------------------------------------------------------");
        System.out.println();
    }
    catch (Exception ex)
    {
        System.out.println("Nu s-a putut realiza indexarea: ");
        // The logger already prints the stack trace; a separate printStackTrace() duplicated it.
        Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
    }
}
public static void main(String args[])
{
String indexDir = "src/indexDirectory";
String dataDir = "src/filesDirectory";
try
{
TikaConfig config = TikaConfig.getDefaultConfig();
List<MediaType> parsers = new ArrayList(config.getParser().getSupportedTypes(new ParseContext())); //3
Collections.sort(parsers);
Iterator<MediaType> it = parsers.iterator();
System.out.println(parsers.size());
System.out.println("Tipuri de parsere:");
while (it.hasNext()) {
System.out.println(" " + it.next());
}
System.out.println();
long start = new Date().getTime();
TikaIndexer indexer = new TikaIndexer(indexDir);
int numIndexed = indexer.index(dataDir);
long end = new Date().getTime();
System.out.println("Indexarea a " + numIndexed + " fisiere a durat "
+ (end - start) + " milisecunde.");
System.out.println();
System.out.println("--------------------------------------------------------------");
System.out.println();
}
catch (Exception ex)
{
System.out.println("Nu s-a putut realiza indexarea: ");
ex.printStackTrace();
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
}
}
如果您使用Luke查看索引,内容是否正确显示?您的意思是,只用Lucene索引.txt文件的情况吗?是的,那样Lucene可以正常工作,但我需要在搜索中包含其他类型的文档。Gagravarr的意思是您应该下载Luke,它是一个可以连接到索引并帮助您诊断问题的GUI客户端。@Gagravarr 我试过了:我只使用Tika建立索引,然后用Luke打开索引目录,结果报“无效目录”错误。我又用普通的Lucene Indexer建立了一个默认索引,这时Luke就能显示索引的信息了……有什么想法吗?听起来您的问题出在索引环节,而不是Tika。我建议您多读一些Lucene教程(甚至可以买本书——它真的很好!),进一步了解Lucene索引的工作原理。这样您应该就能找出自己哪里做错了。