Java 正则表达式查询如何在lucene上工作?
标签：java, lucene。我正在尝试在我的应用程序中实现 Lucene 搜索引擎，使用的版本是 Lucene 5.4.1。我已经成功实现了 Lucene 的通配符查询（WildcardQuery）和普通查询，但我主要关注的是使用正则表达式模式在文本文件中搜索特定文本。索引编写器代码：
/**
 * Rebuilds the Lucene index for the documents under {@code docsPath}.
 *
 * <p>The index lives in {@code <java.io.tmpdir>/indexDirectory} and is always opened in
 * {@link OpenMode#CREATE} mode, i.e. any previous index at that location is replaced.
 *
 * <p>The writer is registered through {@code setIndexWriter} and then closed before this
 * method returns, so the value returned by {@code getIndexWriter()} refers to an
 * already-closed writer when indexing succeeded.
 *
 * @param docsPath file or directory whose contents are indexed
 * @return whatever {@code getIndexWriter()} yields after the indexing attempt
 * @throws IOException if {@code java.io.tmpdir} is unset or the index directory cannot be created
 */
public IndexWriter generateIndex(String docsPath) throws IOException {
    // Check the property itself: the concatenated path can never be null, so testing it
    // (as the code used to) would silently produce a path starting with "null".
    String tmpRoot = System.getProperty("java.io.tmpdir");
    if (tmpRoot == null) {
        throw new IOException("System property 'java.io.tmpdir' does not specify a tmp dir");
    }
    String indexPath = tmpRoot + File.separator + "indexDirectory";

    File tmpDir = new File(indexPath);
    if (!tmpDir.exists() && !tmpDir.mkdirs()) {
        throw new IOException("Unable to create tmp dir " + tmpDir);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        // NOTE(review): terminating the JVM from a reusable method is harsh; kept because
        // callers may rely on it. Consider throwing IOException instead.
        System.exit(1);
    }

    long startMillis = System.currentTimeMillis();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");
        // The Directory is deliberately left open: it stays reachable through the writer
        // that is handed to setIndexWriter().
        Directory dir = FSDirectory.open(Paths.get(indexPath));
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(OpenMode.CREATE);

        // try-with-resources guarantees the writer (and its write lock) is released even
        // when indexDocs() fails part-way through.
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            indexDocs(writer, docDir);
            setIndexWriter(writer);
            System.out.println(System.currentTimeMillis() - startMillis + " total milliseconds");
        }
    } catch (IOException e) {
        // Best-effort: indexing failures are reported but not propagated.
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
    return getIndexWriter();
}
/**
 * Indexes a single file, or every file found beneath a directory.
 *
 * <p>When {@code path} is a directory the tree is walked and each visited file is passed to
 * {@code indexDoc}; an {@link IOException} raised for an individual file is ignored and the
 * walk continues. When {@code path} is not a directory it is indexed directly and any
 * {@link IOException} propagates to the caller.
 *
 * @param writer writer that receives the documents
 * @param path   file or directory to index
 * @throws IOException if the single file cannot be indexed or the tree walk itself fails
 */
static void indexDocs(final IndexWriter writer, Path path) throws IOException {
    if (!Files.isDirectory(path)) {
        indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
        return;
    }

    SimpleFileVisitor<Path> visitor = new SimpleFileVisitor<Path>() {
        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
            long modifiedMillis = attrs.lastModifiedTime().toMillis();
            try {
                indexDoc(writer, file, modifiedMillis);
            } catch (IOException ignored) {
                // Unreadable files are skipped on purpose; the rest of the tree is still indexed.
            }
            return FileVisitResult.CONTINUE;
        }
    };
    Files.walkFileTree(path, visitor);
}
/**
 * Adds (or updates) one file as a Lucene document.
 *
 * <p>Fields written:
 * <ul>
 *   <li>{@code path} - the file path as a single un-tokenized term; stored so that search
 *       results can report which file matched</li>
 *   <li>{@code modified} - the last-modified timestamp in milliseconds, not stored</li>
 *   <li>{@code contents} - the file body, read as UTF-8 and tokenized by the writer's analyzer</li>
 * </ul>
 *
 * @param writer       writer that receives the document
 * @param file         file to index
 * @param lastModified last-modified time of {@code file} in milliseconds
 * @throws IOException if the file cannot be opened or the writer fails
 */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        Document doc = new Document();

        // Must be stored: the search side reads it back with doc.get("path"), which
        // returns null for a field indexed with Field.Store.NO.
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        // Reader-based TextField: the text is tokenized and indexed but not stored.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // Fresh index: no older copy of this document can exist.
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index: replace any document previously indexed under the same path.
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
public IndexWriter generateIndex(字符串docsPath)引发IOException{
字符串indexPath=System.getProperty(“java.io.tmpdir”)+File.separator+“indexDirectory”;
if(indepath==null){
抛出新IOException(“系统属性'java.io.tmpdir'未指定tmp目录”);
}
File tmpDir=新文件(indexPath);
如果(!tmpDir.exists()){
boolean created=tmpDir.mkdirs();
如果(!已创建){
抛出新IOException(“无法创建tmp目录”+tmpDir);
}
}
布尔创建=真;
最终路径docDir=Path.get(docsPath);
如果(!Files.isReadable(docDir)){
System.out.println(“文档目录””+docDir.toabsolutionPath()
+“'不存在或不可读,请检查路径”);
系统出口(1);
}
开始日期=新日期();
试一试{
println(“对目录的索引”'+indexath+“…”);
Directory dir=FSDirectory.open(path.get(indexPath));
Analyzer Analyzer=新的StandardAnalyzer();
IndexWriterConfig iwc=新的IndexWriterConfig(分析器);
如果(创建){
iwc.setOpenMode(OpenMode.CREATE);
}否则{
iwc.setOpenMode(OpenMode.CREATE_或_APPEND);
}
IndexWriter writer=新的IndexWriter(dir,iwc);
indexDocs(作者,docDir);
setIndexWriter(写入器);
日期结束=新日期();
System.out.println(end.getTime()-start.getTime()+“总毫秒”);
writer.close();
}捕获(IOE异常){
System.out.println(“捕获了一个“+e.getClass()+”\n,消息为“+e.getMessage()”);
}
返回getIndexWriter();
}
静态void indexDocs(最终IndexWriter写入程序,路径路径)引发IOException{
if(Files.isDirectory(path)){
walkFileTree(路径,新的SimpleFileVisitor(){
@凌驾
公共文件VisitResult visitFile(路径文件,基本文件属性属性属性)引发IOException{
试一试{
indexDoc(writer、file、attrs.lastModifiedTime().toMillis());
}捕获(IOException忽略){
//不要索引无法读取的文件。
}
返回FileVisitResult.CONTINUE;
}
});
}否则{
indexDoc(writer、path、Files.getLastModifiedTime(path.toMillis());
}
}
静态void indexDoc(IndexWriter writer,路径文件,long lastModified)引发IOException{
try(InputStream=Files.newInputStream(file)){
单据单据=新单据();
Field pathField=new StringField(“路径”,file.toString(),Field.Store.NO);
文件添加(路径域);
新增单据(新增LongField(“已修改”,上次修改,Field.Store.NO));
添加文档(新文本字段(“内容”),
新的BufferedReader(新的InputStreamReader(stream,StandardCharsets.UTF_8));
if(writer.getConfig().getOpenMode()==OpenMode.CREATE){
System.out.println(“添加”+文件);
writer.addDocument(doc);
}否则{
System.out.println(“更新”+文件);
writer.updateDocument(新术语(“路径”,file.toString()),doc);
}
}
}
索引搜索代码:
/**
 * Runs a regular-expression query against the {@code contents} field of an index and
 * prints the matching documents via {@code doSearch}.
 *
 * <p>When {@code capability} is supplied, the sandbox {@code RegexQuery} is used with that
 * regex implementation; otherwise the core {@code RegexpQuery} is used.
 * NOTE(review): the two query classes are expected to use different regex dialects, and the
 * pattern is matched against individual indexed terms rather than the raw file text -
 * confirm against the Lucene documentation for the analyzer in use.
 *
 * @param index       file-system path of the index directory
 * @param queryString regular expression to search for; must not be {@code null}
 * @param capability  optional regex implementation for {@code RegexQuery}; may be {@code null}
 * @return the open reader used for the search; the caller is responsible for closing it
 * @throws IllegalArgumentException if {@code queryString} is {@code null}
 * @throws Exception if the index cannot be opened or the search fails
 */
public IndexReader searchExecutor(String index, String queryString, RegexCapabilities capability) throws Exception {
    if (queryString == null) {
        throw new IllegalArgumentException("queryString must not be null");
    }

    final String field = "contents";
    final boolean raw = false;
    final int hitsPerPage = Integer.MAX_VALUE;

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);

        // Two-argument Term: (field, text). The one-argument form would treat the regex
        // as a field name with empty text and therefore never match anything useful.
        Term regexTerm = new Term(field, queryString);

        Query query;
        if (capability != null) {
            RegexQuery legacyQuery = new RegexQuery(regexTerm);
            legacyQuery.setRegexImplementation(capability);
            query = legacyQuery;
        } else {
            query = new RegexpQuery(regexTerm);
        }

        System.out.println("Searching for: " + query.toString(field));
        doSearch(null, searcher, query, hitsPerPage, raw, false);
        return reader;
    } catch (Exception e) {
        // The reader is only handed to the caller on success; release it on failure.
        try {
            reader.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
/**
 * Executes {@code query} and prints the total hit count followed by one line per hit
 * (its stored {@code path}, plus the stored {@code title} when present).
 *
 * @param in          unused by this method
 * @param searcher    searcher to run the query on
 * @param query       query to execute
 * @param hitsPerPage maximum number of hits to print; up to five times as many are collected
 * @param raw         unused by this method
 * @param interactive unused by this method
 * @throws IOException if the search or a document lookup fails
 */
public static void doSearch(BufferedReader in, IndexSearcher searcher, Query query, int hitsPerPage, boolean raw,
        boolean interactive)
        throws IOException {
    // Multiply in long and saturate: 5 * hitsPerPage overflows int for large page sizes
    // (the caller in this file passes Integer.MAX_VALUE).
    int fetchLimit = (int) Math.min(5L * hitsPerPage, Integer.MAX_VALUE);
    TopDocs results = searcher.search(query, fetchLimit);
    ScoreDoc[] hits = results.scoreDocs;

    int numTotalHits = results.totalHits;
    System.out.println(numTotalHits + " total matching documents");

    // Never index past the hits that were actually collected.
    int end = Math.min(Math.min(numTotalHits, hitsPerPage), hits.length);
    for (int i = 0; i < end; i++) {
        Document doc = searcher.doc(hits[i].doc);
        String path = doc.get("path");
        if (path != null) {
            System.out.println((i + 1) + ". " + path);
            String title = doc.get("title");
            if (title != null) {
                System.out.println(" Title: " + title);
            }
        } else {
            // "path" is absent when the field was not stored at index time.
            System.out.println((i + 1) + ". " + "No path for this document");
        }
    }
}
public IndexReader searchExecutor(字符串索引、字符串查询字符串、RegexCapabilities功能)引发异常{
String field=“contents”;
字符串查询=null;
布尔原始=假;
int hitsPerPage=Integer.MAX_值;
IndexReader=DirectoryReader.open(FSDirectory.open(path.get(index));
IndexSearcher search=新的IndexSearcher(阅读器);
Analyzer Analyzer=新的StandardAnalyzer();
BufferedReader in=null;
查询q=新的RegexpQuery(新术语(“文本”,queryString));
q=q.重写(读卡器);
RegexQuery query=newregexquery(新术语(\\s*(FIND | FIND)”);
if(能力!=null)
query.setRegexImplementation(能力);
System.out.println(“搜索:”+query.toString(字段));
searcher.search(查询,空,1000);
doSearch(in、searcher、query、hitsPerPage、raw、querys==null&&queryString==null);
//reader.close();
返回读取器;
}
公共静态void doSearch(BufferedReader-in、indexsearch-search、Query-Query、int-hitsPerPage、boolean-raw、,
布尔(交互式)
抛出IOException{
TopDocs results=searcher.search(查询,5*hitsPerPage);
ScoreDoc[]点击次数=结果。scoreDocs;
//generateIndex.deleteDocuments(查询);
//generateIndex.getDirectory();
//TermsEnum.totalTermFreq();
int numTotalHits=results.totalHits;
System.out.println(numTotalHits+“总匹配文档”);
int start=0;
int end=Math.min(numTotalHits,hitsPerPage);
for(int i=start;i
请提供帮助。

您的问题是关于在 Lucene 中使用正则表达式进行搜索。您的代码中使用了 RegexQuery，请尝试改用 RegexpQuery。
\s*
开头,但不使用。大多数其他标记器将删除(也称为“拆分”)