ElasticSearch-Java API索引100K+;使用producer&;消费者
我使用JavaAPI索引pdf。我已经安装了ingest attachement处理器插件,并从我的java代码中,将PDF转换为base64和索引编码格式的PDF 实际上,我的机器d:\驱动器中有PDF。文件路径在名为documents\u local的ElasticSearch索引中可用。因此,我从本地索引的文档中获取所有记录并获取文件路径。然后,我读取pdf文件并将其编码为base64。然后索引它们 对于这个过程,我使用ScrollRequestAPI从索引中获取文件路径,因为我有超过100k的文档。因此,使用下面的java代码索引20000个PDF需要8小时的时间 所以,我尝试将这个索引过程分开 我创建了3个类ElasticSearch-Java API索引100K+;使用producer&;消费者,java,elasticsearch,elastic-stack,Java,elasticsearch,Elastic Stack,我使用JavaAPI索引pdf。我已经安装了ingest attachement处理器插件,并从我的java代码中,将PDF转换为base64和索引编码格式的PDF 实际上,我的机器d:\驱动器中有PDF。文件路径在名为documents\u local的ElasticSearch索引中可用。因此,我从本地索引的文档中获取所有记录并获取文件路径。然后,我读取pdf文件并将其编码为base64。然后索引它们 对于这个过程,我使用ScrollRequestAPI从索引中获取文件路径,因为我有超过10
Controller.java
Producer.java
Consumer.java
从Controller.java
class中,我从索引中读取所有文件路径,并将所有文件路径存储到ArrayList中,然后传递给Producer
class
从Producer.java
类中,我使用文件路径读取PDF,并将其转换为base64并推入队列
我将从Consumer.java
class读取队列中由producer.java
class发布的所有消息
我的想法是,我想为Consumer.java
类中的编码文件编制索引。(尚未实施,我不确定如何实施)
请在下面找到我的java代码
Controller.java
public class Controller {
private static final int QUEUE_SIZE = 2;
private static BlockingQueue<String> queue;
private static Collection<Thread> producerThreadCollection, allThreadCollection;
private final static String INDEX = "documents_local";
private final static String ATTACHMENT = "document_suggestion";
private final static String TYPE = "doc";
private static final Logger logger = Logger.getLogger(Thread.currentThread().getStackTrace()[0].getClassName());
public static void main(String[] args) throws IOException {
RestHighLevelClient restHighLevelClient = null;
Document doc=new Document();
List<String> filePathList = new ArrayList<String>();
producerThreadCollection = new ArrayList<Thread>();
allThreadCollection = new ArrayList<Thread>();
queue = new LinkedBlockingDeque<String>(QUEUE_SIZE);
SearchRequest searchRequest = new SearchRequest(INDEX);
searchRequest.types(TYPE);
final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(60L)); //part of Scroll API
searchRequest.scroll(scroll); //part of Scroll API
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
QueryBuilder qb = QueryBuilders.matchAllQuery();
searchSourceBuilder.query(qb);
searchRequest.source(searchSourceBuilder);
SearchResponse searchResponse = SearchEngineClient.getInstance3().search(searchRequest);
String scrollId = searchResponse.getScrollId(); //part of Scroll API
SearchHit[] searchHits = searchResponse.getHits().getHits();
long totalHits=searchResponse.getHits().totalHits;
logger.info("Total Hits --->"+totalHits);
//part of Scroll API -- Starts
while (searchHits != null && searchHits.length > 0) {
SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId);
scrollRequest.scroll(scroll);
searchResponse = SearchEngineClient.getInstance3().searchScroll(scrollRequest);
scrollId = searchResponse.getScrollId();
searchHits = searchResponse.getHits().getHits();
for (SearchHit hit : searchHits) {
Map<String, Object> sourceAsMap = hit.getSourceAsMap();
if(sourceAsMap != null) {
doc.setId((int) sourceAsMap.get("id"));
doc.setApp_language(String.valueOf(sourceAsMap.get("app_language")));
}
filePathList.add(doc.getPath().concat(doc.getFilename()));
}
}
createAndStartProducers(filePathList);
createAndStartConsumers(filePathList);
for(Thread t: allThreadCollection){
try {
t.join();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
System.out.println("Controller finished");
}
private static void createAndStartProducers(List<String> filePathList){
for(int i = 1; i <= filePathList.size(); i++){
Producer producer = new Producer(Paths.get(filePathList.get(i)), queue);
Thread producerThread = new Thread(producer,"producer-"+i);
producerThreadCollection.add(producerThread);
producerThread.start();
}
allThreadCollection.addAll(producerThreadCollection);
}
private static void createAndStartConsumers(List<String> filePathList){
for(int i = 0; i < filePathList.size(); i++){
Thread consumerThread = new Thread(new Consumer(queue), "consumer-"+i);
allThreadCollection.add(consumerThread);
consumerThread.start();
}
}
public static boolean isProducerAlive(){
for(Thread t: producerThreadCollection){
if(t.isAlive())
return true;
}
return false;
}
}
不是一个直接的答案,但你看过FSCrawler吗@dadoonet-是的,我看了一眼,但它对我来说非常复杂。但是你试着只用它吗?
public class Producer implements Runnable {
private Path fileToRead;
private BlockingQueue<String> queue;
File file=null;
public Producer(Path filePath, BlockingQueue<String> q){
fileToRead = filePath;
queue = q;
}
public void run() {
String encodedfile = null;
BufferedReader reader = null;
try {
reader = Files.newBufferedReader(fileToRead);
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
File file=new File(reader.toString());
if(file.exists() && !file.isDirectory()) {
try {
FileInputStream fileInputStreamReader = new FileInputStream(file);
byte[] bytes = new byte[(int) file.length()];
fileInputStreamReader.read(bytes);
encodedfile = new String(Base64.getEncoder().encodeToString(bytes));
fileInputStreamReader.close();
System.out.println(Thread.currentThread().getName()+" finished");
} catch (IOException e) {
e.printStackTrace();
}
}
else
{
System.out.println("File not exists");
}
}
}
public class Consumer implements Runnable {
private BlockingQueue<String> queue;
File file=null;
public Consumer(BlockingQueue<String> q){
queue = q;
}
public void run(){
while(true){
String line = queue.poll();
if(line == null && !Controller.isProducerAlive())
return;
if(line != null){
System.out.println(Thread.currentThread().getName()+" processing line: "+line);
//Do something with the line here like see if it contains a string
}
}
}
}
jsonMap = new HashMap<>();
jsonMap.put("id", doc.getId());
jsonMap.put("app_language", doc.getApp_language());
jsonMap.put("fileContent", result);
String id=Long.toString(doc.getId());
IndexRequest request = new IndexRequest(ATTACHMENT, "doc", id )
.source(jsonMap)
.setPipeline(ATTACHMENT);