Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/java/306.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/blackberry/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
ElasticSearch-Java API索引100K+;使用producer&;消费者_Java_<img Src="//i.stack.imgur.com/RUiNP.png" Height="16" Width="18" Alt="" Class="sponsor Tag Img">elasticsearch_Elastic Stack - Fatal编程技术网 elasticsearch,elastic-stack,Java,elasticsearch,Elastic Stack" /> elasticsearch,elastic-stack,Java,elasticsearch,Elastic Stack" />

ElasticSearch - 使用Java API索引100K+个PDF;使用生产者与消费者(producer & consumer)

ElasticSearch-Java API索引100K+;使用producer&;消费者,java,elasticsearch,elastic-stack,Java,elasticsearch,Elastic Stack,我使用JavaAPI索引pdf。我已经安装了ingest attachement处理器插件,并从我的java代码中,将PDF转换为base64和索引编码格式的PDF 实际上,我的机器d:\驱动器中有PDF。文件路径在名为documents\u local的ElasticSearch索引中可用。因此,我从本地索引的文档中获取所有记录并获取文件路径。然后,我读取pdf文件并将其编码为base64。然后索引它们 对于这个过程,我使用ScrollRequestAPI从索引中获取文件路径,因为我有超过10

我使用Java API索引PDF。我已经安装了ingest attachment处理器插件,并在我的Java代码中将PDF转换为base64,然后索引编码后的PDF。

实际上,PDF存放在我机器的D:\盘中。文件路径保存在名为documents_local的ElasticSearch索引中。因此,我从documents_local索引中获取所有记录并取得文件路径。然后,我读取PDF文件并将其编码为base64,再对它们进行索引。

对于这个过程,我使用Scroll Request API从索引中获取文件路径,因为我有超过100k的文档。使用下面的Java代码索引20000个PDF需要8小时。

所以,我尝试将这个索引过程分开

我创建了3个类

Controller.java
Producer.java
Consumer.java
在Controller.java类中,我从索引中读取所有文件路径,将它们存储到ArrayList中,然后传递给Producer类。

在Producer.java类中,我使用文件路径读取PDF,将其转换为base64并推入队列。

我将在Consumer.java类中读取队列中由Producer.java类发布的所有消息。

我的想法是,在Consumer.java类中为编码后的文件编制索引。(尚未实现,我不确定如何实现)

请在下面找到我的java代码

Controller.java

/**
 * Entry point of the indexing job.
 *
 * <p>Collects file paths from the {@code documents_local} index through the
 * scroll API, then starts one {@link Producer} thread per path and the same
 * number of {@link Consumer} threads, all sharing one bounded queue, and waits
 * for every thread to finish.
 */
public class Controller {

    /** Capacity of the bounded hand-off queue shared by producers and consumers. */
    private static final int QUEUE_SIZE = 2;
    private static BlockingQueue<String> queue;
    private static Collection<Thread> producerThreadCollection, allThreadCollection;

    private final static String INDEX = "documents_local";
    private final static String ATTACHMENT = "document_suggestion";
    private final static String TYPE = "doc";
    // Name the logger after this class. The previous lookup through
    // getStackTrace()[0] typically resolves to the Thread.getStackTrace frame
    // itself, i.e. "java.lang.Thread", not Controller.
    private static final Logger logger = Logger.getLogger(Controller.class.getName());

    /**
     * Reads every hit of the source index page by page, gathers the file
     * paths, then runs the producer/consumer threads to completion.
     *
     * @param args unused
     * @throws IOException if a search or scroll request fails
     */
    public static void main(String[] args) throws IOException {

        Document doc = new Document();
        List<String> filePathList = new ArrayList<String>();

        producerThreadCollection = new ArrayList<Thread>();
        allThreadCollection = new ArrayList<Thread>();
        queue = new LinkedBlockingDeque<String>(QUEUE_SIZE);

        SearchRequest searchRequest = new SearchRequest(INDEX);
        searchRequest.types(TYPE);
        final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(60L)); // keep-alive of the scroll context
        searchRequest.scroll(scroll);

        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        QueryBuilder qb = QueryBuilders.matchAllQuery();
        searchSourceBuilder.query(qb);
        searchRequest.source(searchSourceBuilder);

        SearchResponse searchResponse = SearchEngineClient.getInstance3().search(searchRequest);
        String scrollId = searchResponse.getScrollId();
        SearchHit[] searchHits = searchResponse.getHits().getHits();
        long totalHits = searchResponse.getHits().totalHits;
        logger.info("Total Hits --->"+totalHits);

        // Handle the page we already hold BEFORE asking for the next one;
        // fetching first (as before) silently dropped the whole first page.
        while (searchHits != null && searchHits.length > 0) {

            for (SearchHit hit : searchHits) {
                Map<String, Object> sourceAsMap = hit.getSourceAsMap();

                // A hit without a source carries no data for this document;
                // skipping it avoids re-adding the previous document's path.
                if (sourceAsMap == null) {
                    continue;
                }

                doc.setId((int) sourceAsMap.get("id"));
                doc.setApp_language(String.valueOf(sourceAsMap.get("app_language")));

                // NOTE(review): nothing visible here populates the path or
                // filename of doc from the hit -- confirm Document is filled
                // elsewhere, otherwise getPath()/getFilename() will be null.
                filePathList.add(doc.getPath().concat(doc.getFilename()));
            }

            SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId);
            scrollRequest.scroll(scroll);
            searchResponse = SearchEngineClient.getInstance3().searchScroll(scrollRequest);
            scrollId = searchResponse.getScrollId();
            searchHits = searchResponse.getHits().getHits();
        }

        createAndStartProducers(filePathList);
        createAndStartConsumers(filePathList);

        for (Thread t : allThreadCollection) {
            try {
                t.join();
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller and stop waiting;
                // every further join() would throw immediately anyway.
                Thread.currentThread().interrupt();
                break;
            }
        }

        System.out.println("Controller finished");
    }

    /**
     * Starts one producer thread per file path and records the threads.
     *
     * <p>NOTE(review): this creates as many threads as there are paths, which
     * is heavy for a 100k+ path list -- consider a fixed-size pool.
     *
     * @param filePathList paths of the files to encode
     */
    private static void createAndStartProducers(List<String> filePathList){
        // Indices run 0..size-1; the former 1..size range skipped the first
        // path and threw IndexOutOfBoundsException on the last iteration.
        for (int i = 0; i < filePathList.size(); i++) {
            Producer producer = new Producer(Paths.get(filePathList.get(i)), queue);
            Thread producerThread = new Thread(producer, "producer-"+i);
            producerThreadCollection.add(producerThread);
            producerThread.start();
        }
        allThreadCollection.addAll(producerThreadCollection);
    }

    /**
     * Starts one consumer thread per file path and records the threads.
     *
     * @param filePathList only its size is used, to decide how many consumers to start
     */
    private static void createAndStartConsumers(List<String> filePathList){
        for (int i = 0; i < filePathList.size(); i++) {
            Thread consumerThread = new Thread(new Consumer(queue), "consumer-"+i);
            allThreadCollection.add(consumerThread);
            consumerThread.start();
        }
    }

    /**
     * Reports whether at least one producer thread is still running.
     *
     * @return {@code true} if any recorded producer thread is alive
     */
    public static boolean isProducerAlive(){
        for (Thread t : producerThreadCollection) {
            if (t.isAlive()) {
                return true;
            }
        }
        return false;
    }

}

不是一个直接的答案,但你看过FSCrawler吗?
@dadoonet - 是的,我看过,但它对我来说非常复杂。
但是你试过直接使用它吗?
public class Producer implements Runnable {

     private Path fileToRead;
        private BlockingQueue<String> queue;
        File file=null;

        public Producer(Path filePath, BlockingQueue<String> q){
            fileToRead = filePath;
            queue = q;
        }

        public void run() {

                String encodedfile = null;

                BufferedReader reader = null;
                try {
                    reader = Files.newBufferedReader(fileToRead);
                } catch (IOException e1) {
                    // TODO Auto-generated catch block
                    e1.printStackTrace();
                }

                File file=new File(reader.toString());

                if(file.exists() && !file.isDirectory()) {

                    try {

                        FileInputStream fileInputStreamReader = new FileInputStream(file);
                        byte[] bytes = new byte[(int) file.length()];
                        fileInputStreamReader.read(bytes);
                        encodedfile = new String(Base64.getEncoder().encodeToString(bytes));
                        fileInputStreamReader.close();
                        System.out.println(Thread.currentThread().getName()+" finished");
                    } catch (IOException e) {
                        e.printStackTrace();
                    }

                }
                else
                {
                    System.out.println("File not exists");

                }
        }


}
/**
 * Drains messages from the shared queue until the queue is empty and no
 * producer thread is left to refill it.
 */
public class Consumer implements Runnable {

    private BlockingQueue<String> queue;
    File file=null;

    /**
     * @param q queue to read messages from
     */
    public Consumer(BlockingQueue<String> q){
        queue = q;
    }

    /**
     * Processes queued messages, returning once an empty poll follows a
     * moment at which all producers had already finished.
     */
    public void run(){
        try {
            while (true) {
                // Sample producer liveness BEFORE polling: if no producer was
                // alive at this point and the poll below still comes back
                // empty, nothing more can arrive. Checking after the poll (as
                // before) could miss an item published in between.
                boolean producersAlive = Controller.isProducerAlive();

                // A timed poll parks the thread instead of spinning a core
                // the way a bare poll() in a tight loop does.
                String line = queue.poll(100L, java.util.concurrent.TimeUnit.MILLISECONDS);

                if (line != null) {
                    System.out.println(Thread.currentThread().getName()+" processing line: "+line);
                    //Do something with the line here like see if it contains a string
                    continue;
                }

                if (!producersAlive) {
                    return;
                }
            }
        } catch (InterruptedException e) {
            // Keep the interrupt flag set and stop consuming.
            Thread.currentThread().interrupt();
        }
    }

}
// Build the source of the document to index: its id, its language and the
// file content.
// NOTE(review): jsonMap, doc and result are declared outside this fragment;
// result is presumably the Base64-encoded file content -- confirm.
jsonMap = new HashMap<>();
jsonMap.put("id", doc.getId());
jsonMap.put("app_language", doc.getApp_language());
jsonMap.put("fileContent", result);

// The document id is passed to the index request as a string.
String id=Long.toString(doc.getId());

// Target the index named by ATTACHMENT with type "doc", and run the document
// through the ingest pipeline of the same name.
IndexRequest request = new IndexRequest(ATTACHMENT, "doc", id )
        .source(jsonMap)
        .setPipeline(ATTACHMENT);