Java Apache Spark request data enrichment

I'm fairly new to Apache Spark. I'd like some guidance on whether this is bad practice for an Apache Spark job.

The goal is to make requests to an external REST API and join the responses in while processing the data. This needs to be able to handle thousands of requests. I'm trying to issue asynchronous HTTP requests and return the HTTP responses as an RDD.

Here is an example of what I'm trying to do:

import java.io.Serializable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Future;

import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.concurrent.FutureCallback;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public final class AsyncSparkJob implements Serializable {

// Java-friendly version of SparkContext
// Used to return JavaRDDs and works with Java Collections.
private static JavaSparkContext sc;
// AsyncSparkJob - constructor
public AsyncSparkJob(JavaSparkContext sc) {
    // initialize the spark context
    this.sc = sc;
}

// run - execute the spark transformations and actions
public void run(String filePath) {
    System.out.println("Starting spark job");
    JavaRDD<String> inputFile = this.sc.textFile(filePath);
    // Send a partition of http requests to each executor
    Long results = inputFile.mapPartitions(new FlatMapFunction<Iterator<String>, HttpResponse>(){
        // call - FlatMapFunction call implementation
        public Iterator<HttpResponse> call(Iterator<String> stringIterator) throws Exception {
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(300000)
                    .setConnectTimeout(300000).build();

            CloseableHttpAsyncClient httpClient = HttpAsyncClients.custom()
                    .setDefaultRequestConfig(requestConfig).setMaxConnTotal(500).setMaxConnPerRoute(500)
                    .build();
            httpClient.start();
            List<HttpResponse> httpResponseList = new LinkedList<HttpResponse>();
            try {
                List<Future<HttpResponse>> futureResponseList = new LinkedList<Future<HttpResponse>>();
                // As long as we have values in the Iterator keep looping
                while (stringIterator.hasNext()) {
                    String uri = stringIterator.next();
                    HttpGet request = new HttpGet(uri);
                    Future<HttpResponse> futureResponse = httpClient.execute(request, new FutureCallback<HttpResponse>() {
                        public void completed(HttpResponse httpResponse) {
                            System.out.println("Completed request");
                        }

                        public void failed(Exception e) {
                            System.out.println("failed" + e);
                        }

                        public void cancelled() {
                            System.out.println("cancelled");
                        }
                    });
                    futureResponseList.add(futureResponse);
                }
                // Now that we have submitted all of the requests we can start
                // looking through them and trying to read the responses.
                for (Future<HttpResponse> futureResponse : futureResponseList) {
                /* get() will block. However, we have already submitted all of our
                requests, so after the first blocking call the remaining "future"
                responses should block far less often. */
                    httpResponseList.add(futureResponse.get());
                }
            } catch ( Exception e ) {
                System.out.println("Caught " + e);
            }finally {
                httpClient.close();
            }
            return httpResponseList.iterator();
        }
    }).count();
    System.out.println("Final result count : " + results);
}

public static void main( String[] args ) {
    // Init the spark context
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("AsyncSparkJob"));
    // Create the spark job
    AsyncSparkJob asj = new AsyncSparkJob(sc);
    asj.run(args[0]);
    System.out.println("Done");
}
}
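
Since the stated goal is to enrich the input data with the responses rather than just count them, here is a minimal sketch (not part of the original code) of how the same per-partition pattern could return (uri, statusCode) pairs that can later be joined back against other RDDs keyed by URI. It assumes the same Apache HttpAsyncClient 4.x library and Spark's Java 8 lambda API; the class and method names (EnrichWithStatus, enrich) are made up for illustration, and it uses a default async client instead of the custom RequestConfig above.

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Future;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import scala.Tuple2;

public final class EnrichWithStatus {

    // enrich - pair every input URI with the HTTP status code of its response,
    // so the result can be joined back against other RDDs keyed by URI.
    public static JavaPairRDD<String, Integer> enrich(JavaRDD<String> uris) {
        return uris.mapPartitionsToPair(uriIterator -> {
            CloseableHttpAsyncClient httpClient = HttpAsyncClients.createDefault();
            httpClient.start();
            List<Tuple2<String, Integer>> results = new ArrayList<>();
            try {
                // Submit every request in this partition first ...
                List<Tuple2<String, Future<HttpResponse>>> pending = new LinkedList<>();
                while (uriIterator.hasNext()) {
                    String uri = uriIterator.next();
                    pending.add(new Tuple2<>(uri, httpClient.execute(new HttpGet(uri), null)));
                }
                // ... then block once per request to collect the status codes.
                for (Tuple2<String, Future<HttpResponse>> entry : pending) {
                    results.add(new Tuple2<>(entry._1(), entry._2().get().getStatusLine().getStatusCode()));
                }
            } finally {
                httpClient.close();
            }
            return results.iterator();
        });
    }
}

Extracting only the status code (or a body string) also keeps the RDD elements serializable, which raw HttpResponse objects are not; that matters if the enriched RDD is later shuffled, cached or collected.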

Is this a valid use case?