Java Apache Spark request data enrichment

I'm fairly new to Apache Spark. I'd like some guidance on whether this is bad practice for an Apache Spark job.

The goal is to make requests to an external REST API and join the responses in while processing the data. This needs to be able to handle thousands of requests. I'm trying to issue asynchronous HTTP requests and return the HTTP responses as an RDD.

Here is an example of what I'm trying to do:

import java.io.Serializable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Future;

import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.concurrent.FutureCallback;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public final class AsyncSparkJob implements Serializable {

// Java-friendly version of SparkContext
// Used to return JavaRDDs and works with Java Collections.
private static JavaSparkContext sc;
// AsyncSparkJob - constructor
public AsyncSparkJob(JavaSparkContext sc) {
    // initialize the spark context
    this.sc = sc;
}

// run - execute the spark transformations and actions
public void run(String filePath) {
    System.out.println("Starting spark job");
    JavaRDD<String> inputFile = this.sc.textFile(filePath);
    // Send a partition of http requests to each executor
    Long results = inputFile.mapPartitions(new FlatMapFunction<Iterator<String>, HttpResponse>(){
        // call - FlatMapFunction call implementation
        public Iterator<HttpResponse> call(Iterator<String> stringIterator) throws Exception {
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(300000)
                    .setConnectTimeout(300000).build();

            CloseableHttpAsyncClient httpClient = HttpAsyncClients.custom()
                    .setDefaultRequestConfig(requestConfig).setMaxConnTotal(500).setMaxConnPerRoute(500)
                    .build();
            httpClient.start();
            List<HttpResponse> httpResponseList = new LinkedList<HttpResponse>();
            try {
                List<Future<HttpResponse>> futureResponseList = new LinkedList<Future<HttpResponse>>();
                // As long as we have values in the Iterator keep looping
                while (stringIterator.hasNext()) {
                    String uri = stringIterator.next();
                    HttpGet request = new HttpGet(uri);
                    Future<HttpResponse> futureResponse = httpClient.execute(request, new FutureCallback<HttpResponse>() {
                        public void completed(HttpResponse httpResponse) {
                            System.out.println("Completed request");
                        }

                        public void failed(Exception e) {
                            System.out.println("failed" + e);
                        }

                        public void cancelled() {
                            System.out.println("cancelled");
                        }
                    });
                    futureResponseList.add(futureResponse);
                }
                // Now that we have submitted all of the requests we can start
                // looking through them and trying to read the responses.
                for (Future<HttpResponse> futureResponse : futureResponseList) {
                /* get() will block. However, we have already submitted all of our
                requests, so after the first blocking call the remaining "future"
                responses should block far less often. */
                    httpResponseList.add(futureResponse.get());
                }
            } catch ( Exception e ) {
                System.out.println("Caught " + e);
            }finally {
                httpClient.close();
            }
            return httpResponseList.iterator();
        }
    }).count();
    System.out.println("Final result count : " + results);
}

public static void main( String[] args ) {
    // Init the spark context
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("AsyncSparkJob"));
    // Create the spark job
    AsyncSparkJob asj = new AsyncSparkJob(sc);
    asj.run(args[0]);
    System.out.println("Done");
}
}
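
Since the stated goal is to enrich the input data with the responses rather than just count them, here is a minimal sketch (not part of the original code) of how the same per-partition pattern could return (uri, statusCode) pairs that can later be joined back against other RDDs keyed by URI. It assumes the same Apache HttpAsyncClient 4.x library and Spark's Java 8 lambda API; the class and method names (EnrichWithStatus, enrich) are made up for illustration, and it uses a default async client instead of the custom RequestConfig above.

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Future;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import scala.Tuple2;

public final class EnrichWithStatus {

    // enrich - pair every input URI with the HTTP status code of its response,
    // so the result can be joined back against other RDDs keyed by URI.
    public static JavaPairRDD<String, Integer> enrich(JavaRDD<String> uris) {
        return uris.mapPartitionsToPair(uriIterator -> {
            CloseableHttpAsyncClient httpClient = HttpAsyncClients.createDefault();
            httpClient.start();
            List<Tuple2<String, Integer>> results = new ArrayList<>();
            try {
                // Submit every request in this partition first ...
                List<Tuple2<String, Future<HttpResponse>>> pending = new LinkedList<>();
                while (uriIterator.hasNext()) {
                    String uri = uriIterator.next();
                    pending.add(new Tuple2<>(uri, httpClient.execute(new HttpGet(uri), null)));
                }
                // ... then block once per request to collect the status codes.
                for (Tuple2<String, Future<HttpResponse>> entry : pending) {
                    results.add(new Tuple2<>(entry._1(), entry._2().get().getStatusLine().getStatusCode()));
                }
            } finally {
                httpClient.close();
            }
            return results.iterator();
        });
    }
}

Extracting only the status code (or a body string) also keeps the RDD elements serializable, which raw HttpResponse objects are not; that matters if the enriched RDD is later shuffled, cached or collected.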

Is this a valid use case?