Java Apache Spark request data enrichment
I am very new to Apache Spark. I would like some guidance on whether this is bad practice for an Apache Spark job. The goal is to make requests to an external REST API and join the responses in while processing the data. This needs to be able to handle thousands of requests. I am trying to issue asynchronous HTTP requests and return the HTTP responses as an RDD. Here is an example of what I am trying to do:
import java.io.Serializable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Future;

import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.concurrent.FutureCallback;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public final class AsyncSparkJob implements Serializable {
    // Java-friendly version of SparkContext.
    // Used to return JavaRDDs and works with Java Collections.
    // Kept static so it is not dragged into the serialized closure.
    private static JavaSparkContext sc;

    // AsyncSparkJob - constructor
    public AsyncSparkJob(JavaSparkContext sc) {
        // Initialize the Spark context.
        AsyncSparkJob.sc = sc;
    }

    // run - execute the Spark transformations and actions
    public void run(String filePath) {
        System.out.println("Starting spark job");
        JavaRDD<String> inputFile = sc.textFile(filePath);
        // Send a partition of HTTP requests to each executor.
        long results = inputFile.mapPartitions(new FlatMapFunction<Iterator<String>, HttpResponse>() {
            // call - FlatMapFunction call implementation
            public Iterator<HttpResponse> call(Iterator<String> stringIterator) throws Exception {
                RequestConfig requestConfig = RequestConfig.custom()
                        .setSocketTimeout(300000)
                        .setConnectTimeout(300000).build();
                CloseableHttpAsyncClient httpClient = HttpAsyncClients.custom()
                        .setDefaultRequestConfig(requestConfig).setMaxConnTotal(500).setMaxConnPerRoute(500)
                        .build();
                httpClient.start();
                List<HttpResponse> httpResponseList = new LinkedList<HttpResponse>();
                try {
                    List<Future<HttpResponse>> futureResponseList = new LinkedList<Future<HttpResponse>>();
                    // As long as we have values in the iterator, keep looping.
                    while (stringIterator.hasNext()) {
                        String uri = stringIterator.next();
                        HttpGet request = new HttpGet(uri);
                        Future<HttpResponse> futureResponse = httpClient.execute(request, new FutureCallback<HttpResponse>() {
                            public void completed(HttpResponse httpResponse) {
                                System.out.println("Completed request");
                            }

                            public void failed(Exception e) {
                                System.out.println("failed " + e);
                            }

                            public void cancelled() {
                                System.out.println("cancelled");
                            }
                        });
                        futureResponseList.add(futureResponse);
                    }
                    // Now that we have submitted all of the requests we can start
                    // looking through them and trying to read the responses.
                    for (Future<HttpResponse> futureResponse : futureResponseList) {
                        /* This will block. However, we have already submitted
                           all of our requests, so after the first block we should
                           expect to block less often when reading the remaining
                           "future" responses.
                        */
                        httpResponseList.add(futureResponse.get());
                    }
                } catch (Exception e) {
                    System.out.println("Caught " + e);
                } finally {
                    httpClient.close();
                }
                return httpResponseList.iterator();
            }
        }).count();
        System.out.println("Final result count : " + results);
    }

    public static void main(String[] args) {
        // Init the Spark context.
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("AsyncSparkJob"));
        // Create the Spark job.
        AsyncSparkJob asj = new AsyncSparkJob(sc);
        asj.run(args[0]);
        System.out.println("Done");
    }
}
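For the enrichment/join step itself (the example above only counts the responses), here is a rough sketch of what I have in mind, assuming Spark 2.x and the same Apache HttpAsyncClient: extract each response body as a String (since HttpResponse itself is not Serializable) and pair it with the originating URI, so the resulting pair RDD can be joined with other data downstream. The class name EnrichmentSketch is just for illustration; EntityUtils comes from org.apache.http.util.

import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Future;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.http.util.EntityUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import scala.Tuple2;

public final class EnrichmentSketch {
    // Turn an RDD of URIs into (uri, responseBody) pairs, using one async
    // client per partition, so the result can be joined with other data.
    public static JavaPairRDD<String, String> enrich(JavaRDD<String> uris) {
        return uris.mapPartitionsToPair(stringIterator -> {
            CloseableHttpAsyncClient httpClient = HttpAsyncClients.createDefault();
            httpClient.start();
            List<Tuple2<String, Future<HttpResponse>>> pending = new LinkedList<>();
            List<Tuple2<String, String>> results = new LinkedList<>();
            try {
                // Submit every request in the partition before blocking on any.
                while (stringIterator.hasNext()) {
                    String uri = stringIterator.next();
                    pending.add(new Tuple2<>(uri, httpClient.execute(new HttpGet(uri), null)));
                }
                // Drain the futures; keep only the body as a String because
                // HttpResponse is not Serializable.
                for (Tuple2<String, Future<HttpResponse>> p : pending) {
                    String body = EntityUtils.toString(p._2().get().getEntity());
                    results.add(new Tuple2<>(p._1(), body));
                }
            } finally {
                httpClient.close();
            }
            return results.iterator();
        });
    }
}

The idea would be to call something like enrich(sc.textFile(filePath)) and then join the resulting pair RDD against the rest of the data being processed.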
Is this a valid use case?