Java 当多个线程共享相同的spark上下文时,spark应用程序不会停止
我试图重现我所面临的问题。我的问题陈述——文件夹中存在多个文件,我需要对每个文件进行字数统计并打印结果,且每个文件都应该并行处理(当然,并行度是有限制的)。我已经编写了以下代码来完成它,运行正常。集群安装的是 MapR 的 Spark,且 spark.scheduler.mode=FIFO。问题1:是否有更好的方法来完成上述任务?问题2:我观察到即使已完成所有可用文件的字数统计,应用程序也不会停止,我想不出解决办法。
package groupId.artifactId;
导入java.util.ArrayList;
导入java.util.Iterator;
导入java.util.List;
导入java.util.concurrent.ExecutionException;
导入java.util.concurrent.ExecutorService;
导入java.util.concurrent.Executors;
导入java.util.concurrent.Future;
导入java.util.concurrent.TimeUnit;
导入org.apache.spark.SparkConf;
导入org.apache.spark.api.java.JavaSparkContext;
公共类执行者{
/**
*@param args
*/
公共静态void main(字符串[]args){
最终int threadPoolSize=5;
SparkConf SparkConf=new SparkConf().setMaster(“纱线客户端”).setAppName(“跟踪器”).set(“spark.ui.port”,“0”);
JavaSparkContext jsc=新的JavaSparkContext(sparkConf);
ExecutorService executor=Executors.newFixedThreadPool(threadPoolSize);
List listOfFuture=new ArrayList();
对于(int i=0;i<20;i++){
if(listOfFuture.size()
}
package groupId.artifactId;
导入java.io.Serializable;
导入java.util.array;
导入java.util.concurrent.Callable;
导入org.apache.spark.api.java.javapairdd;
导入org.apache.spark.api.java.JavaRDD;
导入org.apache.spark.api.java.JavaSparkContext;
导入org.apache.spark.api.java.function.FlatMapFunction;
导入org.apache.spark.api.java.function.Function2;
导入org.apache.spark.api.java.function.PairFunction;
导入scala.Tuple2;
公共类FlexiWordCount实现可调用、可序列化{
私有静态最终长serialVersionUID=1L;
私有JavaSparkContext jsc;
私有int文件ID;
公共FlexiWordCount(JavaSparkContext jsc,int fileId){
超级();
this.jsc=jsc;
this.fileId=fileId;
}
私有静态类缩减实现Function2{
@凌驾
公共整数调用(整数i1、整数i2){
返回i1+i2;
}
}
私有静态类KVPair实现PairFunction{
@凌驾
公共元组2调用(字符串参数)
抛出异常{
返回新的Tuple2(参数,1);
}
}
私有静态类平坦器实现FlatMapFunction{
@凌驾
公共Iterable调用(字符串s){
返回数组.asList(s.split(“”);
}
}
@凌驾
公共对象调用()引发异常{
javarddjrd=jsc.textFile(“/root/folder/experiment979/”+fileId+”.txt”);
System.out.println(“fileId=“+fileId”)的内部调用();
JavaRDD words=jrd.flatMap(新的Flatter());
javapairdd ones=words.mapToPair(新的KVPair());
javapairdd counts=one.reduceByKey(new reduce());
返回计数。collect();
}
}
}
为什么程序不能自动关闭?
回答:您尚未关闭 SparkContext(此外还需要先调用 executor.shutdown(),否则 awaitTermination 会一直等到超时),请尝试将 main 方法更改为:
public static void main(String[] args) {
final int threadPoolSize = 5;
SparkConf sparkConf = new SparkConf().setMaster("yarn-client").setAppName("Tracker").set("spark.ui.port","0");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
ExecutorService executor = Executors.newFixedThreadPool(threadPoolSize);
List<Future> listOfFuture = new ArrayList<Future>();
for (int i = 0; i < 20; i++) {
if (listOfFuture.size() < threadPoolSize) {
FlexiWordCount flexiWordCount = new FlexiWordCount(jsc, i);
Future future = executor.submit(flexiWordCount);
listOfFuture.add(future);
} else {
boolean allFutureDone = false;
while (!allFutureDone) {
allFutureDone = checkForAllFuture(listOfFuture);
System.out.println("Threads not completed yet!");
try {
Thread.sleep(2000);//waiting for 2 sec, before next check
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
printFutureResult(listOfFuture);
System.out.println("printing of future done");
listOfFuture.clear();
System.out.println("future list got cleared");
}
}
try {
executor.awaitTermination(5, TimeUnit.MINUTES);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
jsc.stop()
}
publicstaticvoidmain(字符串[]args){
最终int threadPoolSize=5;
SparkConf SparkConf=新火花
package groupId.artifactId;
import java.io.Serializable;
import java.util.Arrays;
import java.util.concurrent.Callable;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Word-count task for one input file, designed to be submitted to an
 * ExecutorService. Every instance shares the single driver-side
 * JavaSparkContext and reads "/root/folder/experiment979/&lt;fileId&gt;.txt".
 */
public class FlexiWordCount implements Callable<Object>,Serializable {
    private static final long serialVersionUID = 1L;

    // NOTE(review): JavaSparkContext itself is not serializable, but it stays on
    // the driver — only the static function classes below are shipped to executors.
    private JavaSparkContext jsc;
    private int fileId;

    public FlexiWordCount(JavaSparkContext jsc, int fileId) {
        super();
        this.jsc = jsc;
        this.fileId = fileId;
    }

    /** Combines two partial counts of the same word. */
    private static class Reduction implements Function2<Integer, Integer, Integer>{
        @Override
        public Integer call(Integer left, Integer right) {
            return left + right;
        }
    }

    /** Maps each word to a (word, 1) pair for counting. */
    private static class KVPair implements PairFunction<String, String, Integer>{
        @Override
        public Tuple2<String, Integer> call(String word)
                throws Exception {
            return new Tuple2<String, Integer>(word, 1);
        }
    }

    /** Splits a line into whitespace-separated tokens. */
    private static class Flatter implements FlatMapFunction<String, String>{
        @Override
        public Iterable<String> call(String line) {
            return Arrays.asList(line.split(" "));
        }
    }

    /**
     * Runs the word count for this instance's file.
     *
     * @return the collected list of (word, count) tuples
     * @throws Exception if the Spark job fails
     */
    @Override
    public Object call() throws Exception {
        String path = "/root/folder/experiment979/" + fileId +".txt";
        JavaRDD<String> lines = jsc.textFile(path);
        System.out.println("inside call() for fileId = " + fileId);
        JavaPairRDD<String, Integer> pairs = lines.flatMap(new Flatter()).mapToPair(new KVPair());
        return pairs.reduceByKey(new Reduction()).collect();
    }
}
}
public static void main(String[] args) {
final int threadPoolSize = 5;
SparkConf sparkConf = new SparkConf().setMaster("yarn-client").setAppName("Tracker").set("spark.ui.port","0");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
ExecutorService executor = Executors.newFixedThreadPool(threadPoolSize);
List<Future> listOfFuture = new ArrayList<Future>();
for (int i = 0; i < 20; i++) {
if (listOfFuture.size() < threadPoolSize) {
FlexiWordCount flexiWordCount = new FlexiWordCount(jsc, i);
Future future = executor.submit(flexiWordCount);
listOfFuture.add(future);
} else {
boolean allFutureDone = false;
while (!allFutureDone) {
allFutureDone = checkForAllFuture(listOfFuture);
System.out.println("Threads not completed yet!");
try {
Thread.sleep(2000);//waiting for 2 sec, before next check
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
printFutureResult(listOfFuture);
System.out.println("printing of future done");
listOfFuture.clear();
System.out.println("future list got cleared");
}
}
try {
executor.awaitTermination(5, TimeUnit.MINUTES);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
jsc.stop()
}