Apache Spark is not distributing the load evenly to tasks
The last stage is very slow. I suspect the problem is that the records are not distributed evenly across the partitions and tasks. Is there any way to force a more even distribution?

public static JavaRDD<String> getJsonUserIdVideoIdRDD(JavaRDD<DmRating> cachedRating,
                                                      JavaPairRDD<Integer, Integer> userIdClusterId,
                                                      int numPartitions, String outDir) {
    /*
     * Convert the JavaRDD into a JavaPairRDD keyed by user id
     */
    JavaPairRDD<Integer, DmRating> userIdDmRating = cachedRating.mapToPair(new PairFunction<DmRating, Integer, DmRating>() {
        public Tuple2<Integer, DmRating> call(DmRating dmRating) throws Exception {
            return new Tuple2<Integer, DmRating>(dmRating.user(), dmRating);
        }
    });
    /*
     * Join this RDD with the userIdClusterId RDD by key
     */
    JavaPairRDD<Integer, Tuple2<Integer, DmRating>> userId_T_clusterIdDmRating = userIdClusterId.join(userIdDmRating, numPartitions);
    // Extract the clusterId-to-videoId mapping
    JavaPairRDD<Integer, Integer> clusterIdVideoId = userId_T_clusterIdDmRating.mapToPair(new PairFunction<Tuple2<Integer, Tuple2<Integer, DmRating>>, Integer, Integer>() {
        public Tuple2<Integer, Integer> call(Tuple2<Integer, Tuple2<Integer, DmRating>> userIdDmRatingClusterId) throws Exception {
            Integer userId = userIdDmRatingClusterId._1();
            Tuple2<Integer, DmRating> dmRatingClusterId = userIdDmRatingClusterId._2();
            return new Tuple2<Integer, Integer>(dmRatingClusterId._1(), dmRatingClusterId._2().product());
        }
    });
    //////
    /// Compute the popularity of a video within a cluster
    JavaPairRDD<String, Integer> clusterIdVideoIdStrInt = clusterIdVideoId.mapToPair(new PairFunction<Tuple2<Integer, Integer>, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(Tuple2<Integer, Integer> videoIdClusterId) throws Exception {
            return new Tuple2<String, Integer>(String.format("%d:%d", videoIdClusterId._1(), videoIdClusterId._2()), 1);
        }
    });
    JavaPairRDD<String, Integer> clusterIdVideoIdStrCount = clusterIdVideoIdStrInt.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    ///
    JavaPairRDD<Integer, Tuple2<Integer, Integer>> clusterId_T_videoIdCount = clusterIdVideoIdStrCount.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, Tuple2<Integer, Integer>>() {
        @Override
        public Tuple2<Integer, Tuple2<Integer, Integer>> call(Tuple2<String, Integer> clusterIdVideoIdStrCount) throws Exception {
            String[] splits = clusterIdVideoIdStrCount._1().split(":");
            try {
                if (splits.length == 2) {
                    int clusterId = Integer.parseInt(splits[0]);
                    int videoId = Integer.parseInt(splits[1]);
                    return new Tuple2<Integer, Tuple2<Integer, Integer>>(clusterId, new Tuple2<Integer, Integer>(videoId, clusterIdVideoIdStrCount._2()));
                } else {
                    // should never happen
                    LOGGER.error("Could not split {} in two using : as the separator!", clusterIdVideoIdStrCount._1());
                }
            } catch (NumberFormatException ex) {
                LOGGER.error(ex.getMessage());
            }
            return new Tuple2<Integer, Tuple2<Integer, Integer>>(-1, new Tuple2<Integer, Integer>(-1, -1));
        }
    });
    JavaPairRDD<Integer, Iterable<Tuple2<Integer, Integer>>> clusterIdVideoIdGrouped = clusterId_T_videoIdCount.groupByKey();
    JavaPairRDD<Integer, DmRating> clusterIdDmRating = userId_T_clusterIdDmRating.mapToPair(new PairFunction<Tuple2<Integer, Tuple2<Integer, DmRating>>, Integer, DmRating>() {
        @Override
        public Tuple2<Integer, DmRating> call(Tuple2<Integer, Tuple2<Integer, DmRating>> userId_T_clusterIdDmRating) throws Exception {
            return userId_T_clusterIdDmRating._2();
        }
    });
    JavaPairRDD<Integer, Tuple2<DmRating, Iterable<Tuple2<Integer, Integer>>>> clusterId_T_DmRatingVideoIds = clusterIdDmRating.join(clusterIdVideoIdGrouped, numPartitions);
    JavaPairRDD<Integer, String> userIdStringRDD = clusterId_T_DmRatingVideoIds.mapToPair(new PairFunction<Tuple2<Integer, Tuple2<DmRating, Iterable<Tuple2<Integer, Integer>>>>, Integer, String>() {
        @Override
        public Tuple2<Integer, String> call(Tuple2<Integer, Tuple2<DmRating, Iterable<Tuple2<Integer, Integer>>>> v1) throws Exception {
            int clusterId = v1._1();
            Tuple2<DmRating, Iterable<Tuple2<Integer, Integer>>> tuple = v1._2();
            DmRating rating = tuple._1();
            Iterable<Tuple2<Integer, Integer>> videosCounts = tuple._2();
            StringBuilder recosStr = new StringBuilder();
            boolean appendComa = false;
            for (Tuple2<Integer, Integer> videoCount : videosCounts) {
                if (appendComa) recosStr.append(",");
                recosStr.append("{");
                recosStr.append("\"video_id\":");
                recosStr.append(videoCount._1());
                recosStr.append(",");
                recosStr.append("\"count\":");
                recosStr.append(videoCount._2());
                recosStr.append("}");
                appendComa = true;
            }
            String val = String.format("{\"user_id\": \"%s\", \"v1st\": \"%s\", \"redis_uid\": %s, \"cluster_id\": %d, \"recommendations\": [%s]}", rating.dmUserId, rating.dmV1stStr, rating.user(), clusterId, recosStr);
            return new Tuple2<Integer, String>(rating.user(), val);
        }
    });
    JavaPairRDD<Integer, Iterable<String>> groupedRdd = userIdStringRDD.groupByKey(numPartitions);
    JavaRDD<String> jsonStringRdd = groupedRdd.map(new Function<Tuple2<Integer, Iterable<String>>, String>() {
        @Override
        public String call(Tuple2<Integer, Iterable<String>> v1) throws Exception {
            for (String str : v1._2()) {
                return str;
            }
            LOGGER.error("Could not get a string from the iterable, so returning empty");
            return "";
        }
    });
    //LOGGER.info("Number of items in the RDD: {}", jsonStringRdd.count());
    //return jsonStringRdd.persist(StorageLevel.MEMORY_ONLY_SER_2());
    LOGGER.info("Repartitioning the data into {}", numPartitions);
    jsonStringRdd.cache().saveAsTextFile(outDir);
    return jsonStringRdd;
}
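Since the suspicion is an uneven record distribution, it helps to measure it before guessing. Below is a minimal diagnostic sketch (a hypothetical helper, not part of the original job) that counts the records held by each partition with mapPartitionsWithIndex; calling printPartitionSizes(jsonStringRdd) right before the save would show whether a few partitions hold most of the data.

import java.util.Collections;
import java.util.Iterator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function2;

public class PartitionDiagnostics {
    // Emit one line per partition with the number of records it holds;
    // large differences between lines confirm skew.
    public static <T> void printPartitionSizes(JavaRDD<T> rdd) {
        JavaRDD<String> sizes = rdd.mapPartitionsWithIndex(
            new Function2<Integer, Iterator<T>, Iterator<String>>() {
                @Override
                public Iterator<String> call(Integer idx, Iterator<T> it) throws Exception {
                    long n = 0;
                    while (it.hasNext()) { it.next(); n++; }
                    return Collections.singletonList("partition " + idx + ": " + n + " records").iterator();
                }
            }, true);
        for (String line : sizes.collect()) {
            System.out.println(line);
        }
    }
}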
Cluster size:
1. Master: 16 CPUs, 32 GB RAM
2. Workers (4): 32 CPUs, 102 GB RAM, 4 x 375 GB SSD drives each
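For sizing numPartitions against this hardware, a common rule of thumb is 2 to 4 partitions per executor core. A minimal sketch (the app name and the assumption that all 4 x 32 worker cores are available to this job are illustrative, not from the original setup):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class PartitionSizing {
    public static void main(String[] args) {
        int totalCores = 4 * 32;            // assumption: 4 workers x 32 cores each
        int numPartitions = totalCores * 3; // ~384 partitions for shuffle-heavy stages
        SparkConf conf = new SparkConf()
                .setAppName("als-kmeans-recos")
                .set("spark.default.parallelism", String.valueOf(numPartitions));
        JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
        // ... build the RDDs and pass numPartitions to the joins and groupBys ...
        javaSparkContext.stop();
    }
}

Note, though, that more partitions only help if the keys can actually spread across them.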
I changed the code to use DataFrames, but I still see the same problem:
public static void saveAlsKMeansRecosAsParquet(JavaPairRDD<Integer, Tuple2<DmRating, Integer>> userIdRatingClusterIdRDD,
int numPartitions,
JavaSparkContext javaSparkContext,
String outdir){
JavaRDD<DmRating> dmRatingJavaRDD = userIdRatingClusterIdRDD.map(new Function<Tuple2<Integer, Tuple2<DmRating, Integer>>, DmRating>() {
public DmRating call(Tuple2<Integer, Tuple2<DmRating, Integer>> v1) throws Exception {
//Integer userId = v1._1();
Tuple2<DmRating, Integer> values = v1._2();
DmRating rating = values._1();
Integer clusterId = values._2();
rating.setClusterId(clusterId);
rating.setVideoId(rating.product());
rating.setV1stOrUserId((rating.userId == null || rating.userId.isEmpty()) ? rating.v1stId : rating.userId);
rating.setRedisId(rating.user());
return rating;
//return String.format("{\"clusterId\": %s,\"userId\": %s, \"userId\":\"%s\", \"videoId\": %s}", clusterId, userId, rating.userId, rating.product());
}
});
SQLContext sqlContext = new SQLContext(javaSparkContext);
DataFrame dmRatingDF = sqlContext.createDataFrame(dmRatingJavaRDD, DmRating.class);
dmRatingDF.registerTempTable("dmrating");
DataFrame clusterIdVideoIdDF = sqlContext.sql("SELECT clusterId, videoId FROM dmrating").cache();
DataFrame rolledupClusterIdVideoIdDF = clusterIdVideoIdDF.rollup("clusterId","videoId").count().cache();
DataFrame clusterIdUserIdDF = sqlContext.sql("SELECT clusterId, userId, redisId, v1stId FROM dmrating").distinct().cache();
JavaRDD<Row> rolledUpRDD = rolledupClusterIdVideoIdDF.toJavaRDD();
JavaRDD<Row> filteredRolledUpRDD = rolledUpRDD.filter(new Function<Row, Boolean>() {
@Override
public Boolean call(Row v1) throws Exception {
//make sure the size and values of the properties are correct
return !(v1.size()!=3 || v1.isNullAt(0) || v1.isNullAt(1) || v1.isNullAt(2));
}
});
JavaPairRDD<Integer, Tuple2<Integer, Integer>> clusterIdVideoIdCount = filteredRolledUpRDD.mapToPair(new PairFunction<Row, Integer, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Tuple2<Integer, Integer>> call(Row row) throws Exception {
Tuple2<Integer, Integer> videoIdCount = new Tuple2<Integer, Integer>(row.getInt(1), Long.valueOf(row.getLong(2)).intValue());
return new Tuple2<Integer, Tuple2<Integer, Integer>>(row.getInt(0),videoIdCount);
}
}).cache();
JavaPairRDD<Integer, Iterable<Tuple2<Integer, Integer>>> groupedPair = clusterIdVideoIdCount.groupByKey(numPartitions).cache();
JavaRDD<ClusterIdVideos> groupedFlat = groupedPair.map(new Function<Tuple2<Integer, Iterable<Tuple2<Integer, Integer>>>, ClusterIdVideos>() {
@Override
public ClusterIdVideos call(Tuple2<Integer, Iterable<Tuple2<Integer, Integer>>> v1) throws Exception {
ClusterIdVideos row = new ClusterIdVideos();
Iterable<Tuple2<Integer, Integer>> videosCounts = v1._2();
StringBuilder recosStr = new StringBuilder();
recosStr.append("[");
boolean appendComa = false;
for(Tuple2<Integer, Integer> videoCount : videosCounts){
if(appendComa) recosStr.append(",");
recosStr.append("{");
recosStr.append("\"video_id\":");
recosStr.append(videoCount._1());
recosStr.append(",");
recosStr.append("\"count\":");
recosStr.append(videoCount._2());
recosStr.append("}");
appendComa = true;
}
recosStr.append("]");
row.setClusterId(v1._1());
row.setVideos(recosStr.toString());
return row;
}
}).cache();
DataFrame groupedClusterId = sqlContext.createDataFrame(groupedFlat, ClusterIdVideos.class);
DataFrame recosDf = clusterIdUserIdDF.join(groupedClusterId, "clusterId");
recosDf.write().parquet(outdir);
}
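One detail worth noting in both versions: clusterId comes from KMeans, so it has very low cardinality. groupByKey(numPartitions) on such a key can place data in at most as many tasks as there are distinct clusters, no matter how large numPartitions is, and the handful of tasks that do receive a key get the entire group; that matches a final stage where a few tasks run long while the rest finish instantly. One mitigation is sketched below (reusing clusterIdVideoIdCount and numPartitions from the code above, otherwise illustrative): build the per-cluster JSON fragments with aggregateByKey, so partial strings are combined map-side and far less data is shuffled to the hot keys.

JavaPairRDD<Integer, StringBuilder> clusterIdJson = clusterIdVideoIdCount.aggregateByKey(
        new StringBuilder(),
        numPartitions,
        // seqFunc: fold one (videoId, count) pair into the partition-local fragment
        new Function2<StringBuilder, Tuple2<Integer, Integer>, StringBuilder>() {
            @Override
            public StringBuilder call(StringBuilder acc, Tuple2<Integer, Integer> vc) throws Exception {
                if (acc.length() > 0) acc.append(",");
                return acc.append("{\"video_id\":").append(vc._1())
                          .append(",\"count\":").append(vc._2()).append("}");
            }
        },
        // combFunc: concatenate fragments from different partitions (entry order is not preserved)
        new Function2<StringBuilder, StringBuilder, StringBuilder>() {
            @Override
            public StringBuilder call(StringBuilder a, StringBuilder b) throws Exception {
                if (a.length() > 0 && b.length() > 0) a.append(",");
                return a.append(b);
            }
        });
// Wrap each fragment in [ ] when formatting the final row, as groupedFlat does above.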
//convert json string to DF
DataFrame groupedClusterId = sqlContext.read().json(groupedFlat.rdd());
Broadcast<DataFrame> broadcastDataFrame= javaSparkContext.broadcast(groupedClusterId);
DataFrame recosDf = clusterIdUserIdDF.join(broadcastDataFrame.value(),"clusterId");
recosDf.write().parquet(outdir);
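One caveat about this last variant: javaSparkContext.broadcast(groupedClusterId) only ships the driver-side DataFrame handle to the executors, not its rows, so the join still shuffles both tables on the low-cardinality clusterId key. Since groupedClusterId holds at most one row per cluster, the broadcast join hint from org.apache.spark.sql.functions (available since Spark 1.5) is likely what is wanted here; a sketch:

import static org.apache.spark.sql.functions.broadcast;

// Mark the small per-cluster DataFrame for a map-side (broadcast) join,
// so the large user table is never shuffled on clusterId.
DataFrame recosDf = clusterIdUserIdDF.join(broadcast(groupedClusterId), "clusterId");
recosDf.write().parquet(outdir);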