Apache Spark is not distributing the load evenly to tasks
The last stage is very slow. I suspect the problem is that the records are not distributed evenly across the partitions and tasks. Is there any way to force a more even distribution?

public static JavaRDD<String> getJsonUserIdVideoIdRDD(JavaRDD<DmRating> cachedRating,
                                                      JavaPairRDD<Integer, Integer> userIdClusterId,
                                                      int numPartitions, String outDir) {
    /*
     * Convert the JavaRDD into a JavaPairRDD keyed by user id
     */
    JavaPairRDD<Integer, DmRating> userIdDmRating = cachedRating.mapToPair(new PairFunction<DmRating, Integer, DmRating>() {
        public Tuple2<Integer, DmRating> call(DmRating dmRating) throws Exception {
            return new Tuple2<Integer, DmRating>(dmRating.user(), dmRating);
        }
    });
    /*
     * Join this RDD with the userIdClusterId RDD by key
     */
    JavaPairRDD<Integer, Tuple2<Integer, DmRating>> userId_T_clusterIdDmRating = userIdClusterId.join(userIdDmRating, numPartitions);
    // Extract the clusterId-to-videoId mapping
    JavaPairRDD<Integer, Integer> clusterIdVideoId = userId_T_clusterIdDmRating.mapToPair(new PairFunction<Tuple2<Integer, Tuple2<Integer, DmRating>>, Integer, Integer>() {
        public Tuple2<Integer, Integer> call(Tuple2<Integer, Tuple2<Integer, DmRating>> userIdDmRatingClusterId) throws Exception {
            Integer userId = userIdDmRatingClusterId._1();
            Tuple2<Integer, DmRating> dmRatingClusterId = userIdDmRatingClusterId._2();
            return new Tuple2<Integer, Integer>(dmRatingClusterId._1(), dmRatingClusterId._2().product());
        }
    });
    //////
    /// Compute the popularity of a video within a cluster
    JavaPairRDD<String, Integer> clusterIdVideoIdStrInt = clusterIdVideoId.mapToPair(new PairFunction<Tuple2<Integer, Integer>, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(Tuple2<Integer, Integer> videoIdClusterId) throws Exception {
            return new Tuple2<String, Integer>(String.format("%d:%d", videoIdClusterId._1(), videoIdClusterId._2()), 1);
        }
    });
    JavaPairRDD<String, Integer> clusterIdVideoIdStrCount = clusterIdVideoIdStrInt.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    ///
    JavaPairRDD<Integer, Tuple2<Integer, Integer>> clusterId_T_videoIdCount = clusterIdVideoIdStrCount.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, Tuple2<Integer, Integer>>() {
        @Override
        public Tuple2<Integer, Tuple2<Integer, Integer>> call(Tuple2<String, Integer> clusterIdVideoIdStrCount) throws Exception {
            String[] splits = clusterIdVideoIdStrCount._1().split(":");
            try {
                if (splits.length == 2) {
                    int clusterId = Integer.parseInt(splits[0]);
                    int videoId = Integer.parseInt(splits[1]);
                    return new Tuple2<Integer, Tuple2<Integer, Integer>>(clusterId, new Tuple2<Integer, Integer>(videoId, clusterIdVideoIdStrCount._2()));
                } else {
                    // should never happen
                    LOGGER.error("Could not split {} in two using : as the separator!", clusterIdVideoIdStrCount._1());
                }
            } catch (NumberFormatException ex) {
                LOGGER.error(ex.getMessage());
            }
            return new Tuple2<Integer, Tuple2<Integer, Integer>>(-1, new Tuple2<Integer, Integer>(-1, -1));
        }
    });
    JavaPairRDD<Integer, Iterable<Tuple2<Integer, Integer>>> clusterIdVideoIdGrouped = clusterId_T_videoIdCount.groupByKey();
    JavaPairRDD<Integer, DmRating> clusterIdDmRating = userId_T_clusterIdDmRating.mapToPair(new PairFunction<Tuple2<Integer, Tuple2<Integer, DmRating>>, Integer, DmRating>() {
        @Override
        public Tuple2<Integer, DmRating> call(Tuple2<Integer, Tuple2<Integer, DmRating>> userId_T_clusterIdDmRating) throws Exception {
            return userId_T_clusterIdDmRating._2();
        }
    });
    JavaPairRDD<Integer, Tuple2<DmRating, Iterable<Tuple2<Integer, Integer>>>> clusterId_T_DmRatingVideoIds = clusterIdDmRating.join(clusterIdVideoIdGrouped, numPartitions);
    JavaPairRDD<Integer, String> userIdStringRDD = clusterId_T_DmRatingVideoIds.mapToPair(new PairFunction<Tuple2<Integer, Tuple2<DmRating, Iterable<Tuple2<Integer, Integer>>>>, Integer, String>() {
        @Override
        public Tuple2<Integer, String> call(Tuple2<Integer, Tuple2<DmRating, Iterable<Tuple2<Integer, Integer>>>> v1) throws Exception {
            int clusterId = v1._1();
            Tuple2<DmRating, Iterable<Tuple2<Integer, Integer>>> tuple = v1._2();
            DmRating rating = tuple._1();
            Iterable<Tuple2<Integer, Integer>> videosCounts = tuple._2();
            StringBuilder recosStr = new StringBuilder();
            boolean appendComa = false;
            for (Tuple2<Integer, Integer> videoCount : videosCounts) {
                if (appendComa) recosStr.append(",");
                recosStr.append("{");
                recosStr.append("\"video_id\":");
                recosStr.append(videoCount._1());
                recosStr.append(",");
                recosStr.append("\"count\":");
                recosStr.append(videoCount._2());
                recosStr.append("}");
                appendComa = true;
            }
            String val = String.format("{\"user_id\": \"%s\", \"v1st\": \"%s\", \"redis_uid\": %s, \"cluster_id\": %d, \"recommendations\": [%s]}", rating.dmUserId, rating.dmV1stStr, rating.user(), clusterId, recosStr);
            return new Tuple2<Integer, String>(rating.user(), val);
        }
    });
    JavaPairRDD<Integer, Iterable<String>> groupedRdd = userIdStringRDD.groupByKey(numPartitions);
    JavaRDD<String> jsonStringRdd = groupedRdd.map(new Function<Tuple2<Integer, Iterable<String>>, String>() {
        @Override
        public String call(Tuple2<Integer, Iterable<String>> v1) throws Exception {
            for (String str : v1._2()) {
                return str;
            }
            LOGGER.error("Could not get a string from the iterable, so returning empty");
            return "";
        }
    });
    //LOGGER.info("Number of items in the RDD: {}", jsonStringRdd.count());
    //return jsonStringRdd.persist(StorageLevel.MEMORY_ONLY_SER_2());
    LOGGER.info("Repartitioning the data into {}", numPartitions);
    jsonStringRdd.cache().saveAsTextFile(outDir);
    return jsonStringRdd;
}
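Since the suspicion is an uneven record distribution, it helps to measure it before guessing. Below is a minimal diagnostic sketch (a hypothetical helper, not part of the original job) that counts the records held by each partition with mapPartitionsWithIndex; calling printPartitionSizes(jsonStringRdd) right before the save would show whether a few partitions hold most of the data.

import java.util.Collections;
import java.util.Iterator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function2;

public class PartitionDiagnostics {
    // Emit one line per partition with the number of records it holds;
    // large differences between lines confirm skew.
    public static <T> void printPartitionSizes(JavaRDD<T> rdd) {
        JavaRDD<String> sizes = rdd.mapPartitionsWithIndex(
            new Function2<Integer, Iterator<T>, Iterator<String>>() {
                @Override
                public Iterator<String> call(Integer idx, Iterator<T> it) throws Exception {
                    long n = 0;
                    while (it.hasNext()) { it.next(); n++; }
                    return Collections.singletonList("partition " + idx + ": " + n + " records").iterator();
                }
            }, true);
        for (String line : sizes.collect()) {
            System.out.println(line);
        }
    }
}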
Cluster size:
1. Master: 16 CPUs, 32 GB RAM
2. Workers (4): 32 CPUs, 102 GB RAM, 4 x 375 GB SSD drives each
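For sizing numPartitions against this hardware, a common rule of thumb is 2 to 4 partitions per executor core. A minimal sketch (the app name and the assumption that all 4 x 32 worker cores are available to this job are illustrative, not from the original setup):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class PartitionSizing {
    public static void main(String[] args) {
        int totalCores = 4 * 32;            // assumption: 4 workers x 32 cores each
        int numPartitions = totalCores * 3; // ~384 partitions for shuffle-heavy stages
        SparkConf conf = new SparkConf()
                .setAppName("als-kmeans-recos")
                .set("spark.default.parallelism", String.valueOf(numPartitions));
        JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
        // ... build the RDDs and pass numPartitions to the joins and groupBys ...
        javaSparkContext.stop();
    }
}

Note, though, that more partitions only help if the keys can actually spread across them.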
I changed the code to use DataFrames, but I still see the same problem:
public static void saveAlsKMeansRecosAsParquet(JavaPairRDD<Integer, Tuple2<DmRating, Integer>> userIdRatingClusterIdRDD,
int numPartitions,
JavaSparkContext javaSparkContext,
String outdir){
JavaRDD<DmRating> dmRatingJavaRDD = userIdRatingClusterIdRDD.map(new Function<Tuple2<Integer, Tuple2<DmRating, Integer>>, DmRating>() {
public DmRating call(Tuple2<Integer, Tuple2<DmRating, Integer>> v1) throws Exception {
//Integer userId = v1._1();
Tuple2<DmRating, Integer> values = v1._2();
DmRating rating = values._1();
Integer clusterId = values._2();
rating.setClusterId(clusterId);
rating.setVideoId(rating.product());
rating.setV1stOrUserId((rating.userId == null || rating.userId.isEmpty()) ? rating.v1stId : rating.userId);
rating.setRedisId(rating.user());
return rating;
//return String.format("{\"clusterId\": %s,\"userId\": %s, \"userId\":\"%s\", \"videoId\": %s}", clusterId, userId, rating.userId, rating.product());
}
});
SQLContext sqlContext = new SQLContext(javaSparkContext);
DataFrame dmRatingDF = sqlContext.createDataFrame(dmRatingJavaRDD, DmRating.class);
dmRatingDF.registerTempTable("dmrating");
DataFrame clusterIdVideoIdDF = sqlContext.sql("SELECT clusterId, videoId FROM dmrating").cache();
DataFrame rolledupClusterIdVideoIdDF = clusterIdVideoIdDF.rollup("clusterId","videoId").count().cache();
DataFrame clusterIdUserIdDF = sqlContext.sql("SELECT clusterId, userId, redisId, v1stId FROM dmrating").distinct().cache();
JavaRDD<Row> rolledUpRDD = rolledupClusterIdVideoIdDF.toJavaRDD();
JavaRDD<Row> filteredRolledUpRDD = rolledUpRDD.filter(new Function<Row, Boolean>() {
@Override
public Boolean call(Row v1) throws Exception {
//make sure the size and values of the properties are correct
return !(v1.size()!=3 || v1.isNullAt(0) || v1.isNullAt(1) || v1.isNullAt(2));
}
});
JavaPairRDD<Integer, Tuple2<Integer, Integer>> clusterIdVideoIdCount = filteredRolledUpRDD.mapToPair(new PairFunction<Row, Integer, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Tuple2<Integer, Integer>> call(Row row) throws Exception {
Tuple2<Integer, Integer> videoIdCount = new Tuple2<Integer, Integer>(row.getInt(1), Long.valueOf(row.getLong(2)).intValue());
return new Tuple2<Integer, Tuple2<Integer, Integer>>(row.getInt(0),videoIdCount);
}
}).cache();
JavaPairRDD<Integer, Iterable<Tuple2<Integer, Integer>>> groupedPair = clusterIdVideoIdCount.groupByKey(numPartitions).cache();
JavaRDD<ClusterIdVideos> groupedFlat = groupedPair.map(new Function<Tuple2<Integer, Iterable<Tuple2<Integer, Integer>>>, ClusterIdVideos>() {
@Override
public ClusterIdVideos call(Tuple2<Integer, Iterable<Tuple2<Integer, Integer>>> v1) throws Exception {
ClusterIdVideos row = new ClusterIdVideos();
Iterable<Tuple2<Integer, Integer>> videosCounts = v1._2();
StringBuilder recosStr = new StringBuilder();
recosStr.append("[");
boolean appendComa = false;
for(Tuple2<Integer, Integer> videoCount : videosCounts){
if(appendComa) recosStr.append(",");
recosStr.append("{");
recosStr.append("\"video_id\":");
recosStr.append(videoCount._1());
recosStr.append(",");
recosStr.append("\"count\":");
recosStr.append(videoCount._2());
recosStr.append("}");
appendComa = true;
}
recosStr.append("]");
row.setClusterId(v1._1());
row.setVideos(recosStr.toString());
return row;
}
}).cache();
DataFrame groupedClusterId = sqlContext.createDataFrame(groupedFlat, ClusterIdVideos.class);
DataFrame recosDf = clusterIdUserIdDF.join(groupedClusterId, "clusterId");
recosDf.write().parquet(outdir);
}
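One detail worth noting in both versions: clusterId comes from KMeans, so it has very low cardinality. groupByKey(numPartitions) on such a key can place data in at most as many tasks as there are distinct clusters, no matter how large numPartitions is, and the handful of tasks that do receive a key get the entire group; that matches a final stage where a few tasks run long while the rest finish instantly. One mitigation is sketched below (reusing clusterIdVideoIdCount and numPartitions from the code above, otherwise illustrative): build the per-cluster JSON fragments with aggregateByKey, so partial strings are combined map-side and far less data is shuffled to the hot keys.

JavaPairRDD<Integer, StringBuilder> clusterIdJson = clusterIdVideoIdCount.aggregateByKey(
        new StringBuilder(),
        numPartitions,
        // seqFunc: fold one (videoId, count) pair into the partition-local fragment
        new Function2<StringBuilder, Tuple2<Integer, Integer>, StringBuilder>() {
            @Override
            public StringBuilder call(StringBuilder acc, Tuple2<Integer, Integer> vc) throws Exception {
                if (acc.length() > 0) acc.append(",");
                return acc.append("{\"video_id\":").append(vc._1())
                          .append(",\"count\":").append(vc._2()).append("}");
            }
        },
        // combFunc: concatenate fragments from different partitions (entry order is not preserved)
        new Function2<StringBuilder, StringBuilder, StringBuilder>() {
            @Override
            public StringBuilder call(StringBuilder a, StringBuilder b) throws Exception {
                if (a.length() > 0 && b.length() > 0) a.append(",");
                return a.append(b);
            }
        });
// Wrap each fragment in [ ] when formatting the final row, as groupedFlat does above.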
//convert json string to DF
DataFrame groupedClusterId = sqlContext.read().json(groupedFlat.rdd());
Broadcast<DataFrame> broadcastDataFrame= javaSparkContext.broadcast(groupedClusterId);
DataFrame recosDf = clusterIdUserIdDF.join(broadcastDataFrame.value(),"clusterId");
recosDf.write().parquet(outdir);
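One caveat about this last variant: javaSparkContext.broadcast(groupedClusterId) only ships the driver-side DataFrame handle to the executors, not its rows, so the join still shuffles both tables on the low-cardinality clusterId key. Since groupedClusterId holds at most one row per cluster, the broadcast join hint from org.apache.spark.sql.functions (available since Spark 1.5) is likely what is wanted here; a sketch:

import static org.apache.spark.sql.functions.broadcast;

// Mark the small per-cluster DataFrame for a map-side (broadcast) join,
// so the large user table is never shuffled on clusterId.
DataFrame recosDf = clusterIdUserIdDF.join(broadcast(groupedClusterId), "clusterId");
recosDf.write().parquet(outdir);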