
Comparing column values across two DataFrames and returning the names of columns whose values differ (Spark, Java 8)


I have two DataFrames with 230 columns each. I want to compare them on a key column and, wherever a column's values differ, get the column name together with the values from both DataFrames, using Spark with Java 8.

DataFrame 1:

id  Col_1 Col_2 Col_3 Col_4 Col_5
1    A     B     C     D     E
2    X     Y     Z     P     Q

DataFrame 2:

id  Col_1 Col_2 Col_3 Col_4 Col_5
1    A     B6    C     D     E
2    X     Y     Z8    P     Q3
Output:

id  Col_1 Col_2   Col_3 Col_4 Col_5
1   null  [B,B6]  null  null   null
2   null  null   [Z,Z8] null   [Q,Q3]
What I have tried with Spark and Java 8:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

// Rows of DF1 that do not appear identically in DF2 (whole rows only)
DF1.except(DF2);

StructType one = DF1.schema();

// Key each row by its first column (the id)
JavaPairRDD<String, Row> pair1 = DF1.toJavaRDD()
        .mapToPair(new PairFunction<Row, String, Row>() {
            public Tuple2<String, Row> call(Row row) {
                return new Tuple2<String, Row>(row.getString(0), row);
            }
        });

JavaPairRDD<String, Row> pair2 = DF2.toJavaRDD()
        .mapToPair(new PairFunction<Row, String, Row>() {
            public Tuple2<String, Row> call(Row row) {
                return new Tuple2<String, Row>(row.getString(0), row);
            }
        });

// Keep only the pairs from pair1 whose key is absent from pair2
JavaPairRDD<String, Row> subs = pair1.subtractByKey(pair2);
JavaRDD<Row> rdd = subs.values();
Dataset<Row> diff = spark.createDataFrame(rdd, one);
diff.show();
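Note that subtractByKey compares keys only, never the row values, and except returns whole rows, so neither gives me the names of the differing columns. With the sample data both ids exist in both DataFrames, so subs even comes back empty:

System.out.println(subs.count()); // prints 0: ids 1 and 2 are present in both pair RDDs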

Please help.

Please find the solution below. I tried to solve this while keeping the DataFrames as DataFrames; you can find inline comments explaining the code. The actual solution starts after the line marked //Below is the solution.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.apache.spark.sql.functions.*;


public class CompareDfs {

    public static void main(String[] args) {
        SparkSession spark = Constant.getSparkSess(); // helper from the author's project that builds the SparkSession


        List<String> list1 = new ArrayList<>();
        list1.add("1,A,B,C,D,E");
        list1.add("2,X,Y,Z,P,Q");

        List<String> list2 = new ArrayList<>();
        list2.add("1,A,B6,C,D,E");
        list2.add("2,X,Y,Z8,P,Q3");

        Dataset<Row> df = spark.createDataset(list1, Encoders.STRING()).toDF().selectExpr("split(value, ',')[0] as id",
                "split(value, ',')[1] as Col_1",
                "split(value, ',')[2] as Col_2",
                "split(value, ',')[3] as Col_3",
                "split(value, ',')[4] as Col_4",
                "split(value, ',')[5] as Col_5");
//        df.printSchema();
//        df.show();
        // Build the second DataFrame the same way
        Dataset<Row> df1 = spark.createDataset(list2, Encoders.STRING()).toDF().selectExpr("split(value, ',')[0] as id",
                "split(value, ',')[1] as Col_1",
                "split(value, ',')[2] as Col_2",
                "split(value, ',')[3] as Col_3",
                "split(value, ',')[4] as Col_4",
                "split(value, ',')[5] as Col_5");
//        df1.printSchema();
//        df1.show();

        //Below is the solution
        List<String> columns = Arrays.asList("Col_1", "Col_2", "Col_3", "Col_4", "Col_5"); // List of columns to merge (see the note after this class for a schema-derived alternative)

        // inner join the 2 dataframes
        Dataset<Row> joinedDf = df.join(df1).where(df.col("id").equalTo(df1.col("id")));


        // Iterate through the columns
        for (String column : columns) {
            joinedDf = joinedDf
                    .withColumn(column + "_temp",
                            when(df.col(column).equalTo(df1.col(column)), null) // null when the values match, otherwise an array of both values
                                    .otherwise(split(concat_ws(",", df.col(column), df1.col(column)), ",")))
                    .drop(df.col(column)) // Drop column from 1st dataframe
                    .drop(df1.col(column)) // Drop column from 2nd dataframe
                    .withColumnRenamed(column + "_temp", column); // Rename column to the result column name
        }

//                .withColumn("Col_2_t",when(df.col("Col_2").equalTo(df1.col("Col_2")), null ).otherwise(split(concat_ws(",",df.col("Col_2"),df1.col("Col_2")),",")))
        joinedDf.drop(df1.col("id")) // drop the duplicate key column left over from the join
                .show();


    }
}
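Since the question mentions 230 columns, hardcoding the column list does not scale. As an aside, a minimal sketch of deriving the list from the schema instead, assuming the key column is named "id":

import java.util.stream.Collectors;

// Build the list of data columns from the DataFrame's schema,
// keeping everything except the key column (assumed here to be "id")
List<String> columns = Arrays.stream(df.columns())
        .filter(c -> !c.equalsIgnoreCase("id"))
        .collect(Collectors.toList());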


I tried to solve this with a DataFrame approach -

import static org.apache.spark.sql.functions.*;

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// For every data column, collect the values from both sides into an array
List<Column> cols = Arrays.stream(df1.columns())
        .map(c -> {
            if (c.equalsIgnoreCase("id"))
                return col("a.id");
            else
                return array(toScalaSeq(Arrays.asList(col("a." + c), col("b." + c))).toBuffer()).as(c);
        }).collect(Collectors.toList());

Dataset<Row> processedDf = df1.as("a").join(df2.as("b"), df1.col("id").equalTo(df2.col("id")))
        .select(toScalaSeq(cols).toBuffer());

// A single distinct value in the array means both sides matched -> NULL
List<Column> cols1 = Arrays.stream(df1.columns())
        .map(f -> {
            if (f.equalsIgnoreCase("id"))
                return expr(f);
            else
                return expr("if(size(array_distinct(" + f + ")) == 1, NULL, " + f + ") as " + f);
        }).collect(Collectors.toList());

processedDf.select(toScalaSeq(cols1).toBuffer())
        .show(false);
/**
 * +---+-----+-------+-------+-----+-------+
 * |id |Col_1|Col_2  |Col_3  |Col_4|Col_5  |
 * +---+-----+-------+-------+-----+-------+
 * |1  |null |[B, B6]|null   |null |null   |
 * |2  |null |null   |[Z, Z8]|null |[Q, Q3]|
 * +---+-----+-------+-------+-----+-------+
 */
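The toScalaSeq used above is a small utility that converts a Java list to a Scala sequence. One possible implementation (a sketch only; the exact helper is in the full code linked below), assuming Scala 2.12-style JavaConverters:

import scala.collection.JavaConverters;
import scala.collection.mutable.Buffer;

// Hypothetical helper: convert a java.util.List into a Scala Buffer,
// which is a scala.collection.Seq and is accepted by Spark's varargs APIs
public static <T> Buffer<T> toScalaSeq(java.util.List<T> list) {
    return JavaConverters.asScalaBufferConverter(list).asScala();
}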

Please find the complete code here -
