
Join a small DataFrame with a large DataFrame in Spark Scala and remove duplicates by selecting on other columns

Tags: scala, apache-spark, apache-spark-sql

I have two DataFrames. The large one has 19 million rows, the small one has 180K rows.

Let's assume the large DF has the following columns:

+-----------------------------------+------------+--------+
|               cuid                |    Col2    | Col3   |
+-----------------------------------+------------+--------+
| 12654467                          | Value 1    |    123 |
| 164687948                         | Value 2    |    163 |
| 456132456                         | Value 3    |     45 |
+-----------------------------------+------------+--------+
And in the small DF:

+-----------------------------------+
|               cuid                |   
+-----------------------------------+
| 12654467                          |
+-----------------------------------+
And the output must be:

+-----------------------------------+------------+--------+
|               cuid                |    Col2    | Col3   |
+-----------------------------------+------------+--------+
| 12654467                          | Value 1    |    123 |
+-----------------------------------+------------+--------+
I have already tried the following:

val joinedDF = smallDF.join(largeDF, Seq("cuid"), "inner")
But joinedDF contains far more than 180K rows (about 6 million). Does anyone know how to solve this?

UPD: both the large DF and the small DF contain duplicates on cuid, but those duplicates have different values in the other columns, so I cannot simply group by cuid.


So my question is: how can I select one row per cuid based on the other columns and drop the duplicates?
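One common cause of this blow-up is duplicates on both sides of the join: if a cuid appears m times in the small DF and n times in the large DF, the inner join emits m × n rows for that key. Below is a minimal sketch of a fix, assuming the DataFrames are named smallDF and largeDF as above; it deduplicates the small DF (which only carries cuid) before the join and drops any remaining duplicates afterwards:

// Deduplicate the join key on the small side first: the small DF only
// carries cuid, so its duplicates add nothing but extra output rows.
val smallKeys = smallDF.select("cuid").distinct()

val joinedDF = smallKeys.join(largeDF, Seq("cuid"), "inner")

// If largeDF also repeats cuids, dropDuplicates keeps an arbitrary row per
// cuid; see the window-function sketch at the end for a deterministic
// choice based on the other columns.
val resultDF = joinedDF.dropDuplicates("cuid")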

This works well for me.

Let's define the input sources:

  • big.csv
  • small.csv
The code:

import org.apache.spark.sql.*;

import java.io.Serializable;

public class Foo {

    private static final SparkSession spark = SparkSession
            .builder()
            .master("local[*]")
            .getOrCreate();

    public static void main(String[] args) {
        Dataset<BigRow> bigDataset = spark
                .read()
                .option("header", true)
                .csv(".\\resources\\big.csv")
                .as(Encoders.bean(BigRow.class));
        bigDataset.show();
        bigDataset.printSchema();
        Dataset<SmallRow> smallDataset  = spark
                .read()
                .option("header", true)
                .csv(".\\resources\\small.csv")
                .as(Encoders.bean(SmallRow.class));
        smallDataset.show();
        smallDataset.printSchema();


        Dataset<Row> joined = bigDataset
                .join(smallDataset, "cuid");
        joined.show();
    }

    private static class SmallRow implements Serializable {
        private String cuid;

        public SmallRow() {}

        public String getCuid() {
            return cuid;
        }

        public void setCuid(String cuid) {
            this.cuid = cuid;
        }

        public SmallRow withCuid(final String cuid) {
            this.cuid = cuid;
            return this;
        }
    }

    private static class BigRow implements Serializable {
        private String cuid;
        private String col2;
        private String col3;

        public BigRow() {}

        public String getCuid() {
            return cuid;
        }

        public void setCuid(String cuid) {
            this.cuid = cuid;
        }

        public String getCol2() {
            return col2;
        }

        public void setCol2(String col2) {
            this.col2 = col2;
        }

        public String getCol3() {
            return col3;
        }

        public void setCol3(String col3) {
            this.col3 = col3;
        }

        public BigRow withCuid(final String cuid) {
            this.cuid = cuid;
            return this;
        }

        public BigRow withCol2(final String col2) {
            this.col2 = col2;
            return this;
        }

        public BigRow withCol3(final String col3) {
            this.col3 = col3;
            return this;
        }
    }

}
Consider using distinct or, as here, dropDuplicates to remove the duplicates:

Dataset<Row> noDuplicates = joined
        .dropDuplicates("cuid");
noDuplicates.show();
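Since the question itself is in Scala, a rough Scala equivalent of the same approach might look like this (a sketch only; adjust the CSV paths to your environment):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .getOrCreate()

// Read both CSV files with a header row
val bigDF   = spark.read.option("header", "true").csv("resources/big.csv")
val smallDF = spark.read.option("header", "true").csv("resources/small.csv")

// Inner join on cuid, then keep one row per cuid
val joined       = bigDF.join(smallDF, "cuid")
val noDuplicates = joined.dropDuplicates("cuid")
noDuplicates.show()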

Hope this helps. Thomas.

Comments:

Maybe your cuid column contains a large number of duplicates. Try grouping largeDF by cuid and counting; that may well give you the 6 million rows.

Yes, in the large DF I do have duplicates on the id, but those duplicates have different values in the other columns, and I can't group by cuid.

@Jerdy when you do an inner join your result will have a different count of cuid values; after grouping by cuid you need to work out which row to pick within each group. Maybe it should be the last row, or some other logic; try to figure out exactly what you want to get.
For reference, on the answer's sample inputs the joined output contains a duplicated cuid:

+----+-----+----+
|cuid| col2|col3|
+----+-----+----+
|   1| what| bye|
|   4|  why|this|
|   7|music|word|
|   7|music|word|
+----+-----+----+

The dropDuplicates("cuid") call shown above reduces this to a single row per cuid.
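As the comments point out, when the large DF has several rows per cuid you also have to decide which of those rows to keep. Here is a minimal window-function sketch, assuming the question's largeDF and smallDF are in scope and that you want, say, the row with the largest Col3 per cuid (that ordering is only an illustration; substitute whatever selection logic your data requires):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

// Keep exactly one row per cuid in the large DF, choosing the row with the
// largest Col3 (example ordering only).
val w = Window.partitionBy("cuid").orderBy(col("Col3").desc)

val onePerCuid = largeDF
  .withColumn("rn", row_number().over(w))
  .filter(col("rn") === 1)
  .drop("rn")

// Joining with the deduplicated small DF now yields at most one row per cuid.
val result = onePerCuid.join(smallDF.select("cuid").distinct(), Seq("cuid"), "inner")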