
What are the various join types in Spark?


I looked at the documentation and it says the following join types are supported:

Type of join to perform. Default inner. Must be one of: inner, cross, outer, full, full_outer, left, left_outer, right, right_outer, left_semi, left_anti.


I also looked at the StackOverflow answers on SQL joins, and the top couple of answers do not mention some of the joins from above, e.g. left_semi and left_anti. What do they mean in Spark?

Here is a simple illustrative experiment:

import org.apache.spark.sql._

object SparkSandbox extends App {
  implicit val spark = SparkSession.builder().master("local[*]").getOrCreate()
  import spark.implicits._
  spark.sparkContext.setLogLevel("ERROR")

  // Note the one-to-many match on id = 4 and the unmatched ids on both sides
  val left = Seq((1, "A1"), (2, "A2"), (3, "A3"), (4, "A4")).toDF("id", "value")
  val right = Seq((3, "A3"), (4, "A4"), (4, "A4_1"), (5, "A5"), (6, "A6")).toDF("id", "value")

  println("LEFT")
  left.orderBy("id").show()

  println("RIGHT")
  right.orderBy("id").show()

  val joinTypes = Seq("inner", "outer", "full", "full_outer", "left", "left_outer", "right", "right_outer", "left_semi", "left_anti")

  // Run the same equi-join on "id" under every join type
  joinTypes foreach { joinType =>
    println(s"${joinType.toUpperCase()} JOIN")
    left.join(right = right, usingColumns = Seq("id"), joinType = joinType).orderBy("id").show()
  }
}
Output:

LEFT
+---+-----+
| id|value|
+---+-----+
|  1|   A1|
|  2|   A2|
|  3|   A3|
|  4|   A4|
+---+-----+

RIGHT
+---+-----+
| id|value|
+---+-----+
|  3|   A3|
|  4|   A4|
|  4| A4_1|
|  5|   A5|
|  6|   A6|
+---+-----+

INNER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
+---+-----+-----+

OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4|   A4|
|  4|   A4| A4_1|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

FULL JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4|   A4|
|  4|   A4| A4_1|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

FULL_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4|   A4|
|  4|   A4| A4_1|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

LEFT JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
+---+-----+-----+

LEFT_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  1|   A1| null|
|  2|   A2| null|
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
+---+-----+-----+

RIGHT JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  3|   A3|   A3|
|  4|   A4| A4_1|
|  4|   A4|   A4|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

RIGHT_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  3|   A3|   A3|
|  4|   A4|   A4|
|  4|   A4| A4_1|
|  5| null|   A5|
|  6| null|   A6|
+---+-----+-----+

LEFT_SEMI JOIN
+---+-----+
| id|value|
+---+-----+
|  3|   A3|
|  4|   A4|
+---+-----+

LEFT_ANTI JOIN
+---+-----+
| id|value|
+---+-----+
|  1|   A1|
|  2|   A2|
+---+-----+

I like pathikrit's example. Here is a possible transposition of it in Java, using Spark v2 and DataFrames, including the cross join.

package net.jgp.books.sparkInAction.ch12.lab940AllJoins;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * All joins in a single app, inspired by
 * https://stackoverflow.com/questions/45990633/what-are-the-various-join-types-in-spark.
 * 
 * Used in Spark in Action 2e, http://jgp.net/sia
 * 
 * @author jgp
 */
public class AllJoinsApp {

  /**
   * main() is your entry point to the application.
   * 
   * @param args
   */
  public static void main(String[] args) {
    AllJoinsApp app = new AllJoinsApp();
    app.start();
  }

  /**
   * The processing code.
   */
  private void start() {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("Processing of invoices")
        .master("local")
        .getOrCreate();

    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField(
            "id",
            DataTypes.IntegerType,
            false),
        DataTypes.createStructField(
            "value",
            DataTypes.StringType,
            false) });

    List<Row> rows = new ArrayList<Row>();
    rows.add(RowFactory.create(1, "A1"));
    rows.add(RowFactory.create(2, "A2"));
    rows.add(RowFactory.create(3, "A3"));
    rows.add(RowFactory.create(4, "A4"));
    Dataset<Row> dfLeft = spark.createDataFrame(rows, schema);
    dfLeft.show();

    rows = new ArrayList<Row>();
    rows.add(RowFactory.create(3, "A3"));
    rows.add(RowFactory.create(4, "A4"));
    rows.add(RowFactory.create(4, "A4_1"));
    rows.add(RowFactory.create(5, "A5"));
    rows.add(RowFactory.create(6, "A6"));
    Dataset<Row> dfRight = spark.createDataFrame(rows, schema);
    dfRight.show();

    String[] joinTypes = new String[] { 
        "inner", // v2.0.0. default
        "cross", // v2.2.0
        "outer", // v2.0.0
        "full", // v2.1.1
        "full_outer", // v2.1.1
        "left", // v2.1.1
        "left_outer", // v2.0.0
        "right", // v2.1.1
        "right_outer", // v2.0.0
        "left_semi", // v2.0.0, was leftsemi before v2.1.1
        "left_anti" // v2.1.1
        };

    for (String joinType : joinTypes) {
      System.out.println(joinType.toUpperCase() + " JOIN");
      Dataset<Row> df = dfLeft.join(
          dfRight, 
          dfLeft.col("id").equalTo(dfRight.col("id")), 
          joinType);
      df.orderBy(dfLeft.col("id")).show();
    }
  }
}

I will put this example in the book's repository.
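The list above includes "cross"; for an unconditioned Cartesian product, Dataset also has a dedicated crossJoin method (added in Spark 2.1). A minimal Scala sketch, reusing the left and right DataFrames from the first answer:

// Cartesian product: every row of `left` paired with every row of `right`;
// no join key is involved, so 4 x 5 input rows yield 20 output rows.
left.crossJoin(right).show()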

Left semi returns the rows whose join key is found in both tables, but it only includes the columns of the left table.

Left anti returns the rows whose join key is found only in the left table.
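In SQL terms these behave like EXISTS and NOT EXISTS subqueries, which is also why the left_semi output above shows id 4 only once even though it matches two right rows. A minimal Scala sketch, assuming the left and right DataFrames from the first answer are registered as temp views:

left.createOrReplaceTempView("l")
right.createOrReplaceTempView("r")

// left_semi ~ EXISTS: rows of l whose id appears in r (l's columns only, no duplication)
spark.sql("SELECT * FROM l WHERE EXISTS (SELECT 1 FROM r WHERE r.id = l.id)").show()

// left_anti ~ NOT EXISTS: rows of l whose id never appears in r
spark.sql("SELECT * FROM l WHERE NOT EXISTS (SELECT 1 FROM r WHERE r.id = l.id)").show()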

Please update the answer with the cross join. Any idea why there is no difference between the full and full outer joins? @sapy, because "outer", "full", and "fullouter" are just different aliases for exactly the same join type. See the code:
inner
outer
full
fullouter
full_outer
leftouter
left
left_outer
rightouter
right
right_outer
leftsemi
left_semi
leftanti
left_anti
cross
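As a quick check of the alias claim, a minimal Scala sketch (again reusing the left and right DataFrames from the first answer) comparing two of the supposedly equivalent names:

// "outer" and "fullouter" should resolve to the same join type,
// so the two results are expected to contain exactly the same rows.
val a = left.join(right, Seq("id"), "outer").collect().toSet
val b = left.join(right, Seq("id"), "fullouter").collect().toSet
assert(a == b)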