What are the various join types in Spark (Scala)?
I looked at the documentation, and it says the following join types are supported:

Type of join to perform. Default inner. Must be one of: inner, cross, outer, full, full_outer, left, left_outer, right, right_outer, left_semi, left_anti.

I also looked at the existing question on SQL joins, and the top two answers there do not mention some of the join types above, such as left_semi and left_anti. What do they mean in Spark? Here is a simple illustrative experiment:
import org.apache.spark.sql._

object SparkSandbox extends App {
  implicit val spark = SparkSession.builder().master("local[*]").getOrCreate()
  import spark.implicits._
  spark.sparkContext.setLogLevel("ERROR")

  val left = Seq((1, "A1"), (2, "A2"), (3, "A3"), (4, "A4")).toDF("id", "value")
  val right = Seq((3, "A3"), (4, "A4"), (4, "A4_1"), (5, "A5"), (6, "A6")).toDF("id", "value")

  println("LEFT")
  left.orderBy("id").show()

  println("RIGHT")
  right.orderBy("id").show()

  val joinTypes = Seq("inner", "outer", "full", "full_outer", "left", "left_outer", "right", "right_outer", "left_semi", "left_anti")

  joinTypes foreach { joinType =>
    println(s"${joinType.toUpperCase()} JOIN")
    left.join(right = right, usingColumns = Seq("id"), joinType = joinType).orderBy("id").show()
  }
}
Output
LEFT
+---+-----+
| id|value|
+---+-----+
| 1| A1|
| 2| A2|
| 3| A3|
| 4| A4|
+---+-----+
RIGHT
+---+-----+
| id|value|
+---+-----+
| 3| A3|
| 4| A4|
| 4| A4_1|
| 5| A5|
| 6| A6|
+---+-----+
INNER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 3| A3| A3|
| 4| A4| A4_1|
| 4| A4| A4|
+---+-----+-----+
OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4|
| 4| A4| A4_1|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
FULL JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4|
| 4| A4| A4_1|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
FULL_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4|
| 4| A4| A4_1|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
LEFT JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4_1|
| 4| A4| A4|
+---+-----+-----+
LEFT_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4_1|
| 4| A4| A4|
+---+-----+-----+
RIGHT JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 3| A3| A3|
| 4| A4| A4_1|
| 4| A4| A4|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
RIGHT_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 3| A3| A3|
| 4| A4| A4|
| 4| A4| A4_1|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
LEFT_SEMI JOIN
+---+-----+
| id|value|
+---+-----+
| 3| A3|
| 4| A4|
+---+-----+
LEFT_ANTI JOIN
+---+-----+
| id|value|
+---+-----+
| 1| A1|
| 2| A2|
+---+-----+
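Note in the output above that id 4 matches two rows on the right but still appears only once in the LEFT_SEMI result: a semi join never duplicates left rows and never exposes right columns. Below is a rough DataFrame-only approximation, just a sketch to illustrate the semantics (not how Spark implements it, and it assumes left itself contains no duplicate rows), reusing the left and right DataFrames from the experiment above:

// inner-join on the key, keep only the left table's columns,
// then collapse the duplicates introduced by multiple matches on the right
val semiLike = left
  .join(right, left("id") === right("id"), "inner")
  .select(left("id"), left("value"))
  .dropDuplicates()
semiLike.orderBy("id").show() // (3, A3) and (4, A4), same as the LEFT_SEMI output above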
I like pathikrit's example. Here is a possible translation to Java, using Spark v2 and DataFrames, and including the cross join:
package net.jgp.books.sparkInAction.ch12.lab940AllJoins;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * All joins in a single app, inspired by
 * https://stackoverflow.com/questions/45990633/what-are-the-various-join-types-in-spark.
 *
 * Used in Spark in Action 2e, http://jgp.net/sia
 *
 * @author jgp
 */
public class AllJoinsApp {

  /**
   * main() is your entry point to the application.
   *
   * @param args
   */
  public static void main(String[] args) {
    AllJoinsApp app = new AllJoinsApp();
    app.start();
  }

  /**
   * The processing code.
   */
  private void start() {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("Processing of invoices")
        .master("local")
        .getOrCreate();

    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField(
            "id",
            DataTypes.IntegerType,
            false),
        DataTypes.createStructField(
            "value",
            DataTypes.StringType,
            false) });

    List<Row> rows = new ArrayList<Row>();
    rows.add(RowFactory.create(1, "A1"));
    rows.add(RowFactory.create(2, "A2"));
    rows.add(RowFactory.create(3, "A3"));
    rows.add(RowFactory.create(4, "A4"));
    Dataset<Row> dfLeft = spark.createDataFrame(rows, schema);
    dfLeft.show();

    rows = new ArrayList<Row>();
    rows.add(RowFactory.create(3, "A3"));
    rows.add(RowFactory.create(4, "A4"));
    rows.add(RowFactory.create(4, "A4_1"));
    rows.add(RowFactory.create(5, "A5"));
    rows.add(RowFactory.create(6, "A6"));
    Dataset<Row> dfRight = spark.createDataFrame(rows, schema);
    dfRight.show();

    String[] joinTypes = new String[] {
        "inner", // v2.0.0. default
        "cross", // v2.2.0
        "outer", // v2.0.0
        "full", // v2.1.1
        "full_outer", // v2.1.1
        "left", // v2.1.1
        "left_outer", // v2.0.0
        "right", // v2.1.1
        "right_outer", // v2.0.0
        "left_semi", // v2.0.0, was leftsemi before v2.1.1
        "left_anti" // v2.1.1
    };

    for (String joinType : joinTypes) {
      System.out.println(joinType.toUpperCase() + " JOIN");
      Dataset<Row> df = dfLeft.join(
          dfRight,
          dfLeft.col("id").equalTo(dfRight.col("id")),
          joinType);
      df.orderBy(dfLeft.col("id")).show();
    }
  }
}
Left Semi returns only the rows whose join key is found in both tables, and it includes only the columns from the left table. Left Anti returns only the rows whose join key is found in the left table but not in the right table. The full list of supported join type strings appears at the end of this post.
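In SQL terms, left_semi behaves like an EXISTS filter and left_anti like a NOT EXISTS filter. Below is a minimal sketch, reusing the left and right DataFrames from the Scala experiment above; the view names l and r are arbitrary, and the DataFrame calls at the end are the equivalent left_semi / left_anti joins.

left.createOrReplaceTempView("l")
right.createOrReplaceTempView("r")
// left_semi: keep left rows whose id also appears in r, left columns only
spark.sql("SELECT * FROM l WHERE EXISTS (SELECT 1 FROM r WHERE r.id = l.id)").show()
// left_anti: keep left rows whose id does not appear in r
spark.sql("SELECT * FROM l WHERE NOT EXISTS (SELECT 1 FROM r WHERE r.id = l.id)").show()
// Equivalent DataFrame API calls
left.join(right, Seq("id"), "left_semi").show()
left.join(right, Seq("id"), "left_anti").show()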
Please update the answer to include a cross join.
Can you see why there is no difference between the full and full_outer joins?
@sapy, because "outer", "full", and "fullouter" are just different aliases for exactly the same join type. See the supported join type strings below (a cross join sketch follows the list):
inner
outer
full
fullouter
full_outer
leftouter
left
left_outer
rightouter
right
right_outer
leftsemi
left_semi
leftanti
left_anti
cross
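The original Scala experiment omits the cross join requested in the comments. A minimal sketch of what adding it would look like, reusing the left and right DataFrames and the imports from above (crossJoin exists since Spark 2.1, and the "cross" join type string since Spark 2.2):

// Cartesian product: every left row paired with every right row (4 x 5 = 20 rows here)
left.crossJoin(right).show()
// The same result via the join-type string, using an always-true join condition
left.join(right, functions.lit(true), "cross").show()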