Apache Spark SQL - 由于连接键中的空值,单个任务长时间运行

Apache Spark SQL - 由于连接键中的空值,单个任务长时间运行,apache-spark,pyspark-sql,Apache Spark,Pyspark Sql,我在两个表之间执行左连接,每个表有13亿条记录,但是表1中约6亿条记录的连接键为空,因此所有空记录都分配给一个任务,因此发生数据倾斜,使这一任务运行数小时 from pyspark.sql import SparkSession spark = SparkSession.builder.appName("report").enableHiveSupport() tbl1 = spark.sql("""select a.col1,b.col2,a.Col3 from table1 a left

我在两个表之间执行左连接,每个表有13亿条记录,但是表1中约6亿条记录的连接键为空,因此所有空记录都分配给一个任务,因此发生数据倾斜,使这一任务运行数小时

from pyspark.sql import SparkSession

# Bug fix: the original chain stopped at enableHiveSupport(), which returns a
# SparkSession.Builder, not a SparkSession, so the later spark.sql(...) call
# would fail with AttributeError. getOrCreate() actually builds (or reuses)
# the session.
spark = (
    SparkSession.builder
    .appName("report")
    .enableHiveSupport()
    .getOrCreate()
)

# Left join keeps every table1 row even when its key is null; all null keys
# hash to one partition, which is the skew this question is about.
tbl1 = spark.sql("""select a.col1,b.col2,a.Col3
from table1 a
left join table2 b on a.col1 = b.col2""")

# Overwrite (re-create) the target Hive table with the join result.
tbl1.write.mode("overwrite").saveAsTable("db.tbl3")
没有其他联接条件&这是唯一要使用的联接键。有没有什么方法可以让spark在不同的任务中分发这些空记录,而不是一种或任何其他方法?

有一个由@Mikhail Dubkov编写的非常好的解决方案,它解决了这个问题。

我只是稍微修改了一下,以解决以下异常:

org.apache.spark.sql.AnalysisException: Reference 'id' is ambiguous, could be: id, id.; 
这里有一个例子

创建表:

// Sample data for reproducing the null-key skew: several Country rows carry a
// null country_id, and one Location row references a null country as well.
case class Country(country_id: String, country_name: String)
case class Location(location_id: Int, street_address: String, city: String, country_id: String)

val countries: DataFrame =
  Seq(
    Country("CN", "China"),
    Country("UK", "United Kingdom"),
    Country("US", "United States of America"),
    Country(null, "Unknown 1"),
    Country(null, "Unknown 2"),
    Country(null, "Unknown 3"),
    Country(null, "Unknown 4"),
    Country(null, "Unknown 5"),
    Country(null, "Unknown 6")
  ).toDF()

val locations: DataFrame =
  Seq(
    Location(1400, "2014 Jabberwocky Rd", "Southlake", "US"),
    Location(1500, "2011 Interiors Blvd", "San Francisco", "US"),
    Location(1700, "2004 Charade Rd", "Seattle", "US"),
    Location(2400, "8204 Arthur St", "London", "UK"),
    Location(2500, "Magdalen Centre, The Oxford Science Park", "Oxford", "UK"),
    Location(0, "Null Street", "Null City", null)
  ).toDF()
加入:

实现它的另一种方法是应用自定义提示并添加自定义规则,但我不确定这是否值得。请告诉我这个方案对你是否有用。

修改的
nullSkewLeftJoin

/** Convenience overload: resolves `usingColumn` on both sides and delegates
  * to the Column-based variant that scatters null join keys across buckets.
  */
def nullSkewLeftJoin(right: DataFrame,
                     usingColumn: String,
                     skewedColumnPostFix: String = "skewed_column",
                     nullNumBuckets: Int = 10000): DataFrame =
  nullSkewLeftJoin(
    right,
    underlying.col(usingColumn),
    right.col(usingColumn),
    skewedColumnPostFix,
    nullNumBuckets)

    /** Skew-mitigating left join: null left-side keys are replaced by random
      * bucket values (via `negativeRandomWithin`, defined elsewhere --
      * presumably values that can never match a real key; TODO confirm) so
      * the null rows spread over many tasks instead of hashing to one.
      * Right-side rows with a null key are filtered out before joining.
      */
    def nullSkewLeftJoin(right: DataFrame,
                         joinLeftCol: Column,
                         joinRightCol: Column,
                         skewedColumnPostFix: String ,
                         nullNumBuckets: Int): DataFrame = {

      // Name of the synthetic join column, e.g. "col1_skewed_column".
      // NOTE(review): this avoids the "Reference 'id' is ambiguous" error by
      // joining on a uniquely named column rather than the original key.
      val skewedTempColumn = s"${joinLeftCol.toString()}_$skewedColumnPostFix"

      // If a previous call already added the synthetic column, reuse it
      // instead of adding it twice (case-insensitive, matching SQL rules).
      if (underlying.columns.exists(_ equalsIgnoreCase skewedTempColumn)) {
        underlying.join(right.where(joinRightCol.isNotNull), col(skewedTempColumn) === joinRightCol, "left")
      } else {
        // Keep real keys as-is; replace nulls with a random bucket value so
        // the skewed partition is split nullNumBuckets ways.
        underlying
          .withColumn(skewedTempColumn,
            when(joinLeftCol.isNotNull, joinLeftCol).otherwise(negativeRandomWithin(nullNumBuckets)))
          .join(right.where(joinRightCol.isNotNull), col(skewedTempColumn) === joinRightCol, "left")
      }
    }
  }

再次感谢@Mikhail Dubkov

你确定这些空记录上的连接有意义吗?有时,设置另一个处理空记录的任务更接近于获得有意义的结果。是的,表A中的连接键为空,但由于是左连接,我们需要该表中的其他有效列数据。我试图将NULL和非NULL数据拆分为不同的表,但处理时间反而在增加!
/** String-keyed entry point for the skew-aware left join.
  * Looks up the join column on the left (`underlying`) and right frames,
  * then hands off to the Column-based overload.
  */
def nullSkewLeftJoin(right: DataFrame,
                     usingColumn: String,
                     skewedColumnPostFix: String = "skewed_column",
                     nullNumBuckets: Int = 10000): DataFrame = {
  val lhsKey = underlying.col(usingColumn)
  val rhsKey = right.col(usingColumn)
  nullSkewLeftJoin(right, lhsKey, rhsKey, skewedColumnPostFix, nullNumBuckets)
}

    /** Skew-aware left join on explicit columns.
      *
      * Null keys on the left are replaced with bucket values produced by
      * `negativeRandomWithin` (defined elsewhere) so they distribute over
      * `nullNumBuckets` partitions; the right side drops null keys first.
      * The join happens on a distinctly named synthetic column, which also
      * sidesteps the "Reference ... is ambiguous" AnalysisException.
      */
    def nullSkewLeftJoin(right: DataFrame,
                         joinLeftCol: Column,
                         joinRightCol: Column,
                         skewedColumnPostFix: String ,
                         nullNumBuckets: Int): DataFrame = {
      // Synthetic join-column name, e.g. "col1_skewed_column".
      val skewedTempColumn = s"${joinLeftCol.toString()}_$skewedColumnPostFix"

      // Right-side rows with a null key can never match a left row.
      val rightNonNull = right.where(joinRightCol.isNotNull)

      // Add the synthetic column only once (case-insensitive check).
      val alreadyTagged = underlying.columns.exists(_ equalsIgnoreCase skewedTempColumn)
      val leftTagged =
        if (alreadyTagged) underlying
        else underlying.withColumn(
          skewedTempColumn,
          when(joinLeftCol.isNotNull, joinLeftCol).otherwise(negativeRandomWithin(nullNumBuckets)))

      leftTagged.join(rightNonNull, col(skewedTempColumn) === joinRightCol, "left")
    }
  }