How can we JOIN two Spark SQL DataFrames using a SQL-style "LIKE" criterion?


We are using the PySpark libraries, interfacing with Spark 1.3.1.

We have two DataFrames, documents_df := {document_id, document_text} and keywords_df := {keyword}. We would like to JOIN the two DataFrames and return a resulting DataFrame with {document_id, keyword} pairs, using the criterion that the keyword appears in the document's document_text string.

For example, in PostgreSQL we could achieve this using an ON clause of the form:

document_df.document_text LIKE '%' || keyword || '%'

However, in PySpark I cannot get any form of join syntax like this to work. Has anybody achieved something like this before?

Kind regards,


Will
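
(Not part of the original question: on more recent Spark versions the LIKE-style predicate can be expressed directly as a non-equi join condition, although, as the answer below explains, evaluating it still implies a Cartesian-style plan. A minimal sketch, assuming a Spark version where Column.contains is available, which 1.3.1 is not:)

# Hypothetical sketch, not from the original post: a direct non-equi join
# condition (requires a Spark version with Column.contains).
matches = (documents_df
    .join(keywords_df,
          documents_df["document_text"].contains(keywords_df["keyword"]))
    .select("document_id", "keyword"))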

It is possible in two different ways, but generally speaking it is not recommended. First, let's create some dummy data:

from pyspark.sql import Row

document_row = Row("document_id", "document_text")
keyword_row = Row("keyword") 

documents_df = sc.parallelize([
    document_row(1L, "apache spark is the best"),
    document_row(2L, "erlang rocks"),
    document_row(3L, "but haskell is better")
]).toDF()

keywords_df = sc.parallelize([
    keyword_row("erlang"),
    keyword_row("haskell"),
    keyword_row("spark")
]).toDF()
  • Hive UDF

    documents_df.registerTempTable("documents")
    keywords_df.registerTempTable("keywords")
    
    query = """SELECT document_id, keyword
        FROM documents JOIN keywords
        ON document_text LIKE CONCAT('%', keyword, '%')"""
    
    like_with_hive_udf = sqlContext.sql(query)
    like_with_hive_udf.show()
    
    ## +-----------+-------+
    ## |document_id|keyword|
    ## +-----------+-------+
    ## |          1|  spark|
    ## |          2| erlang|
    ## |          3|haskell|
    ## +-----------+-------+
    
  • Python UDF

    from pyspark.sql.functions import udf, col 
    from pyspark.sql.types import BooleanType
    
    # Or you can replace `in` with a regular expression
    contains = udf(lambda s, q: q in s, BooleanType())
    
    like_with_python_udf = (documents_df.join(keywords_df)
        .where(contains(col("document_text"), col("keyword")))
        .select(col("document_id"), col("keyword")))
    like_with_python_udf.show()
    
    ## +-----------+-------+
    ## |document_id|keyword|
    ## +-----------+-------+
    ## |          1|  spark|
    ## |          2| erlang|
    ## |          3|haskell|
    ## +-----------+-------+
    
  • Why is it not recommended? Because in both cases it requires a Cartesian product:

    like_with_hive_udf.explain()
    
    ## TungstenProject [document_id#2L,keyword#4]
    ##  Filter document_text#3 LIKE concat(%,keyword#4,%)
    ##   CartesianProduct
    ##    Scan PhysicalRDD[document_id#2L,document_text#3]
    ##    Scan PhysicalRDD[keyword#4]
    
    like_with_python_udf.explain()
    
    ## TungstenProject [document_id#2L,keyword#4]
    ##  Filter pythonUDF#13
    ##   !BatchPythonEvaluation PythonUDF#<lambda>(document_text#3,keyword#4), ...
    ##    CartesianProduct
    ##     Scan PhysicalRDD[document_id#2L,document_text#3]
    ##     Scan PhysicalRDD[keyword#4]
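
  • Tokenize and join on tokens - the code that defines like_with_tokenizer, whose plan is shown below, did not survive in this text. A minimal reconstruction (assuming pyspark.ml.feature.Tokenizer plus pyspark.sql.functions.explode, the latter available from Spark 1.4) could look like this:

    from pyspark.ml.feature import Tokenizer
    from pyspark.sql.functions import explode

    # Tokenizer lowercases the text and splits it on whitespace.
    tokenizer = Tokenizer(inputCol="document_text", outputCol="words")

    # One row per (document_id, token).
    tokenized = (tokenizer.transform(documents_df)
        .select(col("document_id"), explode(col("words")).alias("token")))

    # Plain equi-join on the exploded tokens.
    like_with_tokenizer = (tokenized
        .join(keywords_df, col("token") == col("keyword"))
        .drop("token"))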
    
    This approach requires a shuffle, but no Cartesian product:

    like_with_tokenizer.explain()
    
    ## TungstenProject [document_id#2L,keyword#4]
    ##  SortMergeJoin [token#29], [keyword#4]
    ##   TungstenSort [token#29 ASC], false, 0
    ##    TungstenExchange hashpartitioning(token#29)
    ##     TungstenProject [document_id#2L,token#29]
    ##      !Generate explode(words#27), true, false, [document_id#2L, ...
    ##       ConvertToSafe
    ##        TungstenProject [document_id#2L,UDF(document_text#3) AS words#27]
    ##         Scan PhysicalRDD[document_id#2L,document_text#3]
    ##   TungstenSort [keyword#4 ASC], false, 0
    ##    TungstenExchange hashpartitioning(keyword#4)
    ##     ConvertToUnsafe
    ##      Scan PhysicalRDD[keyword#4]
    
  • Python UDF and a broadcast variable - if the list of keywords is relatively small

    from pyspark.sql.types import ArrayType, StringType
    from pyspark.sql.functions import explode
    
    keywords = sc.broadcast(set(
        keywords_df.map(lambda row: row[0]).collect()))
    
    bd_contains = udf(
        lambda s: list(set(s.split()) & keywords.value), 
        ArrayType(StringType()))
    
    
    like_with_bd = (documents_df.select(
        col("document_id"), 
        explode(bd_contains(col("document_text"))).alias("keyword")))
    
    like_with_bd.show()
    
    ## +-----------+-------+
    ## |document_id|keyword|
    ## +-----------+-------+
    ## |          1|  spark|
    ## |          2| erlang|
    ## |          3|haskell|
    ## +-----------+-------+
    
    It requires neither a shuffle nor a Cartesian product, but you still have to transfer the broadcast variable to each worker node.

    like_with_bd.explain()
    
    ## TungstenProject [document_id#2L,keyword#46]
    ##  !Generate explode(pythonUDF#47), true, false, ...
    ##   ConvertToSafe
    ##    TungstenProject [document_id#2L,pythonUDF#47]
    ##     !BatchPythonEvaluation PythonUDF#<lambda>(document_text#3), ...
    ##      Scan PhysicalRDD[document_id#2L,document_text#3]
    
  • Broadcast join - if one side of the tokenized equi-join is small enough, marking it with functions.broadcast replaces the sort-merge join and its shuffle with a BroadcastHashJoin (here, reusing tokenized from above, the tokenized documents are broadcast):

    from pyspark.sql.functions import broadcast

    like_with_tokenizer_and_bd = (broadcast(tokenized)
        .join(keywords_df, col("token") == col("keyword"))
        .drop("token"))

    like_with_tokenizer_and_bd.explain()

    ## TungstenProject [document_id#3L,keyword#5]
    ##  BroadcastHashJoin [token#10], [keyword#5], BuildLeft
    ##   TungstenProject [document_id#3L,token#10]
    ##    !Generate explode(words#8), true, false, ...
    ##     ConvertToSafe
    ##      TungstenProject [document_id#3L,UDF(document_text#4) AS words#8]
    ##       Scan PhysicalRDD[document_id#3L,document_text#4]
    ##   ConvertToUnsafe
    ##    Scan PhysicalRDD[keyword#5]

  • Related

    • For approximate matching, see:

This is an excellent and instructive answer - thank you for taking the time to write something so comprehensive. You have not only answered my question, I have also learned a lot of things I did not know were possible. I am going to use the broadcast-variable approach, since the keyword list is small. Problem solved!

One question, @zero323: the explode() function was only introduced in Spark 1.4, and I am (for now) stuck on 1.3.1. Is it possible to embed the UDF within a map() function, so that multiple rows can be returned for each input row (i.e. one row per matching keyword)?

Solved! For reference:

    like_with_bd = documents_df.select(col("document_id"), bd_contains(col("document_text")).alias("keyword")).flatMap(lambda row: [(kw, row[0]) for kw in row[1]])

You can close the question by accepting the answer - that will encourage other people to answer as well! And if you still have questions, you can always update the question :)
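
The one-liner above, reformatted as an illustrative sketch (the names like_with_bd_13 and like_with_bd_13_df are not from the original thread). Since explode() is unavailable in Spark 1.3, the keyword array produced by bd_contains is flattened with DataFrame.flatMap, which returns an RDD of tuples rather than a DataFrame:

# Reuses bd_contains from the broadcast-variable example above.
like_with_bd_13 = (documents_df
    .select(col("document_id"), bd_contains(col("document_text")).alias("keyword"))
    .flatMap(lambda row: [(kw, row[0]) for kw in row[1]]))

# Convert the RDD of (keyword, document_id) tuples back to a DataFrame if needed.
like_with_bd_13_df = sqlContext.createDataFrame(like_with_bd_13, ["keyword", "document_id"])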

For approximate matching, another way of doing it is the following (a bit slow, but accurate):

from pyspark.sql.functions import udf, col
from pyspark.sql.types import BooleanType

from pyspark.sql import Row

import numpy as np


def string_match_percentage(col_1, col_2, confidence):
    s = col_1.lower()
    t = col_2.lower()

    rows = len(s) + 1
    cols = len(t) + 1
    # Dynamic-programming table for the edit distance between s and t.
    array_difference = np.zeros((rows, cols), dtype=int)

    # First column and first row: cost of deleting/inserting every character.
    for i in range(1, rows):
        array_difference[i][0] = i
    for k in range(1, cols):
        array_difference[0][k] = k

    # Edit distance with a substitution cost of 2
    # (a substitution counts as one deletion plus one insertion).
    for col_idx in range(1, cols):
        for row_idx in range(1, rows):
            if s[row_idx - 1] == t[col_idx - 1]:
                cost = 0
            else:
                cost = 2
            array_difference[row_idx][col_idx] = min(
                array_difference[row_idx - 1][col_idx] + 1,
                array_difference[row_idx][col_idx - 1] + 1,
                array_difference[row_idx - 1][col_idx - 1] + cost)

    distance = array_difference[rows - 1][cols - 1]
    match_percentage = ((len(s) + len(t)) - distance) / (len(s) + len(t)) * 100
    if match_percentage >= confidence:
        return True
    else:
        return False
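
# Illustrative sanity check (not part of the original answer): with a substitution
# cost of 2 the distance equals len(s) + len(t) - 2 * LCS(s, t), so comparing
# "google llc" with "google" gives (16 - 4) / 16 * 100 = 75% >= 70 -> True.
assert string_match_percentage("google llc", "google", confidence=70)
assert not string_match_percentage("blackfiled llc", "google", confidence=70)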
        
        
document_row = Row("document_id", "document_text")
keyword_row = Row("keyword")

documents_df = sc.parallelize([
    document_row(1, "google llc"),
    document_row(2, "blackfiled llc"),
    document_row(3, "yahoo llc")
]).toDF()

keywords_df = sc.parallelize([
    keyword_row("yahoo"),
    keyword_row("google"),
    keyword_row("apple")
]).toDF()

conditional_contains = udf(lambda s, q: string_match_percentage(s, q, confidence=70), BooleanType())

like_joined_df = (documents_df.crossJoin(keywords_df)
    .where(conditional_contains(col("document_text"), col("keyword")))
    .select(col("document_id"), col("keyword"), col("document_text")))
like_joined_df.show()

# +-----------+-------+-------------+
# |document_id|keyword|document_text|
# +-----------+-------+-------------+
# |          1| google|   google llc|
# |          3|  yahoo|    yahoo llc|
# +-----------+-------+-------------+