Exporting a PySpark DataFrame to Azure Data Lake takes forever
The code below ran perfectly well on standalone PySpark 2.4 (Python 3.7) on Mac OS when the input data was small (about 6 GB). However, when I ran it on an HDInsight cluster (HDI 4.0, i.e. Python 3.5, PySpark 2.4, 4 worker nodes each with 64 cores and 432 GB of RAM, 2 head nodes each with 4 cores and 28 GB of RAM, Gen2 Data Lake) with the larger input data (169 GB), the last step, writing the data to the Data Lake, took forever (I killed the job after 24 hours of execution). Given that HDInsight is not that popular in the cloud computing community, I could only refer to posts complaining about slow writes of a DataFrame to S3. Some suggested repartitioning the dataset, which I did, but it did not help.
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, BooleanType
from pyspark.sql.functions import udf, regexp_extract, collect_set, array_remove, col, size, asc, desc
from pyspark.ml.fpm import FPGrowth
import os
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.5"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3.5"
def work(order_path, beer_path, corpus_path, output_path, FREQ_THRESHOLD=1000, LIFT_THRESHOLD=1):
print("Creating Spark Environment...")
spark = SparkSession.builder.appName("Menu").getOrCreate()
print("Spark Environment Created!")
print("Working on Checkpoint1...")
orders = spark.read.csv(order_path)
orders.createOrReplaceTempView("orders")
orders = spark.sql(
"SELECT _c14 AS order_id, _c31 AS in_menu_id, _c32 AS in_menu_name FROM orders"
)
orders.createOrReplaceTempView("orders")
beer = spark.read.csv(
beer_path,
header=True
)
beer.createOrReplaceTempView("beer")
beer = spark.sql(
"""
SELECT
order_id AS beer_order_id,
in_menu_id AS beer_in_menu_id,
'-999' AS beer_in_menu_name
FROM beer
"""
)
beer.createOrReplaceTempView("beer")
orders = spark.sql(
"""
WITH orders_beer AS (
SELECT *
FROM orders
LEFT JOIN beer
ON orders.in_menu_id = beer.beer_in_menu_id
)
SELECT
order_id,
in_menu_id,
CASE
WHEN beer_in_menu_name IS NOT NULL THEN beer_in_menu_name
WHEN beer_in_menu_name IS NULL THEN in_menu_name
END AS menu_name
FROM orders_beer
"""
)
print("Checkpoint1 Completed!")
print("Working on Checkpoint2...")
corpus = spark.read.csv(
corpus_path,
header=True
)
keywords = corpus.select("Food_Name").rdd.flatMap(lambda x: x).collect()
orders = orders.withColumn(
"keyword",
regexp_extract(
"menu_name",
"(?=^|\s)(" + "|".join(keywords) + ")(?=\s|$)",
0
)
)
orders.createOrReplaceTempView("orders")
orders = spark.sql("""
SELECT order_id, in_menu_id, keyword
FROM orders
WHERE keyword != ''
""")
orders.createOrReplaceTempView("orders")
orders = orders.groupBy("order_id").agg(
collect_set("keyword").alias("items")
)
print("Checkpoint2 Completed!")
print("Working on Checkpoint3...")
fpGrowth = FPGrowth(
itemsCol="items",
minSupport=0,
minConfidence=0
)
model = fpGrowth.fit(orders)
print("Checkpoint3 Completed!")
print("Working on Checkpoint4...")
frequency = model.freqItemsets
frequency = frequency.filter(col("freq") > FREQ_THRESHOLD)
frequency = frequency.withColumn(
"items",
array_remove("items", "-999")
)
frequency = frequency.filter(size(col("items")) > 0)
frequency = frequency.orderBy(asc("items"), desc("freq"))
frequency = frequency.dropDuplicates(["items"])
frequency = frequency.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(frequency.items)
)
frequency.createOrReplaceTempView("frequency")
lift = model.associationRules
lift = lift.drop("confidence")
lift = lift.filter(col("lift") > LIFT_THRESHOLD)
lift = lift.filter(
udf(
lambda x: x == ["-999"], BooleanType()
)(lift.consequent)
)
lift = lift.drop("consequent")
lift = lift.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(lift.antecedent)
)
lift.createOrReplaceTempView("lift")
result = spark.sql(
"""
SELECT lift.antecedent, freq AS frequency, lift
FROM lift
INNER JOIN frequency
ON lift.antecedent = frequency.antecedent
"""
)
print("Checkpoint4 Completed!")
print("Writing Result to Data Lake...")
result.repartition(1024).write.mode("overwrite").parquet(output_path)
print("All Done!")
def main():
work(
order_path="...",   # 169.1 GB of txt
beer_path="...",    # 4.9 GB of csv
corpus_path="...",  # 210 KB of csv
output_path="final_result.parquet"
)
if __name__ == "__main__":
main()
I originally thought this was caused by the Parquet file format; however, I ran into the same problem when I tried csv. I also tried result.count() to see how many rows the table result has, and getting the row count took just as long as writing the data to the Data Lake.
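A quick way to check whether the time is spent in the upstream computation rather than in the write itself is to inspect the plan and force the computation once (a sketch, not part of the original post; result is the DataFrame produced by the script above):

# Print the extended plan to spot expensive shuffles, joins and Python UDFs.
result.explain(True)
# See how many partitions the final stage produces.
print(result.rdd.getNumPartitions())
# Materialize once; if this alone takes hours, the bottleneck is the
# computation feeding the write, not the Data Lake itself.
result.cache()
print(result.count())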
It has been suggested to use a broadcast hash join instead of the default sort-merge join when a large dataset is joined with a small one. I thought it was worth trying, because the smaller samples in the pilot study told me that the row count of frequency is roughly 0.09% of that of lift (see the query below if you have trouble keeping track of frequency and lift). With that in mind, I modified my code (the full script follows the short note below):
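For reference, the same broadcast behavior can be requested with pyspark.sql.functions.broadcast instead of the .hint("broadcast") used in the script below (a minimal sketch with the same frequency and lift DataFrames):

from pyspark.sql.functions import broadcast

# Mark the small side (frequency) for broadcasting so Spark plans a
# broadcast hash join instead of the default sort-merge join.
result = lift.join(broadcast(frequency), ["antecedent"], "inner")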
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, BooleanType
from pyspark.sql.functions import udf, regexp_extract, collect_set, array_remove, col, size, asc, desc
from pyspark.ml.fpm import FPGrowth
import os
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.5"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3.5"
def work(order_path, beer_path, corpus_path, output_path, FREQ_THRESHOLD=1000, LIFT_THRESHOLD=1):
print("Creating Spark Environment...")
spark = SparkSession.builder.appName("Menu").getOrCreate()
print("Spark Environment Created!")
print("Working on Checkpoint1...")
orders = spark.read.csv(order_path)
orders.createOrReplaceTempView("orders")
orders = spark.sql(
"SELECT _c14 AS order_id, _c31 AS in_menu_id, _c32 AS in_menu_name FROM orders"
)
orders.createOrReplaceTempView("orders")
beer = spark.read.csv(
beer_path,
header=True
)
beer.createOrReplaceTempView("beer")
beer = spark.sql(
"""
SELECT
order_id AS beer_order_id,
in_menu_id AS beer_in_menu_id,
'-999' AS beer_in_menu_name
FROM beer
"""
)
beer.createOrReplaceTempView("beer")
orders = spark.sql(
"""
WITH orders_beer AS (
SELECT *
FROM orders
LEFT JOIN beer
ON orders.in_menu_id = beer.beer_in_menu_id
)
SELECT
order_id,
in_menu_id,
CASE
WHEN beer_in_menu_name IS NOT NULL THEN beer_in_menu_name
WHEN beer_in_menu_name IS NULL THEN in_menu_name
END AS menu_name
FROM orders_beer
"""
)
print("Checkpoint1 Completed!")
print("Working on Checkpoint2...")
corpus = spark.read.csv(
corpus_path,
header=True
)
keywords = corpus.select("Food_Name").rdd.flatMap(lambda x: x).collect()
orders = orders.withColumn(
"keyword",
regexp_extract(
"menu_name",
"(?=^|\s)(" + "|".join(keywords) + ")(?=\s|$)",
0
)
)
orders.createOrReplaceTempView("orders")
orders = spark.sql("""
SELECT order_id, in_menu_id, keyword
FROM orders
WHERE keyword != ''
""")
orders.createOrReplaceTempView("orders")
orders = orders.groupBy("order_id").agg(
collect_set("keyword").alias("items")
)
print("Checkpoint2 Completed!")
print("Working on Checkpoint3...")
fpGrowth = FPGrowth(
itemsCol="items",
minSupport=0,
minConfidence=0
)
model = fpGrowth.fit(orders)
print("Checkpoint3 Completed!")
print("Working on Checkpoint4...")
frequency = model.freqItemsets
frequency = frequency.filter(col("freq") > FREQ_THRESHOLD)
frequency = frequency.withColumn(
"antecedent",
array_remove("items", "-999")
)
frequency = frequency.drop("items")
frequency = frequency.filter(size(col("antecedent")) > 0)
frequency = frequency.orderBy(asc("antecedent"), desc("freq"))
frequency = frequency.dropDuplicates(["antecedent"])
frequency = frequency.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(frequency.antecedent)
)
lift = model.associationRules
lift = lift.drop("confidence")
lift = lift.filter(col("lift") > LIFT_THRESHOLD)
lift = lift.filter(
udf(
lambda x: x == ["-999"], BooleanType()
)(lift.consequent)
)
lift = lift.drop("consequent")
lift = lift.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(lift.antecedent)
)
result = lift.join(
frequency.hint("broadcast"),
["antecedent"],
"inner"
)
print("Checkpoint4 Completed!")
print("Writing Result to Data Lake...")
result.repartition(1024).write.mode("overwrite").parquet(output_path)
print("All Done!")
def main():
work(
order_path="...",   # 169.1 GB of txt
beer_path="...",    # 4.9 GB of csv
corpus_path="...",  # 210 KB of csv
output_path="final_result.parquet"
)
if __name__ == "__main__":
main()
This code ran perfectly well with the same sample data on my Mac OS and, as expected, took less time (34 seconds as opposed to 26 seconds). I then decided to run it on HDInsight with the full dataset. At the last step, writing the data to the Data Lake, the task failed and I was told the job was cancelled because the SparkContext was shut down. I am fairly new to big data and have no idea what that means; posts on the internet say there can be many reasons behind it. Whichever method I should use, how can I optimize my code so that I get the desired output in the Data Lake within a bearable amount of time?

I would try several things, ordered by the amount of energy they require:
- Check that the ADL storage is in the same region as your HDInsight cluster.
- Add calls to df = df.cache() after heavy computations, or even write the DataFrames out to storage and read them back between those computations.
- Replace your UDFs with "native" Spark code, since Python UDFs are one of Spark's best-known performance pitfalls (a sketch of the last two points follows this list).
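A minimal sketch of what the last two suggestions could look like in this script (illustrative only; the column names come from the question's code and the checkpoint path is hypothetical): the Python UDFs that compare consequent to ["-999"] and that sort-and-join the antecedent array can be replaced with built-in functions available in Spark 2.4, and intermediate results can be cached in memory or checkpointed to storage.

from pyspark.sql.functions import array, array_join, array_sort, col, lit

# Native replacements for the Python UDFs used in the script (Spark >= 2.4):
#   udf(lambda x: x == ["-999"])        ->  col("consequent") == array(lit("-999"))
#   udf(lambda x: "|".join(sorted(x)))  ->  array_join(array_sort("antecedent"), "|")
lift = lift.filter(col("consequent") == array(lit("-999"))).drop("consequent")
lift = lift.withColumn("antecedent", array_join(array_sort("antecedent"), "|"))

# Cache a DataFrame after a heavy computation ...
orders = orders.cache()
# ... or write it out and read it back as a manual checkpoint (path is made up).
# orders.write.mode("overwrite").parquet("abfss://container@account.dfs.core.windows.net/tmp/orders")
# orders = spark.read.parquet("abfss://container@account.dfs.core.windows.net/tmp/orders")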
After five days of struggle, I finally figured it out. Here is how I optimized the code; the execution time dropped from more than 24 hours to around 10 minutes, so code optimization really matters:

- As David Taub pointed out above, use df.cache() after heavy computations or before feeding the data to the model. I used df.cache().count(), since .cache() on its own is evaluated lazily, while the trailing .count() forces evaluation of the whole dataset (see the sketch after this list).
- Use … instead of regular expressions to extract the keywords. This greatly improved performance (one regex-free option is sketched after this list).
- Be careful with joins/merges; they can become very slow because of data skew, so always look for ways to avoid unnecessary joins.
- Set minSupport for FPGrowth. This significantly shortens the time spent in model.freqItemsets.
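A minimal sketch of three of the items above, with illustrative values (the post does not give the exact minSupport used, and array_intersect is only one possible regex-free way to match keywords, assuming menu names are whitespace-separated):

from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import array, array_intersect, lit, size, split

# .cache() on its own is lazy; the trailing .count() forces the whole
# DataFrame to be materialized once, so later stages reuse it.
orders = orders.cache()
orders.count()

# Regex-free keyword matching: split the menu name on whitespace and
# intersect it with the keyword list loaded from the corpus.
keyword_array = array(*[lit(k) for k in keywords])
orders = orders.withColumn(
    "keywords", array_intersect(split("menu_name", r"\s+"), keyword_array)
).filter(size("keywords") > 0)

# Setting minSupport above zero prunes rare itemsets early and makes
# model.freqItemsets far cheaper; 0.001 is an illustrative value.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0)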
In the comments: Thanks for the quick reply, I will check the suggestions one by one. The storage account used when creating the cluster is the same as the one in my output path, but they do not share the same blob container; does that affect performance? The account and the region are not the same. And yes, caching did help me, thanks!

Here is the final version of the code:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, BooleanType
from pyspark.sql.functions import udf, regexp_extract, collect_set, array_remove, col, size, asc, desc
from pyspark.ml.fpm import FPGrowth
import os
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.5"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3.5"
def work(order_path, beer_path, corpus_path, output_path, FREQ_THRESHOLD=1000, LIFT_THRESHOLD=1):
print("Creating Spark Environment...")
spark = SparkSession.builder.appName("Menu").getOrCreate()
print("Spark Environment Created!")
print("Working on Checkpoint1...")
orders = spark.read.csv(order_path)
orders.createOrReplaceTempView("orders")
orders = spark.sql(
"SELECT _c14 AS order_id, _c31 AS in_menu_id, _c32 AS in_menu_name FROM orders"
)
orders.createOrReplaceTempView("orders")
beer = spark.read.csv(
beer_path,
header=True
)
beer.createOrReplaceTempView("beer")
beer = spark.sql(
"""
SELECT
order_id AS beer_order_id,
in_menu_id AS beer_in_menu_id,
'-999' AS beer_in_menu_name
FROM beer
"""
)
beer.createOrReplaceTempView("beer")
orders = spark.sql(
"""
WITH orders_beer AS (
SELECT *
FROM orders
LEFT JOIN beer
ON orders.in_menu_id = beer.beer_in_menu_id
)
SELECT
order_id,
in_menu_id,
CASE
WHEN beer_in_menu_name IS NOT NULL THEN beer_in_menu_name
WHEN beer_in_menu_name IS NULL THEN in_menu_name
END AS menu_name
FROM orders_beer
"""
)
print("Checkpoint1 Completed!")
print("Working on Checkpoint2...")
corpus = spark.read.csv(
corpus_path,
header=True
)
keywords = corpus.select("Food_Name").rdd.flatMap(lambda x: x).collect()
orders = orders.withColumn(
"keyword",
regexp_extract(
"menu_name",
"(?=^|\s)(" + "|".join(keywords) + ")(?=\s|$)",
0
)
)
orders.createOrReplaceTempView("orders")
orders = spark.sql("""
SELECT order_id, in_menu_id, keyword
FROM orders
WHERE keyword != ''
""")
orders.createOrReplaceTempView("orders")
orders = orders.groupBy("order_id").agg(
collect_set("keyword").alias("items")
)
print("Checkpoint2 Completed!")
print("Working on Checkpoint3...")
fpGrowth = FPGrowth(
itemsCol="items",
minSupport=0,
minConfidence=0
)
model = fpGrowth.fit(orders)
print("Checkpoint3 Completed!")
print("Working on Checkpoint4...")
frequency = model.freqItemsets
frequency = frequency.filter(col("freq") > FREQ_THRESHOLD)
frequency = frequency.withColumn(
"antecedent",
array_remove("items", "-999")
)
frequency = frequency.drop("items")
frequency = frequency.filter(size(col("antecedent")) > 0)
frequency = frequency.orderBy(asc("antecedent"), desc("freq"))
frequency = frequency.dropDuplicates(["antecedent"])
frequency = frequency.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(frequency.antecedent)
)
lift = model.associationRules
lift = lift.drop("confidence")
lift = lift.filter(col("lift") > LIFT_THRESHOLD)
lift = lift.filter(
udf(
lambda x: x == ["-999"], BooleanType()
)(lift.consequent)
)
lift = lift.drop("consequent")
lift = lift.withColumn(
"antecedent",
udf(
lambda x: "|".join(sorted(x)), StringType()
)(lift.antecedent)
)
result = lift.join(
frequency.hint("broadcast"),
["antecedent"],
"inner"
)
print("Checkpoint4 Completed!")
print("Writing Result to Data Lake...")
result.repartition(1024).write.mode("overwrite").parquet(output_path)
print("All Done!")
def main():
work(
order_path="...",   # 169.1 GB of txt
beer_path="...",    # 4.9 GB of csv
corpus_path="...",  # 210 KB of csv
output_path="final_result.parquet"
)
if __name__ == "__main__":
main()