Exporting a PySpark DataFrame to Azure Data Lake takes a very long time

The code below runs perfectly fine on a standalone PySpark 2.4 installation (Python 3.7) on Mac OS when the input data is small (about 6 GB). However, when I run it on an HDInsight cluster (HDI 4.0, i.e. Python 3.5, PySpark 2.4, 4 worker nodes each with 64 cores and 432 GB of RAM, 2 head nodes each with 4 cores and 28 GB of RAM, a Gen2 data lake) with the larger input data (169 GB), the last step, writing the data to the data lake, took forever (I killed it after 24 hours of execution). Given that HDInsight is not popular in the cloud computing community, I could only refer to posts that complained about slow writes of DataFrames to S3. Some suggested repartitioning the dataset, which I did, but it did not help.

from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, BooleanType
from pyspark.sql.functions import udf, regexp_extract, collect_set, array_remove, col, size, asc, desc
from pyspark.ml.fpm import FPGrowth
import os
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.5"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3.5"

def work(order_path, beer_path, corpus_path, output_path, FREQ_THRESHOLD=1000, LIFT_THRESHOLD=1):
    print("Creating Spark Environment...")
    spark = SparkSession.builder.appName("Menu").getOrCreate()
    print("Spark Environment Created!")
    print("Working on Checkpoint1...")
    orders = spark.read.csv(order_path)
    orders.createOrReplaceTempView("orders")
    orders = spark.sql(
        "SELECT _c14 AS order_id, _c31 AS in_menu_id, _c32 AS in_menu_name FROM orders"
    )
    orders.createOrReplaceTempView("orders")
    beer = spark.read.csv(
        beer_path,
        header=True
    )
    beer.createOrReplaceTempView("beer")
    beer = spark.sql(
        """
        SELECT 
            order_id AS beer_order_id,
            in_menu_id AS beer_in_menu_id,
            '-999' AS beer_in_menu_name
        FROM beer
        """
    )
    beer.createOrReplaceTempView("beer")
    orders = spark.sql(
        """
        WITH orders_beer AS (
            SELECT *
            FROM orders
            LEFT JOIN beer
            ON orders.in_menu_id = beer.beer_in_menu_id
        )
        SELECT
            order_id,
            in_menu_id,
            CASE
                WHEN beer_in_menu_name IS NOT NULL THEN beer_in_menu_name
                WHEN beer_in_menu_name IS NULL THEN in_menu_name
            END AS menu_name
        FROM orders_beer
        """
    )
    print("Checkpoint1 Completed!")
    print("Working on Checkpoint2...")
    corpus = spark.read.csv(
        corpus_path,
        header=True
    )
    keywords = corpus.select("Food_Name").rdd.flatMap(lambda x: x).collect()
    orders = orders.withColumn(
        "keyword", 
        regexp_extract(
            "menu_name", 
            "(?=^|\s)(" + "|".join(keywords) + ")(?=\s|$)", 
            0
        )
    )
    orders.createOrReplaceTempView("orders")
    orders = spark.sql("""
        SELECT order_id, in_menu_id, keyword
        FROM orders
        WHERE keyword != ''
    """)
    orders.createOrReplaceTempView("orders")
    orders = orders.groupBy("order_id").agg(
        collect_set("keyword").alias("items")
    )
    print("Checkpoint2 Completed!")
    print("Working on Checkpoint3...")
    fpGrowth = FPGrowth(
        itemsCol="items", 
        minSupport=0, 
        minConfidence=0
    )
    model = fpGrowth.fit(orders)
    print("Checkpoint3 Completed!")
    print("Working on Checkpoint4...")
    frequency = model.freqItemsets
    frequency = frequency.filter(col("freq") > FREQ_THRESHOLD)
    frequency = frequency.withColumn(
        "items", 
        array_remove("items", "-999")
    )
    frequency = frequency.filter(size(col("items")) > 0)
    frequency = frequency.orderBy(asc("items"), desc("freq"))
    frequency = frequency.dropDuplicates(["items"])
    frequency = frequency.withColumn(
        "antecedent", 
        udf(
            lambda x: "|".join(sorted(x)), StringType()
        )(frequency.items)
    )
    frequency.createOrReplaceTempView("frequency")
    lift = model.associationRules
    lift = lift.drop("confidence")
    lift = lift.filter(col("lift") > LIFT_THRESHOLD)
    lift = lift.filter(
        udf(
            lambda x: x == ["-999"], BooleanType()
        )(lift.consequent)
    )
    lift = lift.drop("consequent")
    lift = lift.withColumn(
        "antecedent", 
        udf(
            lambda x: "|".join(sorted(x)), StringType()
        )(lift.antecedent)
    )
    lift.createOrReplaceTempView("lift")
    result = spark.sql(
        """
        SELECT lift.antecedent, freq AS frequency, lift
        FROM lift
        INNER JOIN frequency
        ON lift.antecedent = frequency.antecedent
        """
    )
    print("Checkpoint4 Completed!")
    print("Writing Result to Data Lake...")
    result.repartition(1024).write.mode("overwrite").parquet(output_path)
    print("All Done!")

def main():
    work(
        order_path=...,   # 169.1 GB of txt
        beer_path=...,    # 4.9 GB of csv
        corpus_path=...,  # 210 KB of csv
        output_path="final_result.parquet"
    )

if __name__ == "__main__":
    main()
I initially thought this was caused by the Parquet file format. However, when I tried csv, I ran into the same problem. I tried result.count() to see how many rows the table result has; getting the row count took forever, just like writing the data to the data lake. Someone suggested using a broadcast hash join instead of the default sort-merge join when a large dataset is joined with a small one. I thought it was worth a try, because the smaller sample in my pilot study told me that frequency has roughly 0.09% as many rows as lift (see the queries below if you have trouble keeping track of frequency and lift).
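
In Spark 2.4, a broadcast hash join can be requested either with the DataFrame hint used in the modified code below or by wrapping the small side in pyspark.sql.functions.broadcast. A minimal sketch of the latter, assuming the lift and frequency DataFrames defined above:

from pyspark.sql.functions import broadcast

# Ship the small `frequency` table to every executor so the join on
# `antecedent` becomes a broadcast hash join instead of a shuffle-heavy
# sort-merge join.
result = lift.join(broadcast(frequency), on=["antecedent"], how="inner")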

With this in mind, I modified my code:

from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, BooleanType
from pyspark.sql.functions import udf, regexp_extract, collect_set, array_remove, col, size, asc, desc
from pyspark.ml.fpm import FPGrowth
import os
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.5"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3.5"

def work(order_path, beer_path, corpus_path, output_path, FREQ_THRESHOLD=1000, LIFT_THRESHOLD=1):
    print("Creating Spark Environment...")
    spark = SparkSession.builder.appName("Menu").getOrCreate()
    print("Spark Environment Created!")
    print("Working on Checkpoint1...")
    orders = spark.read.csv(order_path)
    orders.createOrReplaceTempView("orders")
    orders = spark.sql(
        "SELECT _c14 AS order_id, _c31 AS in_menu_id, _c32 AS in_menu_name FROM orders"
    )
    orders.createOrReplaceTempView("orders")
    beer = spark.read.csv(
        beer_path,
        header=True
    )
    beer.createOrReplaceTempView("beer")
    beer = spark.sql(
        """
        SELECT 
            order_id AS beer_order_id,
            in_menu_id AS beer_in_menu_id,
            '-999' AS beer_in_menu_name
        FROM beer
        """
    )
    beer.createOrReplaceTempView("beer")
    orders = spark.sql(
        """
        WITH orders_beer AS (
            SELECT *
            FROM orders
            LEFT JOIN beer
            ON orders.in_menu_id = beer.beer_in_menu_id
        )
        SELECT
            order_id,
            in_menu_id,
            CASE
                WHEN beer_in_menu_name IS NOT NULL THEN beer_in_menu_name
                WHEN beer_in_menu_name IS NULL THEN in_menu_name
            END AS menu_name
        FROM orders_beer
        """
    )
    print("Checkpoint1 Completed!")
    print("Working on Checkpoint2...")
    corpus = spark.read.csv(
        corpus_path,
        header=True
    )
    keywords = corpus.select("Food_Name").rdd.flatMap(lambda x: x).collect()
    orders = orders.withColumn(
        "keyword", 
        regexp_extract(
            "menu_name", 
            "(?=^|\s)(" + "|".join(keywords) + ")(?=\s|$)", 
            0
        )
    )
    orders.createOrReplaceTempView("orders")
    orders = spark.sql("""
        SELECT order_id, in_menu_id, keyword
        FROM orders
        WHERE keyword != ''
    """)
    orders.createOrReplaceTempView("orders")
    orders = orders.groupBy("order_id").agg(
        collect_set("keyword").alias("items")
    )
    print("Checkpoint2 Completed!")
    print("Working on Checkpoint3...")
    fpGrowth = FPGrowth(
        itemsCol="items", 
        minSupport=0, 
        minConfidence=0
    )
    model = fpGrowth.fit(orders)
    print("Checkpoint3 Completed!")
    print("Working on Checkpoint4...")
    frequency = model.freqItemsets
    frequency = frequency.filter(col("freq") > FREQ_THRESHOLD)
    frequency = frequency.withColumn(
        "antecedent", 
        array_remove("items", "-999")
    )
    frequency = frequency.drop("items")
    frequency = frequency.filter(size(col("antecedent")) > 0)
    frequency = frequency.orderBy(asc("antecedent"), desc("freq"))
    frequency = frequency.dropDuplicates(["antecedent"])
    frequency = frequency.withColumn(
        "antecedent", 
        udf(
            lambda x: "|".join(sorted(x)), StringType()
        )(frequency.antecedent)
    )
    lift = model.associationRules
    lift = lift.drop("confidence")
    lift = lift.filter(col("lift") > LIFT_THRESHOLD)
    lift = lift.filter(
        udf(
            lambda x: x == ["-999"], BooleanType()
        )(lift.consequent)
    )
    lift = lift.drop("consequent")
    lift = lift.withColumn(
        "antecedent", 
        udf(
            lambda x: "|".join(sorted(x)), StringType()
        )(lift.antecedent)
    )
    result = lift.join(
        frequency.hint("broadcast"), 
        ["antecedent"], 
        "inner"
    )
    print("Checkpoint4 Completed!")
    print("Writing Result to Data Lake...")
    result.repartition(1024).write.mode("overwrite").parquet(output_path)
    print("All Done!")

def main():
    work(
        order_path=...,   # 169.1 GB of txt
        beer_path=...,    # 4.9 GB of csv
        corpus_path=...,  # 210 KB of csv
        output_path="final_result.parquet"
    )

if __name__ == "__main__":
    main()

This code ran perfectly fine with the same sample data on my Mac OS and, as expected, took less time (34 seconds vs. 26 seconds). Then I decided to run the code on HDInsight with the full dataset. At the last step, writing the data to the data lake, the task failed and I was told the job was cancelled because the SparkContext was shut down. I am fairly new to big data and have no idea what this means. Posts on the internet said there could be many reasons behind it. Whichever method I should use, how can I optimize my code so that I get the desired output in the data lake within a bearable amount of time?

I would try several things, ordered by the amount of effort they require:

  • Check that the ADL storage is in the same region as your HDInsight cluster.
  • Add calls to df = df.cache() after the heavy computations, or even write the DataFrames out to storage and read them back in between those computations.
  • Replace your UDFs with "native" Spark code, since UDFs are one of Spark's well-known performance pitfalls (see the sketch below).
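
For the last point, both Python UDFs in the question can be expressed with built-in column functions (concat_ws has been available for a long time; array_sort was added in Spark 2.4). A rough sketch against the lift and frequency DataFrames from the modified code, not a drop-in replacement:

from pyspark.sql.functions import array, array_sort, col, concat_ws, lit

# udf(lambda x: "|".join(sorted(x)))  ->  sort the array and join it natively
frequency = frequency.withColumn(
    "antecedent", concat_ws("|", array_sort(col("antecedent")))
)
lift = lift.withColumn(
    "antecedent", concat_ws("|", array_sort(col("antecedent")))
)

# udf(lambda x: x == ["-999"])  ->  compare against an array literal
lift = lift.filter(col("consequent") == array(lit("-999")))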

    • After five days of struggle, I finally got my answer. Here is how I optimized the code. The execution time dropped from more than 24 hours to around 10 minutes. Code optimization matters a lot.

    • As David Taub pointed out below, use df.cache() after heavy computation or before feeding the data to the model. I used df.cache().count(), since calling .cache() alone is evaluated lazily, while the subsequent .count() forces evaluation of the whole dataset.
    • Extract the keywords without regular expressions. This greatly improved the performance of the code.
    • Be careful with joins/merges. They can become very slow because of data skew. Always look for ways to avoid unnecessary joins.
    • Set minSupport for FPGrowth. This significantly reduces the time spent on calling model.freqItemsets (a combined sketch of the caching and minSupport points follows this list).
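
A minimal sketch of the caching and minSupport points around the model fit; the minSupport value here is only an illustrative placeholder, not the value actually used:

from pyspark.ml.fpm import FPGrowth

# Materialize the prepared orders once so the CSV scans, joins and keyword
# extraction are not recomputed by every downstream stage.
orders = orders.cache()
orders.count()  # .cache() alone is lazy; count() forces the evaluation

# A non-zero minSupport prunes rare itemsets early and keeps
# model.freqItemsets small.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0)
model = fpGrowth.fit(orders)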

    • Thanks for the quick reply. I will check them one by one. The storage account used when the cluster was created is the same as the storage account of my output path; however, they do not share the same blob container. Does that affect performance? The account and the region are not the same. Yes, caching really helped me! Thanks.