Apache Spark / PySpark: modifying a class attribute with spark.sql(...).rdd.foreach()

The main task is to connect to Hive and read data with a Spark RDD.

I have tried the code below. Connecting and reading both succeed, but when I try to modify the value of self.jobUserProfile, it fails. I print the value at three positions, marked #1, #2 and #3. At position #1 the value is correct, but at positions #2 and #3 the dict is empty. It seems the modification is never assigned to the class attribute.

I have also tried response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10').collect() and iterating over the returned rows, but when the data volume gets large the performance may suffer.
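A minimal sketch of that collect-and-iterate approach (assuming spark is the session returned by connectHive, as in the code below, and the same two columns):

import collections

jobUserProfile = collections.defaultdict(dict)
# collect() is an action: it brings all rows back to the driver as Python Row objects
rows = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10').collect()
for row in rows:  # plain driver-side loop, no executors involved
    jobUserProfile[row['userid']] = {'userid': row['userid'], 'logtime': row['logtime']}
print(jobUserProfile)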

When I change response.rdd.foreach(lambda x: self.readLoginFunction(x)) to response.rdd.map(lambda x: self.readLoginFunction(x)), the target value is empty at all three positions.

I am new to Spark. Any suggestion would help. Thanks in advance.

from analysis.common.db.hive.connectHive import *
import collections

class OperateHive():
    def __init__(self):
        self.jobUserProfile = collections.defaultdict(dict)

    def readLoginFunction(self, e):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        self.jobUserProfile[e[0]] = dic
        print(self.jobUserProfile)  #1

    def readLogin(self, spark):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.foreach(lambda x: self.readLoginFunction(x))
        print(self.jobUserProfile)  #2

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    operateHive = OperateHive()
    operateHive.readLogin(spark)
    print(operateHive.jobUserProfile) #3
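
As far as I can tell, foreach ships a pickled copy of the closure (including self) to the executor processes, so the updates at #1 happen on those copies and never reach the driver-side object printed at #2 and #3. A standalone sketch of the same effect (hypothetical data, local mode):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[2]').getOrCreate()
seen = {}
spark.sparkContext.parallelize([('u1', 't1'), ('u2', 't2')]).foreach(
    lambda e: seen.update({e[0]: e[1]}))  # runs on executor-side copies of `seen`
print(seen)  # {} on the driver -- the updates never come back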

Finally, the code below works:

from analysis.common.db.hive.connectHive import *
import collections

class OperateHive():
    def readLoginFunction(self, e,jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile, devAppProfile

    def readLogin(self, spark, jobUserProfile,devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.top(1)[0][0]

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
    print(jobUserProfile)
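
My understanding is that top() is an action, so rdd1.top(1) is what forces the lazy map to run and returns its results to the driver. A small sketch of that, assuming spark is the SparkSession created by connectHive:

nums = spark.sparkContext.parallelize([3, 1, 2])
doubled = nums.map(lambda x: x * 2)  # transformation only, nothing has run yet
print(doubled.top(1))                # [6] -- the action triggers evaluation and returns data to the driver
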
But when I remove devAppProfile, as in the code below:

from analysis.common.db.hive.connectHive import *
import collections

class OperateHive():
    def readLoginFunction(self, e,jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile

    def readLogin(self, spark, jobUserProfile,devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))


if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    operateHive.readLogin(spark, jobUserProfile, devAppProfile)
Since print(jobUserProfile) inside readLoginFunction prints nothing, rdd.map apparently never runs here.
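
Presumably this is because transformations such as map() are lazy: without an action nothing is executed, so readLoginFunction (and its print) never runs. A minimal sketch of the same effect, assuming the same spark session:

nums = spark.sparkContext.parallelize([1, 2, 3])
mapped = nums.map(lambda x: print(x) or x)  # nothing is printed here; only a plan is built
mapped.count()  # an action triggers evaluation; the prints happen in the worker processes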

Then I modified the code as below, and it works again:

from analysis.common.db.hive.connectHive import *
import collections

class OperateHive():
    def readLoginFunction(self, e,jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile

    def readLogin(self, spark, jobUserProfile,devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.collect()[-1]

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
    print(jobUserProfile)
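
collect() is likewise an action: it evaluates the pipeline and returns every mapped element to the driver as a list, and collect()[-1] simply keeps the last of them. A small sketch, assuming the same spark session:

nums = spark.sparkContext.parallelize([1, 2, 3])
results = nums.map(lambda x: x * 10).collect()
print(results)      # [10, 20, 30] -- all mapped elements, back on the driver
print(results[-1])  # 30 -- the last element, as used above with collect()[-1]
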
The problem in this post is resolved, but I still don't understand why the three versions above behave differently.