PySpark: how to use sqlContext inside a Spark UDF

I am trying to load a JSON file inside a Spark UDF and use it to query something. What I need to do is take a column value (storeId) from the dataframe and use it in that lookup.

However, I am getting an error when I do this. If I code it without the sqlContext, it works.

Is there any workaround, or is this simply not possible?

def get_id_udf(storeId, sqlContext):
    df = sqlContext.read.json("file_url_s3")
    if storeId == None:
        return None
    return None

from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

desc_udf = udf(lambda storeId: get_id_udf(storeId, sqlContext), IntegerType())

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-22-b5c4070c110e> in <module>()
      1 from pyspark.sql.functions import udf, col
      2 from pyspark.sql.types import IntegerType
----> 3 desc_udf = udf(lambda storeId : get_cluster_id_udf(storeId,sqlContext), IntegerType())

/usr/lib/spark/python/pyspark/sql/functions.py in udf(f, returnType)
   1799     [Row(slen=5), Row(slen=3)]
   1800     """
-> 1801     return UserDefinedFunction(f, returnType)
   1802 
   1803 blacklist = ['map', 'since', 'ignore_unicode_prefix']

/usr/lib/spark/python/pyspark/sql/functions.py in __init__(self, func, returnType, name)
   1758         self.returnType = returnType
   1759         self._broadcast = None
-> 1760         self._judf = self._create_judf(name)
   1761 
   1762     def _create_judf(self, name):

/usr/lib/spark/python/pyspark/sql/functions.py in _create_judf(self, name)
   1763         from pyspark.sql import SQLContext
   1764         sc = SparkContext.getOrCreate()
-> 1765         wrapped_func = _wrap_function(sc, self.func, self.returnType)
   1766         ctx = SQLContext.getOrCreate(sc)
   1767         jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())

/usr/lib/spark/python/pyspark/sql/functions.py in _wrap_function(sc, func, returnType)
   1743 def _wrap_function(sc, func, returnType):
   1744     command = (func, returnType)
-> 1745     pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
   1746     return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes,

/usr/lib/spark/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
   2313     # the serialized command will be compressed by broadcast
   2314     ser = CloudPickleSerializer()
-> 2315     pickled_command = ser.dumps(command)
   2316     if len(pickled_command) > (1 << 20):  # 1M
   2317         # The broadcast will have same life cycle as created PythonRDD

/usr/lib/spark/python/pyspark/serializers.py in dumps(self, obj)
    426 
    427     def dumps(self, obj):
--> 428         return cloudpickle.dumps(obj, 2)
    429 
    430 

/usr/lib/spark/python/pyspark/cloudpickle.py in dumps(obj, protocol)
    655 
    656     cp = CloudPickler(file,protocol)
--> 657     cp.dump(obj)
    658 
    659     return file.getvalue()

/usr/lib/spark/python/pyspark/cloudpickle.py in dump(self, obj)
    105         self.inject_addons()
    106         try:
--> 107             return Pickler.dump(self, obj)
    108         except RuntimeError as e:
    109             if 'recursion' in e.args[0]:

/usr/lib64/python2.7/pickle.pyc in dump(self, obj)
    222         if self.proto >= 2:
    223             self.write(PROTO + chr(self.proto))
--> 224         self.save(obj)
    225         self.write(STOP)
    226 

/usr/lib64/python2.7/pickle.pyc in save(self, obj)
    284         f = self.dispatch.get(t)
    285         if f:
--> 286             f(self, obj) # Call unbound method with explicit self
    287             return
    288 

/usr/lib64/python2.7/pickle.pyc in save_dict(self, obj)
    653 
    654         self.memoize(obj)
--> 655         self._batch_setitems(obj.iteritems())
    656 
    657     dispatch[DictionaryType] = save_dict

/usr/lib64/python2.7/pickle.pyc in _batch_setitems(self, items)
    685                 for k, v in tmp:
    686                     save(k)
--> 687                     save(v)
    688                 write(SETITEMS)
    689             elif n:

/usr/lib64/python2.7/pickle.pyc in save(self, obj)
    304             reduce = getattr(obj, "__reduce_ex__", None)
    305             if reduce:
--> 306                 rv = reduce(self.proto)
    307             else:
    308                 reduce = getattr(obj, "__reduce__", None)

TypeError: 'JavaPackage' object is not callable

Commenter: You cannot use the Spark context inside a UDF (the UDF runs on each worker); it can only be called from the driver. What do you want to do with this JSON file?

OP: I was trying to read the JSON file with sqlContext.read.json() and query it with sqlContext.sql(..). I did find a way to flatten the JSON into an array and loop over it to do the lookups, but that seems very inefficient, so I really wanted to see whether there is a Spark way to do it. From your comment, though, it sounds like the UDF runs on each worker, so no Spark context is available there. Is that right?

Commenter: Why do you need to load the same file for every row of your dataframe?

OP: I thought about that. I think I will stick with reading the data into an array/map on the driver and passing the map to the UDF; that seems to be the only approach that works well here.
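
For reference, here is a minimal sketch of the driver-side workaround discussed in the comments: read the JSON once on the driver, collect it into a plain Python dict, broadcast that dict, and let the UDF close over the broadcast variable instead of over sqlContext. The file path is the placeholder from the question, the field names ("storeId", "id") are an assumed schema, and sc, sqlContext and df (the dataframe holding the storeId column) are assumed to already exist in the session.

from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# Read the lookup JSON once, on the driver (placeholder path from the question).
lookup_df = sqlContext.read.json("file_url_s3")

# Collect it into a plain dict: storeId -> id (assumed field names).
lookup = {row["storeId"]: row["id"] for row in lookup_df.collect()}

# Broadcast the dict so each executor gets a single read-only copy.
lookup_bc = sc.broadcast(lookup)

def get_id(storeId):
    # Closes over the broadcast variable, not over sqlContext,
    # so it can be pickled and shipped to the workers.
    if storeId is None:
        return None
    return lookup_bc.value.get(storeId)

desc_udf = udf(get_id, IntegerType())

# df is the dataframe that already holds the storeId column.
result = df.withColumn("id", desc_udf(col("storeId")))

If the lookup file is too large to collect to the driver, a broadcast join, e.g. df.join(broadcast(lookup_df), "storeId") using pyspark.sql.functions.broadcast, keeps everything inside Spark and avoids the UDF entirely.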