Error collecting after a join in Spark (Python)

I'm using Spark 1.4.1. I have a couple of DataFrames that I want to join on a userid field. The join seems to work, since I can print the schema and a count of how many rows the result will have (216 in this case):

usersearch_jnd = rr.join(uu, rr.searcher_id == uu.userid, 'inner')
print(usersearch_jnd)
print(usersearch_jnd.count())
usersearch_jnd.printSchema()
DataFrame[min_age: int, max_age: int, inter_in: int, zoom: int, searcher_ethnicity: array, searcher_id: int, searcher_sex: int, offset: int, searchee_id: int, searchee_loc: struct, userid: int, search_user_interested: int, search_level: int, search_max_age: int, search_min_age: int, search_ethnicity: int, search_ethnicity_multi: string, search_looking_sex: int, interested_in: int, search_distance: float, ethnicity: int]
216
root
 |-- min_age: integer (nullable = true)
 |-- max_age: integer (nullable = true)
 |-- inter_in: integer (nullable = true)
 |-- zoom: integer (nullable = true)
 |-- searcher_ethnicity: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- searcher_id: integer (nullable = true)
 |-- searcher_sex: integer (nullable = true)
 |-- offset: integer (nullable = true)
 |-- searchee_id: integer (nullable = true)
 |-- searchee_loc: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- userid: integer (nullable = true)
 |-- search_user_interested: integer (nullable = true)
 |-- search_level: integer (nullable = true)
 |-- search_max_age: integer (nullable = true)
 |-- search_min_age: integer (nullable = true)
 |-- search_ethnicity: integer (nullable = true)
 |-- search_ethnicity_multi: string (nullable = true)
 |-- search_looking_sex: integer (nullable = true)
 |-- interested_in: integer (nullable = true)
 |-- search_distance: float (nullable = true)
 |-- ethnicity: integer (nullable = true)
However, when I do something as simple as collect() or head(), I get errors:

usersearch_jnd.collect()
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-191-86046817e0ea> in <module>()
----> 1 usersearch_jnd.collect()

/Users/evanzamir/spark-1.4.1/python/pyspark/sql/dataframe.pyc in collect(self)
    279         """
    280         with SCCallSiteSync(self._sc) as css:
--> 281             port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
    282         rs = list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
    283         cls = _create_cls(self.schema)

/Users/evanzamir/spark-1.4.1/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
    536         answer = self.gateway_client.send_command(command)
    537         return_value = get_return_value(answer, self.gateway_client,
--> 538                 self.target_id, self.name)
    539 
    540         for temp_arg in temp_args:

/Users/evanzamir/spark-1.4.1/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    298                 raise Py4JJavaError(
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:
    302                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 307.0 failed 1 times, most recent failure: Lost task 1.0 in stage 307.0 (TID 2628, localhost): scala.MatchError: (VALUE_STRING,IntegerType) (of class scala.Tuple2)
    at org.apache.spark.sql.json.JacksonParser$.convertField(JacksonParser.scala:49)
    at org.apache.spark.sql.json.JacksonParser$.convertObject(JacksonParser.scala:137)
    at org.apache.spark.sql.json.JacksonParser$.convertField(JacksonParser.scala:109)
    at org.apache.spark.sql.json.JacksonParser$.convertField(JacksonParser.scala:117)
    at org.apache.spark.sql.json.JacksonParser$$anonfun$parseJson$1$$anonfun$apply$1.apply(JacksonParser.scala:201)
    at org.apache.spark.sql.json.JacksonParser$$anonfun$parseJson$1$$anonfun$apply$1.apply(JacksonParser.scala:193)
    at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
    at org.apache.spark.util.collection.WritablePartitionedIterator$$anon$3.writeNext(WritablePartitionedPairCollection.scala:105)
    at org.apache.spark.util.collection.ExternalSorter.spillToPartitionFiles(ExternalSorter.scala:375)
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:208)
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:70)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
    at java.lang.Thread.run(Thread.java:695)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
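
If I'm reading the trace right, the scala.MatchError: (VALUE_STRING,IntegerType) raised inside JacksonParser.convertField means that one of the JSON files behind these DataFrames has a field whose values are sometimes quoted (e.g. "42" instead of 42) even though Spark inferred IntegerType for it, and the mismatch only shows up once rows are actually deserialized. A rough way to check this against the raw data might be the following (just a sketch: the file name searches.json and the list of integer fields are placeholders, and it assumes Python 2 as in the traceback above):

import json

# Placeholder list of fields that the inferred schema says are integers.
int_fields = ["searcher_id", "search_level", "userid"]

def quoted_ints(line):
    rec = json.loads(line)
    # Report fields whose JSON value is a string where an int was expected.
    return [(f, rec[f]) for f in int_fields
            if f in rec and isinstance(rec[f], basestring)]

suspects = sc.textFile("searches.json").flatMap(quoted_ints).take(5)
print(suspects)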
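
If that is indeed the cause, one possible workaround might be to read the JSON with an explicit schema that declares the suspect field as a string and cast it afterwards (again just a sketch: sqlContext, the path and the field names are placeholders, not my actual code):

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col

# Placeholder schema: only the join key plus the field suspected of holding
# quoted numbers; fields not listed here are dropped on read.
schema = StructType([
    StructField("searcher_id", IntegerType(), True),
    StructField("search_level", StringType(), True),  # read as string first
])

rr = sqlContext.read.json("searches.json", schema=schema)

# Cast back to int once loaded; values that aren't valid integers become null.
rr = rr.withColumn("search_level", col("search_level").cast("int"))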