Python unionAll导致堆栈溢出
关于StackOverflow,我自己的问题()取得了一些进展,但我收到了一个StackOverflow错误:Python unionAll导致堆栈溢出,python,apache-spark,pyspark,Python,Apache Spark,Pyspark,关于StackOverflow,我自己的问题()取得了一些进展,但我收到了一个StackOverflow错误: import requests import numpy as np import pandas as pd import sys if sys.version_info[0] < 3: from StringIO import StringIO else: from io import StringIO from pyspark.sql import SQ
import requests
import numpy as np
import pandas as pd
import sys
if sys.version_info[0] < 3:
from StringIO import StringIO
else:
from io import StringIO
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
chunk_size = 1024
url = "https://{0}:8443/gateway/default/webhdfs/v1/{1}?op=OPEN".format(host, filepath)
r = requests.get(url, auth=(username, password),
verify=False, allow_redirects=True,
stream=True)
df = None
curr_line = 1
remainder = ''
for chunk in r.iter_content(chunk_size):
txt = remainder + chunk
[lines, remainder] = txt.rsplit('\n', 1)
pdf = pd.read_csv(StringIO(lines), sep='|', header=None)
if df == None:
df = sqlContext.createDataFrame(pdf)
else:
df = df.unionAll(sqlContext.createDataFrame(pdf))
print df.count()
导入请求
将numpy作为np导入
作为pd进口熊猫
导入系统
如果系统版本信息[0]<3:
从StringIO导入StringIO
其他:
从io导入StringIO
从pyspark.sql导入SQLContext
sqlContext=sqlContext(sc)
块大小=1024
url=“https://{0}:8443/gateway/default/webhdfs/v1/{1}?op=OPEN”。格式(主机,文件路径)
r=requests.get(url,auth=(用户名,密码),
verify=False,allow_redirects=True,
流=真)
df=无
当前线路=1
余数=“”
对于r.iter\u内容中的块(块大小):
txt=余数+块
[行,余数]=txt.rsplit('\n',1)
pdf=pd.read_csv(StringIO(行),sep='|',header=None)
如果df==无:
df=sqlContext.createDataFrame(pdf)
其他:
df=df.unionAll(sqlContext.createDataFrame(pdf))
打印df.count()
stacktrace在这里:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-4-b3a89df3c7d8> in <module>()
36 df = sqlContext.createDataFrame(pdf)
37 else:
---> 38 df = df.unionAll(sqlContext.createDataFrame(pdf))
39
40 #curr_line = curr_line + 1
/usr/local/src/spark160master/spark/python/pyspark/sql/dataframe.py in unionAll(self, other)
993 This is equivalent to `UNION ALL` in SQL.
994 """
--> 995 return DataFrame(self._jdf.unionAll(other._jdf), self.sql_ctx)
996
997 @since(1.3)
/usr/local/src/spark160master/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/local/src/spark160master/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/local/src/spark160master/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o19563.unionAll.
: java.lang.StackOverflowError
---------------------------------------------------------------------------
Py4JJavaError回溯(最近一次调用)
在()
36 df=sqlContext.createDataFrame(pdf)
37.其他:
--->38 df=df.unionAll(sqlContext.createDataFrame(pdf))
39
40#当前行=当前行+1
/unionAll中的usr/local/src/spark160master/spark/python/pyspark/sql/dataframe.py(self、other)
993这相当于SQL中的“UNION ALL”。
994 """
-->995返回数据帧(self.\u jdf.unionAll(其他.\u jdf)、self.sql\u ctx)
996
997@自(1.3)
/usr/local/src/spark160master/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in_uuu调用(self,*args)
811 answer=self.gateway\u client.send\u命令(command)
812返回值=获取返回值(
-->813应答,self.gateway\u客户端,self.target\u id,self.name)
814
815对于临时参数中的临时参数:
/装饰中的usr/local/src/spark160master/spark/python/pyspark/sql/utils.py(*a,**kw)
43 def装饰(*a,**千瓦):
44尝试:
--->45返回f(*a,**kw)
46除py4j.protocol.Py4JJavaError外,错误为e:
47 s=e.java_exception.toString()
/获取返回值中的usr/local/src/spark160master/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py(答案、网关客户端、目标id、名称)
306 raise PY4JJAVA错误(
307“调用{0}{1}{2}时出错。\n”。
-->308格式(目标id,“.”,名称),值)
309其他:
310升起Py4JError(
Py4JJavaError:调用o19563.unionAll时出错。
:java.lang.StackOverflower错误
我不知道如何解决这个问题。任何提示都值得赞赏。在不控制分区数量的情况下,不应该迭代合并分布式数据结构。您会找到一个完整的解释,但不幸的是,
数据帧
有点棘手:
dfs=…#pyspark.sql.DataFrame的列表
def unionAll(*dfs):
如果不是dfs:
提升值错误()
first=dfs[0]
返回df.sql\u ctx.createDataFrame(
df._sc.union([df.rdd表示dfs中的df]),first.schema
)
unionAll(*dfs)