Warning: file_get_contents(/data/phpspider/zhask/data//catemap/3/apache-spark/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Mysql java.lang.OutofMemorySpace:在pyspark中从数据库获取1.2亿行时的java堆空间_Mysql_Apache Spark_Pyspark - Fatal编程技术网

Mysql java.lang.OutofMemorySpace:在pyspark中从数据库获取1.2亿行时的java堆空间

Mysql java.lang.OutofMemorySpace:在pyspark中从数据库获取1.2亿行时的java堆空间,mysql,apache-spark,pyspark,Mysql,Apache Spark,Pyspark,我对pyspark/ApacheSpark非常陌生。我需要从服务器上的数据库中获取多个表,每个表包含大约1.2亿行或更多行,以便进行分析。我应该能够对数据进行计算。我在一台服务器上运行pyspark,该服务器同时充当主服务器和从服务器,并且有7.45G的RAM。 我已经安装了jdbc驱动程序,这是我使用的代码 from pyspark.sql import SQLContext from pyspark import SparkContext sc = SparkContext.getOrCre

我对pyspark/ApacheSpark非常陌生。我需要从服务器上的数据库中获取多个表,每个表包含大约1.2亿行或更多行,以便进行分析。我应该能够对数据进行计算。我在一台服务器上运行pyspark,该服务器同时充当主服务器和从服务器,并且有7.45G的RAM。 我已经安装了jdbc驱动程序,这是我使用的代码

from pyspark.sql import SQLContext
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

hostname = "xx.xxx.xx.xx"
dbname = "AAA"
jdbcPort = 3306
username = "xxxxx"
password = "yyyyy"

jdbc_url = "jdbc:mysql://{}:{}/{}?user={}&password={}".format(hostname, jdbcPort, dbname, username, password)
query = "(SELECT * FROM SAMPLE_TABLE_NAME) alias_name"
df = sqlContext.read.format('jdbc').options(driver='com.mysql.jdbc.Driver', url=jdbc_url, dbtable=query).load()
查询加载很好,但当我执行
df.show()
时,它会显示以下内容:

[Stage 0:>                                                          (0 + 1) / 1]20/06/11 11:54:29 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.OutOfMemoryError: Java heap space
    at com.mysql.jdbc.MysqlIO.nextRowFast(MysqlIO.java:2210)
    at com.mysql.jdbc.MysqlIO.nextRow(MysqlIO.java:1989)
    at com.mysql.jdbc.MysqlIO.readSingleRowSet(MysqlIO.java:3410)
    at com.mysql.jdbc.MysqlIO.getResultSet(MysqlIO.java:470)
    at com.mysql.jdbc.MysqlIO.readResultsForQueryOrUpdate(MysqlIO.java:3112)
    at com.mysql.jdbc.MysqlIO.readAllResults(MysqlIO.java:2341)
    at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2736)
    at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2484)
    at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1858)
    at com.mysql.jdbc.PreparedStatement.executeQuery(PreparedStatement.java:1966)
    at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:304)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:123)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:834)
20/06/11 11:54:29 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker for task 0,5,main]
java.lang.OutOfMemoryError: Java heap space
    at com.mysql.jdbc.MysqlIO.nextRowFast(MysqlIO.java:2210)
    at com.mysql.jdbc.MysqlIO.nextRow(MysqlIO.java:1989)
    at com.mysql.jdbc.MysqlIO.readSingleRowSet(MysqlIO.java:3410)
    at com.mysql.jdbc.MysqlIO.getResultSet(MysqlIO.java:470)
    at com.mysql.jdbc.MysqlIO.readResultsForQueryOrUpdate(MysqlIO.java:3112)
    at com.mysql.jdbc.MysqlIO.readAllResults(MysqlIO.java:2341)
    at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2736)
    at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2484)
    at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1858)
    at com.mysql.jdbc.PreparedStatement.executeQuery(PreparedStatement.java:1966)
    at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:304)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:123)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:834)
20/06/11 11:54:29 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
    at com.mysql.jdbc.MysqlIO.nextRowFast(MysqlIO.java:2210)
    at com.mysql.jdbc.MysqlIO.nextRow(MysqlIO.java:1989)
    at com.mysql.jdbc.MysqlIO.readSingleRowSet(MysqlIO.java:3410)
    at com.mysql.jdbc.MysqlIO.getResultSet(MysqlIO.java:470)
    at com.mysql.jdbc.MysqlIO.readResultsForQueryOrUpdate(MysqlIO.java:3112)
    at com.mysql.jdbc.MysqlIO.readAllResults(MysqlIO.java:2341)
    at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2736)
    at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2484)
    at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1858)
    at com.mysql.jdbc.PreparedStatement.executeQuery(PreparedStatement.java:1966)
    at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:304)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:123)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:834)

20/06/11 11:54:29 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/opt/spark/python/pyspark/sql/dataframe.py", line 380, in show
    print(self._jdf.showString(n, 20, vertical))
  File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/opt/spark/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaErrorERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
: <exception str() failed>
[阶段0:>(0+1)/1]20/06/11 11:54:29错误执行者:阶段0.0中任务0.0中的异常(TID 0)
java.lang.OutOfMemoryError:java堆空间
位于com.mysql.jdbc.MysqlIO.nextRowFast(MysqlIO.java:2210)
位于com.mysql.jdbc.MysqlIO.nextRow(MysqlIO.java:1989)
位于com.mysql.jdbc.MysqlIO.readSingleRowSet(MysqlIO.java:3410)
位于com.mysql.jdbc.MysqlIO.getResultSet(MysqlIO.java:470)
位于com.mysql.jdbc.MysqlIO.readResultsForQueryOrUpdate(MysqlIO.java:3112)
位于com.mysql.jdbc.MysqlIO.readAllResults(MysqlIO.java:2341)
位于com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2736)
位于com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2484)
位于com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1858)
位于com.mysql.jdbc.PreparedStatement.executeQuery(PreparedStatement.java:1966)
位于org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:304)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:346)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:310)
位于org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:346)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:310)
位于org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:346)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:310)
位于org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:346)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:310)
位于org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
位于org.apache.spark.scheduler.Task.run(Task.scala:123)
位于org.apache.spark.executor.executor$TaskRunner$$anonfun$10.apply(executor.scala:408)
位于org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
位于org.apache.spark.executor.executor$TaskRunner.run(executor.scala:414)
位于java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
位于java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
位于java.base/java.lang.Thread.run(Thread.java:834)
20/06/11 11:54:29错误SparkUncaughtExceptionHandler:线程线程中未捕获异常[Executor task launch worker for task 0,5,main]
java.lang.OutOfMemoryError:java堆空间
位于com.mysql.jdbc.MysqlIO.nextRowFast(MysqlIO.java:2210)
位于com.mysql.jdbc.MysqlIO.nextRow(MysqlIO.java:1989)
位于com.mysql.jdbc.MysqlIO.readSingleRowSet(MysqlIO.java:3410)
位于com.mysql.jdbc.MysqlIO.getResultSet(MysqlIO.java:470)
位于com.mysql.jdbc.MysqlIO.readResultsForQueryOrUpdate(MysqlIO.java:3112)
位于com.mysql.jdbc.MysqlIO.readAllResults(MysqlIO.java:2341)
位于com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2736)
位于com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2484)
位于com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1858)
位于com.mysql.jdbc.PreparedStatement.executeQuery(PreparedStatement.java:1966)
位于org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:304)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:346)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:310)
位于org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:346)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:310)
位于org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:346)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:310)
位于org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:346)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:310)
位于org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
位于org.apache.spark.scheduler.Task.run(Task.scala:123)
位于org.apache.spark.executor.executor$TaskRunner$$anonfun$10.apply(executor.scala:408)
位于org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
位于org.apache.spark.executor.executor$TaskRunner.run(executor.scala:414)
位于java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
位于java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
位于java.base/java.lang.Thread.run(Thread.java:834)
20/06/11 11:54:29警告TaskSetManager:在阶段0.0中丢失了任务0.0(TID 0,本地主机,执行器驱动程序):java.lang.OutOfMemoryError:java堆空间
位于com.mysql.jdbc.MysqlIO.nextRowFast(MysqlIO.java:2210)
位于com.mysql.jdbc.MysqlIO.nextRow(MysqlIO.java:1989)
位于com.mysql.jdbc.MysqlIO.readSingleRowSet(MysqlIO.java:3410)
位于com.mysql.jdbc.MysqlIO.getResultSet(MysqlIO.java:470)
位于com.mysql.jdbc.MysqlIO.readResultsForQueryOrUpdate(MysqlIO.java:3112)
位于com.mysql.jdbc.MysqlIO.readAllResults(MysqlIO.java:2341)
位于com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2736)
位于com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2484)
在com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement