
Scala: multiple iterations cause out of memory in Spark

Tags: scala, hadoop, apache-spark, hive, spark-dataframe

I have a Spark job (running on Spark 1.3.1) that has to iterate over several keys (about 42 of them) and run the processing for each. Here is the structure of the program:

  • Get a key from a map
  • Fetch the data matching that key from Hive (Hadoop YARN underneath) as a DataFrame
  • Process the data
  • Write the results to Hive

When I run it with a single key, everything works fine. When I run it with all 42 keys, I hit an out-of-memory exception around the 12th iteration. Is there a way to clean up memory between iterations? Thanks for the help.

Here is the high-level code I am using:

    import java.util.Properties;

    import org.apache.spark.SparkConf;
    import org.apache.spark.SparkContext;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.SQLContext;
    import org.apache.spark.sql.hive.HiveContext;

    public abstract class SparkRunnableModel {

        public static SparkContext sc = null;
        public static JavaSparkContext jsc = null;
        public static HiveContext hiveContext = null;
        public static SQLContext sqlContext = null;

        protected SparkRunnableModel(String appName) {
            // Get the system properties to set up the model.
            // Build the Spark context from the application name.
            SparkConf conf = new SparkConf().setAppName(appName);
            sc = new SparkContext(conf);
            jsc = new JavaSparkContext(sc);

            // Create a Hive context on top of the Spark context.
            hiveContext = new HiveContext(sc);

            // SQL context.
            sqlContext = new SQLContext(sc);
        }

        public abstract void processModel(Properties properties) throws Exception;
    }
    
    class ModelRunnerMain(model: String) extends SparkRunnableModel(model) with Serializable {

      override def processModel(properties: Properties) = {
        val dataLoader = DataLoader.getDataLoader(properties)

        // loads the keys data frame from a keys table in Hive and converts it to a list
        val keysList = dataLoader.loadSeriesData()

        for (key <- keysList) {
          runModelForKey(key, dataLoader)
        }
      }

      def runModelForKey(key: String, dataLoader: DataLoader) = {

        // loads a data frame from a table (~50 cols x 800 rows) using
        // "select * from table where key='<key>'"
        val keyDataFrame = dataLoader.loadKeyData()

        // filter this data frame into two data frames
        ...

        // join them to transpose
        ...

        // convert the data frame into an RDD
        ...

        // run map on the RDD to add a bunch of new columns
        ...
      }

    }
    
Using checkpoint() (or localCheckpoint()) can cut the Spark lineage and improve the application's performance across iterations.
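
A minimal sketch of how that could look in the loop from the question. The checkpoint directory path and the stand-in data are placeholders, and localCheckpoint() only appeared in Spark releases later than the 1.3.1 used here, so the sketch sticks to checkpoint():

    import org.apache.spark.{SparkConf, SparkContext}

    object CheckpointPerKeyDemo {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("checkpoint-per-key"))

        // Checkpoint files must live on reliable storage such as HDFS;
        // this path is a placeholder.
        sc.setCheckpointDir("hdfs:///tmp/spark-checkpoints")

        // Stand-in for the ~42 keys loaded from Hive in the question.
        val keysList = Seq("key1", "key2", "key3")

        for (key <- keysList) {
          // Stand-in for the per-key data loaded from Hive.
          val keyRdd = sc.parallelize(1 to 800).map(i => (key, i))

          keyRdd.checkpoint() // mark the RDD: its lineage is cut once it is materialized
          keyRdd.count()      // an action forces the checkpoint to actually be written

          // Downstream work now reads from the checkpoint files instead of
          // replaying the lineage built up so far.
        }

        sc.stop()
      }
    }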

Without seeing some code that reproduces the problem, this is going to be hard to answer. In general, Spark should let the GC collect data that is no longer needed, but the devil is in the details...

I fully agree with @TzachZohar, which is why I voted to close this as too broad without a minimal, complete, verifiable example.

Thank you both, I will add the code. The problem is that the stack is so generic that I am not sure which part to show; I will extract the important parts and add them to my question.

I have now updated the post with the code, please take a look. Could it be that the static hiveContext is holding references to all the objects? I have a Java class that extends this SparkRunnableModel and performs similar steps, and that works fine; the OOM happens in the Scala class, so I must be doing something wrong. Here is the exception:
    Exception in thread "dag-scheduler-event-loop" java.lang.OutOfMemoryError: Java heap space
    at org.apache.spark.util.io.ByteArrayChunkOutputStream.allocateNewChunkIfNeeded(ByteArrayChunkOutputStream.scala:66)
    at org.apache.spark.util.io.ByteArrayChunkOutputStream.write(ByteArrayChunkOutputStream.scala:55)
    at com.ning.compress.lzf.ChunkEncoder.encodeAndWriteChunk(ChunkEncoder.java:264)
    at com.ning.compress.lzf.LZFOutputStream.writeCompressedBlock(LZFOutputStream.java:266)
    at com.ning.compress.lzf.LZFOutputStream.write(LZFOutputStream.java:124)
    at com.esotericsoftware.kryo.io.Output.flush(Output.java:155)
    at com.esotericsoftware.kryo.io.Output.require(Output.java:135)
    at com.esotericsoftware.kryo.io.Output.writeBytes(Output.java:220)
    at com.esotericsoftware.kryo.io.Output.writeBytes(Output.java:206)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ByteArraySerializer.write(DefaultArraySerializers.java:29)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ByteArraySerializer.write(DefaultArraySerializers.java:18)
    at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:568)
    at org.apache.spark.serializer.KryoSerializationStream.writeObject(KryoSerializer.scala:124)
    at org.apache.spark.broadcast.TorrentBroadcast$.blockifyObject(TorrentBroadcast.scala:202)
    at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:101)
    at org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:84)
    at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34)
    at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:29)
    at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:62)
    at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1051)
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitMissingTasks(DAGScheduler.scala:839)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply$mcVI$sp(DAGScheduler.scala:1042)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply(DAGScheduler.scala:1039)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply(DAGScheduler.scala:1039)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15.apply(DAGScheduler.scala:1039)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15.apply(DAGScheduler.scala:1038)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1038)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1390)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
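
On the question from the comments about the static hiveContext pinning objects: one thing worth trying between iterations is dropping cached state explicitly. Below is a hedged sketch of the loop from processModel above, using two calls that do exist in Spark 1.3 (SQLContext.clearCache and SparkContext.getPersistentRDDs); whether they help depends on what is actually being cached:

    override def processModel(properties: Properties) = {
      val dataLoader = DataLoader.getDataLoader(properties)
      val keysList = dataLoader.loadSeriesData()

      for (key <- keysList) {
        runModelForKey(key, dataLoader)

        // Drop any tables/DataFrames cached through the Hive context.
        hiveContext.clearCache()

        // Unpersist every RDD still registered with the SparkContext.
        sc.getPersistentRDDs.values.foreach(_.unpersist(blocking = true))
      }
    }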