Mongodb RDD仅部分写入mongo

Mongodb RDD仅部分写入mongo,mongodb,hadoop,apache-spark,Mongodb,Hadoop,Apache Spark,我正在使用Spark 1.3.1,并尝试使用版本1.3.2和mongo java驱动程序版本3.0.1将RDD保存到mongodb。当我在独立集群上运行下面的应用程序时,驱动程序被标记为失败 这是我用来重现这个问题的代码 import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.hadoop.con

我正在使用Spark 1.3.1,并尝试使用mongo-hadoop连接器1.3.2版和mongo java驱动程序3.0.1版将RDD保存到mongodb。当我在独立(standalone)集群上运行下面的应用程序时,驱动程序(driver)被标记为失败,并且RDD只有一部分被写入mongo。

这是我用来重现这个问题的代码

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

import org.apache.hadoop.conf.Configuration
import org.apache.spark.rdd.RDD

import org.bson.BasicBSONObject
import org.bson.BSONObject

/**
 * Minimal reproduction app: reads documents from MongoDB through the
 * mongo-hadoop input format, derives the distinct trimmed values of
 * "fieldName", pairs each with an index and writes the pairs back through
 * MongoOutputFormat.
 */
object TestApp {

  /**
   * Runs the read -> distinct -> index -> save pipeline.
   *
   * The SparkContext is stopped in a `finally` block so that it is released
   * even when the save job aborts.
   */
  def testSaveRddToMongo(): Unit = {
    val sparkConf = new SparkConf().setAppName("Test")
    val sc = new SparkContext(sparkConf)

    try {
      val mongoConfig = new Configuration()
      mongoConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat")
      mongoConfig.set("mongo.input.uri", "mongodb://some.local.ip:27017/mydb.input")

      val bsonRDD: RDD[(Object, BSONObject)] = sc.newAPIHadoopRDD(
        mongoConfig,
        classOf[com.mongodb.hadoop.MongoInputFormat],
        classOf[Object],
        classOf[BSONObject])

      // Documents without "fieldName" yield null from get(); skip them instead
      // of failing the whole task with a NullPointerException on trim.
      val reasons: RDD[String] = bsonRDD.flatMap { case (_, doc) =>
        Option(doc.get("fieldName")).map(_.toString.trim)
      }.distinct().cache()

      // zipWithIndex produces Long indices; they are narrowed to Int here.
      val out: RDD[(String, Int)] = reasons.zipWithIndex().map { case (k, v) => (k, v.toInt) }

      println(s"Saving ${out.count} elements")
      val outputConfig = new Configuration()
      outputConfig.set("mongo.job.output.format", "com.mongodb.hadoop.MongoOutputFormat")
      outputConfig.set("mongo.output.uri", "mongodb://some.local.ip:27017/mydb.garbage")
      // The path argument is required by the API; the destination actually used
      // is the mongo.output.uri set above.
      out.saveAsNewAPIHadoopFile(
        "file:///bogus",
        classOf[Any],
        classOf[Any],
        classOf[com.mongodb.hadoop.MongoOutputFormat[Any, Any]],
        outputConfig)
    } finally {
      sc.stop()
    }
  }

  /** Entry point; command-line arguments are ignored. */
  def main(args: Array[String]): Unit = {
    testSaveRddToMongo()
  }
}
在驱动程序(driver)的输出中,我看到了以下内容

    15/05/15 14:18:43 INFO DAGScheduler: Job 2 failed: saveAsNewAPIHadoopFile at Test.scala:39, took 6.491961 s
    Exception in thread "main" java.lang.reflect.InvocationTargetException
            at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
            at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
            at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
            at java.lang.reflect.Method.invoke(Method.java:497)
            at org.apache.spark.deploy.worker.DriverWrapper$.main(DriverWrapper.scala:59)
            at org.apache.spark.deploy.worker.DriverWrapper.main(DriverWrapper.scala)
    Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 5.0 failed 4 times, most recent failure: Lost task 3.3 in stage 5.0 (TID 275, largo-ubuntu): 
java.lang.IllegalStateException: The pool is closed
            at com.mongodb.internal.connection.ConcurrentPool.get(ConcurrentPool.java:123)
            at com.mongodb.connection.DefaultConnectionPool.getPooledConnection(DefaultConnectionPool.java:243)
            at com.mongodb.connection.DefaultConnectionPool.get(DefaultConnectionPool.java:90)
            at com.mongodb.connection.DefaultConnectionPool.get(DefaultConnectionPool.java:80)
            at com.mongodb.connection.DefaultServer.getConnection(DefaultServer.java:69)
            at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.getConnection(ClusterBinding.java:86)
            at com.mongodb.operation.OperationHelper.withConnectionSource(OperationHelper.java:184)
            at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:177)
            at com.mongodb.operation.BaseWriteOperation.execute(BaseWriteOperation.java:106)
            at com.mongodb.operation.BaseWriteOperation.execute(BaseWriteOperation.java:58)
            at com.mongodb.Mongo.execute(Mongo.java:745)
            at com.mongodb.Mongo$2.execute(Mongo.java:728)
            at com.mongodb.DBCollection.executeWriteOperation(DBCollection.java:327)
            at com.mongodb.DBCollection.replaceOrInsert(DBCollection.java:405)
            at com.mongodb.DBCollection.save(DBCollection.java:394)
            at com.mongodb.DBCollection.save(DBCollection.java:367)
            at com.mongodb.hadoop.output.MongoRecordWriter.write(MongoRecordWriter.java:105)
            at org.apache.spark.rdd.PairRDDFunctions$$anonfun$12.apply(PairRDDFunctions.scala:1000)
            at org.apache.spark.rdd.PairRDDFunctions$$anonfun$12.apply(PairRDDFunctions.scala:979)
            at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
            at org.apache.spark.scheduler.Task.run(Task.scala:64)
            at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
            at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
            at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
            at java.lang.Thread.run(Thread.java:745)

    Driver stacktrace:
            at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1204)
            at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1193)
            at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
            at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
            at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
            at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1192)
            at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
            at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
            at scala.Option.foreach(Option.scala:236)
            at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
            at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
            at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
            at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
为什么连接池被关闭了?是否在别的地方还有我没有看到的异常?

修复

按照下面maasg的建议,改用casbah写入结果是可行的。我把代码更新如下:

import com.mongodb.casbah.Imports._
...
    println (s"Saving ${out.count} elements")
    val uri = MongoClientURI("mongodb://some.local.ip:27017/mydb.garbage")
    val mongoClient = MongoClient(uri)
    // Release the client even if the drop or the bulk insert fails.
    try {
      val collection = mongoClient(uri.database.get)(uri.collection.get)
      collection.drop()
      // collect() brings every pair to the driver; the write happens from the driver only.
      val rows = out.collect()
      // A bulk operation with no queued writes cannot be executed, so skip it when there is nothing to insert.
      if (rows.nonEmpty) {
        val builder = collection.initializeUnorderedBulkOperation
        for ((value, index) <- rows) { builder.insert(MongoDBObject(("_id" -> value), ("value" -> index))) }
        builder.execute()
      }
    } finally {
      mongoClient.close()
    }
import com.mongodb.casbah.Imports_
...
println(s“保存${out.count}个元素”)
val uri=MongoClientURI(“mongodb://some.local.ip:27017/mydb.garbage")
val mongoClient=mongoClient(uri)
val collection=mongoClient(uri.database.get)(uri.collection.get)
collection.drop()
val builder=collection.initializeUnderedBulkOperation
对于((值,索引)值),((值->索引))}
builder.execute()
更好的修复

这里有一个更好的版本,它将对每个分区进行一次批写入

...
  /**
   * Drops the collection addressed by the given MongoDB connection URI.
   *
   * The original body resolved the collection and closed the client without
   * ever issuing the drop; this version performs it.
   *
   * @param uriString connection URI that names both a database and a
   *                  collection, e.g. mongodb://host:port/database.collection;
   *                  `.get` fails if either part is absent
   */
  def dropCollection(uriString: String): Unit = {
    val uri = MongoClientURI(uriString)
    val mongoClient = MongoClient(uri)
    try {
      val collection = mongoClient(uri.database.get)(uri.collection.get)
      collection.drop()
    } finally {
      // Always release the client, including when resolution or the drop throws.
      mongoClient.close()
    }
  }

  /**
   * Writes every (value, index) pair of `out` to the collection named in
   * `uriString`, using one client and one unordered bulk write per partition.
   *
   * Each pair becomes a document with `_id` set to the string and `value`
   * set to the index.
   *
   * @param out       pairs to persist
   * @param uriString connection URI that names both a database and a
   *                  collection; `.get` fails if either part is absent
   */
  def saveReultsToMongo(out: RDD[(String,Int)], uriString: String): Unit = {
    out.foreachPartition { itr =>
      // Skip empty partitions: no client is needed, and a bulk operation with
      // no queued writes cannot be executed.
      if (itr.nonEmpty) {
        // The client is created inside the closure because the bulk builder is
        // not serializable and must live on the executor.
        val uri = MongoClientURI(uriString)
        val mongoClient = MongoClient(uri)
        try {
          val collection = mongoClient(uri.database.get)(uri.collection.get)
          val builder = collection.initializeUnorderedBulkOperation
          itr.foreach { case (value, index) =>
            builder.insert(MongoDBObject(("_id" -> value), ("value" -> index)))
          }
          builder.execute()
        } finally {
          // Release the connection even when the bulk write fails.
          mongoClient.close()
        }
      }
    }
  }
...
    // Report the element count, then clear the target collection and write the results to it.
    val garbageUri = "mongodb://10.22.128.84:27017/Minerva.garbage"
    println (s"Saving ${out.count} elements")
    dropCollection(garbageUri)
    saveReultsToMongo(out, garbageUri)
。。。
def dropCollection(uriString:String){
val uri=MongoClientURI(uriString)
val mongoClient=mongoClient(uri)
val collection=mongoClient(uri.database.get)(uri.collection.get)
mongoClient.close()
}
def saveReultsToMongo(out:RDD[(String,Int)],uriString:String){
out.foreachPartition(itr=>{
val uri=MongoClientURI(uriString)
val mongoClient=mongoClient(uri)
val collection=mongoClient(uri.database.get)(uri.collection.get)
val builder=collection.initializeUnderedBulkOperation
对于((值,索引)值),((值->索引))}
builder.execute
mongoClient.close
})
}
...
println(s“保存${out.count}个元素”)
dropCollection(“mongodb://10.22.128.84:27017/Minerva.garbage")
saveReultsToMongo(外,“mongodb://10.22.128.84:27017/Minerva.garbage")
几点说明

  • out.foreach { case (value, index) => builder.insert(MongoDBObject(("_id" -> value), ("value" -> index))) }
    无法工作,因为BulkWriteOperation不可序列化
    • 但是,
      out.foreachPartition
      可以根据maasg和更好的修复程序使用
  • casbah 1.8.1与mongo java驱动程序3.0.x不兼容,它使用2.13.1

在版本1.4发布之前,hadoop mongo连接器在使用Spark时不可靠。高并行负载将泄漏客户端连接,导致故障。在我们的案例中,这个bug是关键点: 如您所见,它已合并到1.4版本中


作为一种解决方法,我建议使用Casbah(Java客户端的Scala包装器)配合批量(bulk)操作。

执行者有各自的日志。你肯定应该看看那里,但不确定你是否会找到一个信息更丰富的堆栈跟踪。我在这里检查了日志:
spark/work/app-201505141801-0008$cat*/stderr | less
,但是,我没有看到任何其他信息。@Russell使用
rdd.foreachPartition{…}
,而不是foreach。您还应该在foreachPartition闭包内部建立并关闭数据库连接,这样它就会在持有该RDD分区的每个节点上并行执行。@maasg:我也已经用新版本更新了这篇文章。