Mongodb 在Db上使用Spark Mongo Hadoop读写时,错误状态应为打开
情景:我有一个从 Kafka 收集的 RDD,在 map 和 reduce 之后,我想将结果写入 MongoDB(标签:mongodb、scala、hadoop、apache-spark)。写入的目标是按月分库、按日分集合的聚合数据库。
数据库 db_aggregate_MM_yyyy 中的集合 coll_aggregate_dd(数据库按月、集合按日分片)
映射时,我需要从该数据库中获取以获得以前的结果
就像我写A一样,我需要B的结果(以前写过)来计算,然后把A写到db
我认为我面临的问题是:当我一边向数据库写入新记录、一边读取 db_aggregate 时,db 游标可能会被另一个操作(“写入”或“读取”)关闭
我使用的是Spark 1.4.1 mongo hadoop.1.4.1 mongo 2.6
职能:
/** Returns the most recent aggregate record for the given campaign/publisher/size,
  * falling back to the previous calendar day when the requested day has no match.
  *
  * NOTE(review): returns null when neither day has a record — kept only for
  * backward compatibility with existing callers; new callers should prefer
  * wrapping this in Option.
  */
def getPreviousAggregate(campaignId: String, publisher: String, width: Int, height: Int,
date: Int, month: Int, year: Int): BasicBSONObject = {
  // By-need fallback: only evaluated when the same-day lookup comes up empty.
  def previousDayAggregate: Option[BasicBSONObject] = {
    println("Not found previous date ....")
    val previousDate = Calendar.getInstance()
    previousDate.set(year, month, date)
    // Calendar handles month/year rollover (e.g. the 1st of a month) for us.
    previousDate.add(Calendar.DATE, -1)
    findLatestAggregate(campaignId, publisher, width, height,
      previousDate.get(Calendar.DATE),
      previousDate.get(Calendar.MONTH),
      previousDate.get(Calendar.YEAR))
  }

  // orElse expresses the original two-step search without nested matches or
  // `return`; orNull preserves the original null-on-miss contract.
  findLatestAggregate(campaignId, publisher, width, height, date, month, year)
    .orElse(previousDayAggregate)
    .orNull
}
/** Looks up the most recent aggregate document matching the given dimensions
  * in the per-day Mongo collection and returns just the running totals.
  *
  * Fixes vs. the original:
  *  - the original sorted ascending and took the first element, which returned
  *    the EARLIEST record, not the latest; we now sort by numeric timestamp,
  *    descending (string sort also mis-orders timestamps of different lengths);
  *  - filtering now happens before the sort, so only matching rows are ordered;
  *  - the catch is widened from MongoCommandException to NonFatal, so a
  *    "state should be: open" IllegalStateException from a closed client
  *    degrades to "no previous aggregate" instead of failing the Spark stage.
  */
def findLatestAggregate(campaignId: String, publisher: String, width: Int, height: Int,
date: Int, month: Int, year: Int): Option[BasicBSONObject] = {
  import scala.util.control.NonFatal
  val config = new Configuration()
  // Database is partitioned by month/year, collection by day of month.
  val outDb = DB_AGGREGATE + "_%02d_%s".format(month, year)
  val collName: String = COLL_AGGREGATE + "_%02d".format(date)
  val mongoInputUri = "mongodb://%s:%s/%s.%s".format(DB_STATISTIC_HOST, DB_STATISTIC_PORT, outDb, collName)
  config.set("mongo.input.uri", mongoInputUri)
  try {
    val aggregate = sc.newAPIHadoopRDD(config,
      classOf[MongoInputFormat],
      classOf[Object],
      classOf[BSONObject])
    val res = aggregate
      .filter { case (_, doc) =>
        // Original also had a commented-out `timestamp <= timestamp - BATCH_TIME`
        // condition here; intentionally left out, matching the original behavior.
        Integer.parseInt(doc.get("width").toString) == width &&
          Integer.parseInt(doc.get("height").toString) == height &&
          doc.get("publisher").toString == publisher &&
          doc.get("campaignId").toString == campaignId
      }
      // Numeric, descending: the first element is now genuinely the latest.
      .sortBy(_._2.get("timestamp").toString.toLong, ascending = false)
      .map(_._2)
      .take(1)
    res.headOption.map { collect =>
      println("\nfound previous record")
      val bson = new BasicBSONObject()
      bson.put("totalBudgetSpent", collect.get("totalBudgetSpent"))
      bson.put("totalAuctions", collect.get("totalAuctions"))
      bson.put("totalWin", collect.get("totalWin"))
      bson
    }
  }
  catch {
    // NonFatal keeps OutOfMemoryError / InterruptedException propagating while
    // treating driver/connection failures as "nothing found".
    case NonFatal(ex) =>
      println(ex.getMessage)
      None
  }
}
我有一个错误
15/08/27 10:35:44 ERROR Executor: Exception in task 0.0 in stage 19.0 (TID 23)
java.lang.IllegalStateException: state should be: open
at com.mongodb.assertions.Assertions.isTrue(Assertions.java:70)
at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
at com.mongodb.Mongo.execute(Mongo.java:745)
at com.mongodb.Mongo$2.execute(Mongo.java:728)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
at com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
at com.mongodb.hadoop.output.MongoOutputCommitter.commitTask(MongoOutputCommitter.java:133)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1045)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1014)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
15/08/27 10:35:44 WARN TaskSetManager: Lost task 0.0 in stage 19.0 (TID 23, localhost): java.lang.IllegalStateException: state should be: open
at com.mongodb.assertions.Assertions.isTrue(Assertions.java:70)
at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
at com.mongodb.Mongo.execute(Mongo.java:745)
at com.mongodb.Mongo$2.execute(Mongo.java:728)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
at com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
at com.mongodb.hadoop.output.MongoOutputCommitter.commitTask(MongoOutputCommitter.java:133)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1045)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1014)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
15/08/27 10:35:44错误执行者:第19.0阶段任务0.0中出现异常(TID 23)
java.lang.IllegalStateException:状态应为:打开
位于com.mongodb.assertions.assertions.isTrue(assertions.java:70)
位于com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
位于com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource。(ClusterBinding.java:75)
位于com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource。(ClusterBinding.java:71)
位于com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
位于com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
在com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)上
在com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
位于com.mongodb.Mongo.execute(Mongo.java:745)
位于com.mongodb.Mongo$2.execute(Mongo.java:728)
位于com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
位于com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
在com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
位于com.mongodb.hadoop.output.mongoutputcommitter.commitTask(mongoutputcommitter.java:133)
位于org.apache.spark.rdd.pairddfunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(pairddfunctions.scala:1045)
位于org.apache.spark.rdd.pairddfunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(pairddfunctions.scala:1014)
位于org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
位于org.apache.spark.scheduler.Task.run(Task.scala:70)
位于org.apache.spark.executor.executor$TaskRunner.run(executor.scala:213)
位于java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
位于java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
运行(Thread.java:745)
15/08/27 10:35:44警告TaskSetManager:在阶段19.0中丢失任务0.0(TID 23,localhost):java.lang.IllegalStateException:状态应为:打开
位于com.mongodb.assertions.assertions.isTrue(assertions.java:70)
位于com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
位于com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource。(ClusterBinding.java:75)
位于com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource。(ClusterBinding.java:71)
位于com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
位于com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
在com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)上
在com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
位于com.mongodb.Mongo.execute(Mongo.java:745)
位于com.mongodb.Mongo$2.execute(Mongo.java:728)
位于com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
位于com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
在com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
位于com.mongodb.hadoop.output.mongoutputcommitter.commitTask(mongoutputcommitter.java:133)
位于org.apache.spark.rdd.pairddfunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(pairddfunctions.scala:1045)
位于org.apache.spark.rdd.pairddfunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(pairddfunctions.scala:1014)
位于org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
位于org.apache.spark.scheduler.Task.run(Task.scala:70)
位于org.apache.spark.executor.executor$TaskRunner.run(executor.scala:213)
位于java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
位于java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
运行(Thread.java:745)
15/08/27 10:35:44 ERROR Executor: Exception in task 0.0 in stage 19.0 (TID 23)
java.lang.IllegalStateException: state should be: open
at com.mongodb.assertions.Assertions.isTrue(Assertions.java:70)
at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
at com.mongodb.Mongo.execute(Mongo.java:745)
at com.mongodb.Mongo$2.execute(Mongo.java:728)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
at com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
at com.mongodb.hadoop.output.MongoOutputCommitter.commitTask(MongoOutputCommitter.java:133)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1045)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1014)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
15/08/27 10:35:44 WARN TaskSetManager: Lost task 0.0 in stage 19.0 (TID 23, localhost): java.lang.IllegalStateException: state should be: open
at com.mongodb.assertions.Assertions.isTrue(Assertions.java:70)
at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:79)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:175)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:141)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:72)
at com.mongodb.Mongo.execute(Mongo.java:745)
at com.mongodb.Mongo$2.execute(Mongo.java:728)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1968)
at com.mongodb.DBCollection.executeBulkWriteOperation(DBCollection.java:1962)
at com.mongodb.BulkWriteOperation.execute(BulkWriteOperation.java:98)
at com.mongodb.hadoop.output.MongoOutputCommitter.commitTask(MongoOutputCommitter.java:133)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1045)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1014)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)