MongoDB Java Spark: filter(dataset.col(newTime).$greater(oldTime)) not running on the full dataset
I have written a Java Spark job using the MongoDB Spark connector. It is supposed to fetch from MongoDB all rows whose createdDate column is greater than the createdDate of the previous run (a high-water-mark pattern: I store the maximum value of each run in Oracle, and the initial high-water-mark value in Oracle is 1900-01-01 00:00:00.000). The createdDate column is of type ISODate in MongoDB.

In my MongoDB data, the maximum value stored in createdDate is 2018-04-11 01:43:20.165. But the filter in the code does not work as expected: on the first run it sometimes fetches rows only up to 2018-03-30 21:48:59.519, and only on the second or third run does it reach the maximum (2018-04-11 01:43:20.165). Ideally, with the initial high-water-mark value of 1900-01-01 00:00:00.000, a single run should fetch the entire dataset.
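For context, the mark round-trips between Oracle, where it appears to be stored as an ISO-style string such as 1900-01-01T00:00:00.000Z (judging from the replace("T"," ")/replace("Z","") calls in the job below), and java.sql.Timestamp, which the Spark filter compares against. A minimal standalone sketch of that round trip, using only values quoted above (the class name is mine):

import java.sql.Timestamp;
import java.text.SimpleDateFormat;

public class HighWaterMarkRoundTrip {
    public static void main(String[] args) {
        // Oracle-side representation of the initial mark (ISO-style string).
        String stored = "1900-01-01T00:00:00.000Z";

        // String -> Timestamp: strip the 'T' and 'Z' so Timestamp.valueOf accepts it.
        Timestamp mark = Timestamp.valueOf(stored.replace("T", " ").replace("Z", ""));
        System.out.println(mark); // 1900-01-01 00:00:00.0

        // Timestamp -> string: format with millisecond precision and re-add 'T'/'Z'.
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
        System.out.println(fmt.format(mark).replace(" ", "T").concat("Z")); // 1900-01-01T00:00:00.000Z
    }
}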
Here's the full code:
package mongo;
import java.net.URI;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.sql.Date;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.bson.Document;
import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;
import java.sql.Timestamp;
public final class MongoRead
{
    private static Connection con = null;

    private static String readHighWaterMark(String table, String oraConn, String oraUser, String oraPswd) throws Exception
    {
        String highWaterMarkValue = "";
        try
        {
            con = DriverManager.getConnection(oraConn, oraUser, oraPswd);
            Statement stmt = con.createStatement();
            ResultSet rs = stmt.executeQuery("select * from difa.HIGH_WATER_MARK_TABLE where table_nm='" + table + "'");
            while (rs.next()) {
                highWaterMarkValue = rs.getString(3);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
            con.close();
        }
        return highWaterMarkValue;
    }

    private static void setHighWaterMark(String key, String value) throws Exception
    {
        PreparedStatement pStmt = con.prepareStatement("UPDATE high_water_mark_table SET high_water_mark_VALUE='" + value + "' where table_nm='" + key + "'");
        int i = pStmt.executeUpdate();
        System.out.println(i + " records updated");
    }

    public static void main(final String[] args) throws Exception {
        if (args.length < 8) {
            System.out.println("Please provide correct inputs");
            System.exit(1);
        }
        String mongoAddress = args[0];
        String clusterAddress = args[1];
        String oraConn = args[2];
        String oraUser = args[3];
        String oraPswd = args[4];
        String tableNm = args[5];
        String highWaterCol = args[6];
        String loadType = args[7];

        SparkSession spark = SparkSession.builder()
            .master("local")
            .appName("MongoSparkRecordReader")
            .config("spark.mongodb.input.uri", mongoAddress)
            .config("spark.mongodb.output.uri", mongoAddress)
            .getOrCreate();

        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        try {
            FileSystem fs = FileSystem.get(new URI(clusterAddress), jsc.hadoopConfiguration());
            fs.delete(new Path(clusterAddress), true);
        }
        catch (Exception e) {
            e.printStackTrace();
        }

        /* ********Read data from MongoDB******* */
        Dataset<Row> dataset = MongoSpark.load(jsc).toDF();

        if (loadType.equalsIgnoreCase("I")) {
            String highWaterMark = readHighWaterMark(tableNm, oraConn, oraUser, oraPswd);
            System.out.println("============HIGH_WATER_MARK_VALUE: " + highWaterMark);

            Timestamp oldTime = Timestamp.valueOf(highWaterMark.replace("T", " ").replace("Z", ""));

            // Fetch records where createdDate is greater than the previous high water mark.
            Dataset<Row> filtered = dataset.filter(dataset.col(highWaterCol).$greater(oldTime)).persist();
            filtered.toJSON().write().text(clusterAddress);

            // Calculate the MAX(createdDate) in the fetched dataset.
            Dataset<Row> maxHighWaterRow = filtered.agg(max(filtered.col(highWaterCol)).alias("newHighWater")).persist();
            List<Timestamp> newHighWaterValue = maxHighWaterRow.select("newHighWater").as(Encoders.TIMESTAMP()).collectAsList();
            Timestamp maxHighWaterMarkValue = newHighWaterValue.iterator().next();

            SimpleDateFormat dtFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
            Timestamp oldDate = Timestamp.valueOf(highWaterMark.replace('T', ' ').replace("Z", ""));

            // Set HIGH_WATER_MARK_VALUE if a greater value is detected.
            if (maxHighWaterMarkValue != null && maxHighWaterMarkValue.after(oldDate)) {
                setHighWaterMark(tableNm, dtFormat.format(maxHighWaterMarkValue).replace(" ", "T").concat("Z"));
            }
        }
        else {
            dataset.toJSON().write().text(clusterAddress);
        }

        con.close();
        jsc.close();
    }
}
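A quick way to inspect what that filter compiles to (standard Spark Dataset.explain(), not something from the original post) is to print the query plan right after building the filtered dataset, reusing dataset, highWaterCol, and oldTime from the code above:

// Diagnostic sketch: explain(true) prints the parsed, analyzed, optimized,
// and physical plans, showing how the $greater comparison reaches the source.
Dataset<Row> filtered = dataset.filter(dataset.col(highWaterCol).$greater(oldTime));
filtered.explain(true);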
I fixed this by adding .persist() for the Dataset:
/* ********Read data from MongoDB******* */
Dataset<Row> dataset = MongoSpark.load(jsc).toDF().persist();
....
..
...
Dataset<Row> filtered = dataset.filter(dataset.col(highWaterCol).$greater(old)).persist();
I don't know why the filter doesn't run over the whole dataset without persist().
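A plausible reason (my reading, not confirmed in the original post): Spark evaluates datasets lazily, so without persisting the source, each action rebuilds the lineage and triggers a fresh read from MongoDB. Here the JSON write and the MAX aggregation are separate actions, and two reads of a live collection are not guaranteed to see identical rows; persist() materializes one snapshot that both actions share. A minimal sketch of the same fix with an explicit storage level (StorageLevel.MEMORY_AND_DISK is my choice, the original uses the default), reusing the variables from the code above:

import org.apache.spark.storage.StorageLevel;

// Cache the Mongo snapshot once so both actions below see the same rows.
Dataset<Row> dataset = MongoSpark.load(jsc).toDF()
        .persist(StorageLevel.MEMORY_AND_DISK());

Dataset<Row> filtered = dataset
        .filter(dataset.col(highWaterCol).$greater(oldTime))
        .persist(StorageLevel.MEMORY_AND_DISK());

filtered.toJSON().write().text(clusterAddress);                      // action 1: write the rows
Row maxRow = filtered.agg(max(filtered.col(highWaterCol))).first();  // action 2: compute the new mark

filtered.unpersist();  // release the cache after both actions complete
dataset.unpersist();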