JavaRDD函数的性能调优
我想使用 Java 和 Spark 1.6 版将 DataFrame 转换为 JSON 数组(转换链:DataFrame -> JSON -> RDD -> 数组)。数据看起来像这样:
[
{
"prtdy_pgm_x":"P818_C",
"prtdy_pgm_x":"P818",
"prtdy_attr_c":"Cost",
"prtdy_integer_r":0,
"prtdy_cds_d":"prxm",
"prtdy_created_s":"2018-05-12 04:12:19.0",
"prtdy_created_by_c":"brq",
"prtdy_create_proc_x":"w_pprtdy_security_t",
"snapshot_d":"2018-05-12-000018"
},
{
"prtdy_pgm_x":"P818_I",
"prtdy_pgm_x":"P818",
"prtdy_attr_c":"Tooling",
"prtdy_integer_r":0,
"prtdy_cds_d":"prxm",
"prtdy_created_s":"2018-05-12 04:12:20.0",
"prtdy_created_by_c":"brq",
"prtdy_create_proc_x":"w_pprtdy_security_t",
"snapshot_d":"2018-05-12-000018"
},
{
"prtdy_pgm_x":"P818_W",
"prtdy_pgm_x":"P818",
"prtdy_attr_c":"Weight",
"prtdy_integer_r":0,
"prtdy_cds_d":"prxm",
"prtdy_created_s":"2018-05-12 04:12:20.0",
"prtdy_created_by_c":"brq",
"prtdy_create_proc_x":"w_pprtdy_security_t",
"snapshot_d":"2018-05-12-000018"
},
......
]
所以我写了类似这样的代码
// Export each common Hive table as a single gzipped JSON-array file on HDFS.
if (cmnTableNames != null && cmnTableNames.length > 0)
{
    for (int i = 0; i < cmnTableNames.length; i++)
    {
        String cmnTableName = cmnTableNames[i];
        DataFrame cmnTableContent = null;
        if (cmnTableName.contains("PTR_security_t"))
        {
            // Partitioned table: restrict the scan to one snapshot partition.
            cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName + " where fbrn04_snapshot_d = '" + snapshotId + "'");
        }
        else
        {
            cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName);
        }
        String cmnTable = cmnTableName.substring(cmnTableName.lastIndexOf(".") + 1);
        // Cache so that count() and toJSON() below do not each re-run the Hive scan.
        cmnTableContent.cache();
        if (cmnTableContent.count() > 0)
        {
            String cmnStgTblDir = hdfsPath + "/staging/" + rptName + "/common/" + cmnTable;
            JavaRDD<String> cmnTblCntJson = cmnTableContent.toJSON().toJavaRDD();
            // BUG FIX: the old reduce((a, b) -> a + "," + b) built the result by
            // pairwise string concatenation — O(n^2) in total characters, and it
            // shipped ever-growing intermediate strings between executors. The
            // final result must fit on the driver anyway, so collect once and
            // join in a single linear pass instead.
            String result = String.join(",", cmnTblCntJson.collect());
            String output = "[" + result + "]";
            ArrayList<String> outputList = new ArrayList<String>();
            outputList.add(output);
            JavaRDD<String> finalOutputRDD = sc.parallelize(outputList);
            String cmnStgMrgdDir = cmnStgTblDir + "/mergedfile";
            // Reuse cmnStgMrgdDir instead of rebuilding the same path string.
            if (dfs.exists(new Path(cmnStgMrgdDir))) dfs.delete(new Path(cmnStgMrgdDir), true);
            finalOutputRDD.coalesce(1).saveAsTextFile(cmnStgMrgdDir, GzipCodec.class);
            fileStatus = dfs.getFileStatus(new Path(cmnStgMrgdDir + "/part-00000.gz"));
            dfs.setPermission(fileStatus.getPath(), FsPermission.createImmutable((short) 0770));
            dfs.rename(new Path(cmnStgMrgdDir + "/part-00000.gz"), new Path(CommonPath + "/" + cmnTable + ".json.gz"));
        }
        else
        {
            System.out.println("There are no records in " + cmnTableName);
        }
        // Release the cached table before moving on to the next one.
        cmnTableContent.unpersist();
    }
}
else
{
    System.out.println("The common table lists are null.");
}
sc.stop();
if(cmnTableNames != null && cmnTableNames.length > 0)
{
for(int i=0; i < cmnTableNames.length; i++)
{
String cmnTableName = cmnTableNames[i];
DataFrame cmnTableContent = null;
if(cmnTableName.contains("PTR_security_t"))
{
cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName + " where fbrn04_snapshot_d = '" + snapshotId + "'");
}
else
{
cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName);
}
String cmnTable = cmnTableName.substring(cmnTableName.lastIndexOf(".") + 1);
if (cmnTableContent.count() > 0)
{
String cmnStgTblDir = hdfsPath + "/staging/" + rptName + "/common/" + cmnTable;
JavaRDD<String> cmnTblCntJson = cmnTableContent.toJSON().toJavaRDD();
String result = cmnTblCntJson.reduce((ob1, ob2) -> (String)ob1+","+(String)ob2); // 此部分在数据量较大时耗时远超预期
String output = "["+result+"]";
ArrayList<String> outputList = new ArrayList<String>();
outputList.add(output);
JavaRDD<String> finalOutputRDD = sc.parallelize(outputList);
String cmnStgMrgdDir = cmnStgTblDir + "/mergedfile";
if(dfs.exists(new Path(cmnStgTblDir + "/mergedfile"))) dfs.delete(new Path(cmnStgTblDir + "/mergedfile"), true);
finalOutputRDD.coalesce(1).saveAsTextFile(cmnStgMrgdDir, GzipCodec.class);
fileStatus = dfs.getFileStatus(new Path(cmnStgMrgdDir + "/part-00000.gz"));
dfs.setPermission(fileStatus.getPath(),FsPermission.createImmutable((short) 0770));
dfs.rename(new Path(cmnStgMrgdDir + "/part-00000.gz"), new Path(CommonPath + "/" + cmnTable + ".json.gz"));
}
else
{
System.out.println("There are no records in " + cmnTableName);
}
}
}
else
{
System.out.println("The common table lists are null.");
}
sc.stop();
但是,当应用reduce函数时,需要花费更多的时间
JavaRDD<String> cmnTblCntJson = cmnTableContent.toJSON().toJavaRDD();
String result = cmnTblCntJson.reduce((ob1, ob2) -> (String)ob1 + "," + (String)ob2); // 这一部分在数据量较大时耗时远超预期
分区为“PTR_security_t”的表非常庞大,与其他没有分区的表相比需要花费大量时间(588kb的表需要40-50分钟)
我尝试应用Lambda,但最终出现了Task not serializable错误。检查下面的代码
// FIX (Task not serializable): the previous version distributed the table
// names as an RDD and called hiveContext / sc inside foreach(). SparkContext
// and HiveContext exist only on the driver and cannot be serialized into
// executor tasks — hence the error. The table-name list is tiny, so iterate
// over it on the driver; each iteration still runs its heavy work (the SQL
// scan and the save) as distributed Spark jobs.
if (cmnTableNames != null && cmnTableNames.length > 0)
{
    List<String> commonTableList = Arrays.asList(cmnTableNames);
    for (String cmnTableName : commonTableList)
    {
        DataFrame cmnTableContent = null;
        if (cmnTableName.contains("PTR_security_t"))
        {
            // Partitioned table: restrict the scan to one snapshot partition.
            cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName + " where fbrn04_snapshot_d = '" + snapshotId + "'");
        }
        else
        {
            cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName);
        }
        String cmnTable = cmnTableName.substring(cmnTableName.lastIndexOf(".") + 1);
        // Cache so count() and toJSON() do not each re-run the Hive scan.
        cmnTableContent.cache();
        if (cmnTableContent.count() > 0)
        {
            String cmnStgTblDir = hdfsPath + "/staging/" + rptName + "/common/" + cmnTable;
            JavaRDD<String> cmnTblCntJson = cmnTableContent.toJSON().toJavaRDD();
            // Linear-time join on the driver instead of the O(n^2) pairwise reduce.
            String result = String.join(",", cmnTblCntJson.collect());
            String output = "[" + result + "]";
            ArrayList<String> outputList = new ArrayList<String>();
            outputList.add(output);
            JavaRDD<String> finalOutputRDD = sc.parallelize(outputList);
            String cmnStgMrgdDir = cmnStgTblDir + "/mergedfile";
            if (dfs.exists(new Path(cmnStgMrgdDir))) dfs.delete(new Path(cmnStgMrgdDir), true);
            finalOutputRDD.coalesce(1).saveAsTextFile(cmnStgMrgdDir, GzipCodec.class);
            fileStatus = dfs.getFileStatus(new Path(cmnStgMrgdDir + "/part-00000.gz"));
            dfs.setPermission(fileStatus.getPath(), FsPermission.createImmutable((short) 0770));
            dfs.rename(new Path(cmnStgMrgdDir + "/part-00000.gz"), new Path(CommonPath + "/" + cmnTable + ".json.gz"));
        }
        else
        {
            System.out.println("There are no records in " + cmnTableName);
        }
        // Release the cached table before moving on to the next one.
        cmnTableContent.unpersist();
    }
}
else
{
    System.out.println("The common table lists are null.");
}
sc.stop();
if(cmnTableNames != null && cmnTableNames.length > 0)
{
List<String> commonTableList = Arrays.asList(cmnTableNames);
DataFrame commonTableDF = sqc.createDataset(commonTableList,Encoders.STRING()).toDF();
commonTableDF.toJavaRDD().foreach(cmnTableNameRDD -> {
DataFrame cmnTableContent = null;
String cmnTableName = cmnTableNameRDD.mkString();
if(cmnTableName.contains("PTR_security_t"))
{
cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName + " where fbrn04_snapshot_d = '" + snapshotId + "'");
}
else
{
cmnTableContent = hiveContext.sql("SELECT * FROM " + cmnTableName);
}
String cmnTable = cmnTableName.substring(cmnTableName.lastIndexOf(".") + 1);
if (cmnTableContent.count() > 0)
{
String cmnStgTblDir = hdfsPath + "/staging/" + rptName + "/common/" + cmnTable;
JavaRDD<String> cmnTblCntJson = cmnTableContent.toJSON().toJavaRDD();
String result = cmnTblCntJson.reduce((ob1, ob2) -> (String)ob1+","+(String)ob2);
String output = "["+result+"]";
ArrayList<String> outputList = new ArrayList<String>();
outputList.add(output);
JavaRDD<String> finalOutputRDD = sc.parallelize(outputList);
String cmnStgMrgdDir = cmnStgTblDir + "/mergedfile";
if(dfs.exists(new Path(cmnStgTblDir + "/mergedfile"))) dfs.delete(new Path(cmnStgTblDir + "/mergedfile"), true);
finalOutputRDD.coalesce(1).saveAsTextFile(cmnStgMrgdDir, GzipCodec.class);
fileStatus = dfs.getFileStatus(new Path(cmnStgMrgdDir + "/part-00000.gz"));
dfs.setPermission(fileStatus.getPath(),FsPermission.createImmutable((short) 0770));
dfs.rename(new Path(cmnStgMrgdDir + "/part-00000.gz"), new Path(CommonPath + "/" + cmnTable + ".json.gz"));
}
else
{
System.out.println("There are no records in " + cmnTableName);
}
});
}
else
{
System.out.println("The common table lists are null.");
}
sc.stop();
有什么有效的方法可以提高性能吗?我认为你的代码可以正常工作,但你希望提高性能。请特别检查"问题中提供的代码有什么问题"部分:
.reduce((ob1, ob2) -> (String)ob1 + "," + (String)ob2)
是个坏主意,尽管代码运行正常:逐对拼接字符串的总开销是 O(n^2)。