Java 如何根据数据集的一个属性递归拆分数据集?
我想根据数据集的一个属性(Java 如何根据数据集的一个属性递归拆分数据集?,java,algorithm,dataframe,apache-spark,recursion,Java,Algorithm,Dataframe,Apache Spark,Recursion,我想根据数据集的一个属性(attVal)递归拆分数据集,并基于其他数据集属性值,如下所示:给定初始数据集ds1,以及要用ds2拆分的数据集,ds3,…,我想按min(df2)和max(df2)拆分,min(df3)和max(df3)。。。数据集ds1如下所示: Split by min(df2) and max(df2): min(df1)-----------------------------------------------max(df1) min(df2
attVal
)递归拆分数据集,并基于其他数据集属性值,如下所示:给定初始数据集ds1
,以及要用ds2
拆分的数据集,ds3
,…,我想按min(df2)
和max(df2)
拆分,min(df3)
和max(df3)
。。。数据集ds1
如下所示:
Split by min(df2) and max(df2):
min(df1)-----------------------------------------------max(df1)
min(df2)----------------max(df2)
或按最大值(df2)
拆分:
按最小值(df2)
拆分:
下一次拆分将使用上一步的拆分部分和min(df3)&min(df3)
值(后续步骤使用ds4
…)等)。然后返回所有生成的零件。
为此,我创建了以下Java方法:
public static ArrayList<Dataset<Row>> mySplittingMethod(Dataset<Row> df1, ArrayList<Dataset<Row>> allSplittingDf){
ArrayList<Dataset<Row>> partsOfSplits = null;
long df1Max = df1.select(max(df1.col("attVal"))).first().getLong(0);
long df1Min = df1.select(min(df1.col("attVal"))).first().getLong(0);
for (Dataset<Row> currentDf : allSplittingDf) {
df2 = currentDf;
long df2Max = df2.select(max(df2.col("attVal"))).first().getLong(0);
long df2Min = df2.select(min(df2.col("attVal"))).first().getLong(0);
if (df1Min < df2Min && df2Min < df1Max && df1Max < df2Max)
{
Dataset<Row> firstDf = df1.where("attVal<= df2Min");
mySplittingMethod(firstDf,allSplittingDf);
Dataset<Row> secondDf = df1.where("attVal> df2Min");
mySplittingMethod(secondDf,allSplittingDf);
partsOfSplits.add(firstDf);
partsOfSplits.add(secondDf);
} else if (df1Min > df2Min && df1Min < df2Max && df2Max < df1Max)
{
Dataset<Row> firstDf = df1.where("attVal<= df2Max");
mySplittingMethod(firstDf,allSplittingDf);
Dataset<Row> secondDf = df1.where("attVal> df2Max");
mySplittingMethod(secondDf,allSplittingDf);
partsOfSplits.add(firstDf);
partsOfSplits.add(secondDf);
} else if (df1Min > df2Min && df2Max < df1Max)
{
Dataset<Row> firstDf = df1.where("attVal<= df2Min");
mySplittingMethod(firstDf,allSplittingDf);
Dataset<Row> secondDf = df1.where("attVal>= df2Min && attVal<= df2Max");
mySplittingMethod(secondDf,allSplittingDf);
Dataset<Row> thirdDf = df1.where("attVal> df2Max");
mySplittingMethod(thirdDf,allSplittingDf);
partsOfSplits.add(firstDf);
partsOfSplits.add(secondDf);
partsOfSplits.add(thirdDf);
}
else continue;
}
return partsOfSplits;
}
publicstaticarraylistmysplittingmethod(数据集df1,arraylistallsplittingdf){
ArrayList partsOfSplits=null;
long df1Max=df1.select(max(df1.col(“attVal”)).first().getLong(0);
long df1Min=df1.select(min(df1.col(“attVal”)).first().getLong(0);
对于(数据集currentDf:allSplittingDf){
df2=电流DF;
long df2Max=df2.select(max(df2.col(“attVal”)).first().getLong(0);
long df2Min=df2.select(min(df2.col(“attVal”)).first().getLong(0);
if(df1Mindf2Min&&df1Mindf2Min&&df2Max
我的问题是:从算法上讲,我的算法正确吗?还有没有办法使它更紧凑一点
min(df1)------------------------------------max(df1)
min(df2)----------------------------------max(df2)
public static ArrayList<Dataset<Row>> mySplittingMethod(Dataset<Row> df1, ArrayList<Dataset<Row>> allSplittingDf){
ArrayList<Dataset<Row>> partsOfSplits = null;
long df1Max = df1.select(max(df1.col("attVal"))).first().getLong(0);
long df1Min = df1.select(min(df1.col("attVal"))).first().getLong(0);
for (Dataset<Row> currentDf : allSplittingDf) {
df2 = currentDf;
long df2Max = df2.select(max(df2.col("attVal"))).first().getLong(0);
long df2Min = df2.select(min(df2.col("attVal"))).first().getLong(0);
if (df1Min < df2Min && df2Min < df1Max && df1Max < df2Max)
{
Dataset<Row> firstDf = df1.where("attVal<= df2Min");
mySplittingMethod(firstDf,allSplittingDf);
Dataset<Row> secondDf = df1.where("attVal> df2Min");
mySplittingMethod(secondDf,allSplittingDf);
partsOfSplits.add(firstDf);
partsOfSplits.add(secondDf);
} else if (df1Min > df2Min && df1Min < df2Max && df2Max < df1Max)
{
Dataset<Row> firstDf = df1.where("attVal<= df2Max");
mySplittingMethod(firstDf,allSplittingDf);
Dataset<Row> secondDf = df1.where("attVal> df2Max");
mySplittingMethod(secondDf,allSplittingDf);
partsOfSplits.add(firstDf);
partsOfSplits.add(secondDf);
} else if (df1Min > df2Min && df2Max < df1Max)
{
Dataset<Row> firstDf = df1.where("attVal<= df2Min");
mySplittingMethod(firstDf,allSplittingDf);
Dataset<Row> secondDf = df1.where("attVal>= df2Min && attVal<= df2Max");
mySplittingMethod(secondDf,allSplittingDf);
Dataset<Row> thirdDf = df1.where("attVal> df2Max");
mySplittingMethod(thirdDf,allSplittingDf);
partsOfSplits.add(firstDf);
partsOfSplits.add(secondDf);
partsOfSplits.add(thirdDf);
}
else continue;
}
return partsOfSplits;
}