Java SparkSQL使用UUID生成新列
我必须添加一个值为UUID的新列。我已经使用Spark 1.4和Java通过以下代码完成了这项工作：
// Spark 1.x approach: rebuild the schema with one extra string column, then
// map every Row to a widened Row whose last cell is a fresh UUID.
// NOTE(review): inputDataFrame, leftCol and objsqlContext are defined
// elsewhere in the surrounding program — confirm against the caller.
StructType objStructType = inputDataFrame.schema();
// Copy the existing fields and append the new UUID column (nullable string).
// The original code built three intermediate lists for this; one suffices.
List<StructField> newfields = new ArrayList<StructField>(Arrays.asList(objStructType.fields()));
newfields.add(DataTypes.createStructField(leftCol, DataTypes.StringType, true));
final int size = objStructType.size();
JavaRDD<Row> rowRDD = inputDataFrame.javaRDD().map(new Function<Row, Row>() {
    private static final long serialVersionUID = 3280804931696581264L;
    public Row call(Row tblRow) throws Exception {
        Object[] newRow = new Object[size + 1];
        // Copy every existing cell. Null cells simply stay null, so the
        // original's explicit null check was redundant and is dropped.
        int rowSize = tblRow.length();
        for (int itr = 0; itr < rowSize; itr++) {
            newRow[itr] = tblRow.apply(itr);
        }
        // Last column: one random UUID per row.
        newRow[size] = UUID.randomUUID().toString();
        return RowFactory.create(newRow);
    }
});
inputDataFrame = objsqlContext.createDataFrame(rowRDD, DataTypes.createStructType(newfields));
StructType objStructType = inputDataFrame.schema();
StructField[] arrStructField = objStructType.fields();
List<StructField> fields = new ArrayList<StructField>();
List<StructField> newfields = new ArrayList<StructField>();
List<StructField> listFields = Arrays.asList(arrStructField);
StructField a = DataTypes.createStructField(leftCol, DataTypes.StringType, true);
fields.add(a);
newfields.addAll(listFields);
newfields.addAll(fields);
final int size = objStructType.size();
JavaRDD<Row> rowRDD = inputDataFrame.javaRDD().map(new Function<Row, Row>() {
    private static final long serialVersionUID = 3280804931696581264L;
    public Row call(Row tblRow) throws Exception {
        Object[] newRow = new Object[size + 1];
        int rowSize = tblRow.length();
        for (int itr = 0; itr < rowSize; itr++) {
            if (tblRow.apply(itr) != null) {
                newRow[itr] = tblRow.apply(itr);
            }
        }
        newRow[size] = UUID.randomUUID().toString();
        return RowFactory.create(newRow);
    }
});
inputDataFrame = objsqlContext.createDataFrame(rowRDD, DataTypes.createStructType(newfields));
我想知道在Spark 2中是否有一些更简洁的方法。请提供建议。

回答：您可以注册一个udf以生成UUID，并使用callUDF函数向inputDataFrame添加新列。请参阅以下使用Spark 2.0的示例代码：
/**
 * Spark 2.0 sample: appends a UUID column to a DataFrame by registering a
 * UDF and invoking it through callUDF in a select.
 */
public class SparkUUIDSample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("SparkUUIDSample").master("local[*]").getOrCreate();
        // Build a tiny two-row (key, value) input dataset.
        List<Tuple2<String, String>> rows = new ArrayList<Tuple2<String, String>>();
        rows.add(new Tuple2<String, String>("A", "v1"));
        rows.add(new Tuple2<String, String>("B", "v2"));
        Dataset<Row> frame = spark.createDataset(rows, Encoders.tuple(Encoders.STRING(), Encoders.STRING())).toDF("key", "value");
        frame.show();
        // Register a UDF that ignores its string argument and yields a random UUID.
        UDF1<String, String> uuidUdf = ignored -> UUID.randomUUID().toString();
        spark.udf().register("uuid", uuidUdf, DataTypes.StringType);
        // Select every original column plus the generated UUID column.
        frame.select(col("*"), callUDF("uuid", col("value"))).show();
        // Release the local Spark session.
        spark.stop();
    }
}
public class SparkUUIDSample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("SparkUUIDSample").master("local[*]").getOrCreate();
        // 样本输入数据
        List<Tuple2<String, String>> inputList = new ArrayList<Tuple2<String, String>>();
        inputList.add(new Tuple2<String, String>("A", "v1"));
        inputList.add(new Tuple2<String, String>("B", "v2"));
        // 数据集
        Dataset<Row> df = spark.createDataset(inputList, Encoders.tuple(Encoders.STRING(), Encoders.STRING())).toDF("key", "value");
        df.show();
        // 注册udf
        UDF1<String, String> uuid = str -> UUID.randomUUID().toString();
        spark.udf().register("uuid", uuid, DataTypes.StringType);
        // 调用udf
        df.select(col("*"), callUDF("uuid", col("value"))).show();
        // 停止
        spark.stop();
    }
}