Java 如何编写用户定义的聚合函数?
我试图理解 Java Spark 文档。其中有一个名为"非类型化用户定义聚合函数"(Untyped User-Defined Aggregate Functions)的部分,里面的示例代码我无法理解。代码如下:
package org.apache.spark.examples.sql;
// $example on:untyped_custom_aggregation$
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.MutableAggregationBuffer;
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off:untyped_custom_aggregation$
public class JavaUserDefinedUntypedAggregation {
// $example on:untyped_custom_aggregation$
public static class MyAverage extends UserDefinedAggregateFunction {
private StructType inputSchema;
private StructType bufferSchema;
public MyAverage() {
List<StructField> inputFields = new ArrayList<>();
inputFields.add(DataTypes.createStructField("inputColumn", DataTypes.LongType, true));
inputSchema = DataTypes.createStructType(inputFields);
List<StructField> bufferFields = new ArrayList<>();
bufferFields.add(DataTypes.createStructField("sum", DataTypes.LongType, true));
bufferFields.add(DataTypes.createStructField("count", DataTypes.LongType, true));
bufferSchema = DataTypes.createStructType(bufferFields);
}
// Data types of input arguments of this aggregate function
public StructType inputSchema() {
return inputSchema;
}
// Data types of values in the aggregation buffer
public StructType bufferSchema() {
return bufferSchema;
}
// The data type of the returned value
public DataType dataType() {
return DataTypes.DoubleType;
}
// Whether this function always returns the same output on the identical input
public boolean deterministic() {
return true;
}
// Initializes the given aggregation buffer. The buffer itself is a `Row` that in addition to
// standard methods like retrieving a value at an index (e.g., get(), getBoolean()), provides
// the opportunity to update its values. Note that arrays and maps inside the buffer are still
// immutable.
public void initialize(MutableAggregationBuffer buffer) {
buffer.update(0, 0L);
buffer.update(1, 0L);
}
// Updates the given aggregation buffer `buffer` with new input data from `input`
public void update(MutableAggregationBuffer buffer, Row input) {
if (!input.isNullAt(0)) {
long updatedSum = buffer.getLong(0) + input.getLong(0);
long updatedCount = buffer.getLong(1) + 1;
buffer.update(0, updatedSum);
buffer.update(1, updatedCount);
}
}
// Merges two aggregation buffers and stores the updated buffer values back to `buffer1`
public void merge(MutableAggregationBuffer buffer1, Row buffer2) {
long mergedSum = buffer1.getLong(0) + buffer2.getLong(0);
long mergedCount = buffer1.getLong(1) + buffer2.getLong(1);
buffer1.update(0, mergedSum);
buffer1.update(1, mergedCount);
}
// Calculates the final result
public Double evaluate(Row buffer) {
return ((double) buffer.getLong(0)) / buffer.getLong(1);
}
}
// $example off:untyped_custom_aggregation$
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("Java Spark SQL user-defined DataFrames aggregation example")
.getOrCreate();
// $example on:untyped_custom_aggregation$
// Register the function to access it
spark.udf().register("myAverage", new MyAverage());
Dataset<Row> df = spark.read().json("examples/src/main/resources/employees.json");
df.createOrReplaceTempView("employees");
df.show();
// +-------+------+
// | name|salary|
// +-------+------+
// |Michael| 3000|
// | Andy| 4500|
// | Justin| 3500|
// | Berta| 4000|
// +-------+------+
Dataset<Row> result = spark.sql("SELECT myAverage(salary) as average_salary FROM employees");
result.show();
// +--------------+
// |average_salary|
// +--------------+
// | 3750.0|
// +--------------+
// $example off:untyped_custom_aggregation$
spark.stop();
}
}
package org.apache.spark.examples.sql;
//$example on:untyped_custom_聚合$
导入java.util.ArrayList;
导入java.util.List;
导入org.apache.spark.sql.Dataset;
导入org.apache.spark.sql.Row;
导入org.apache.spark.sql.SparkSession;
导入org.apache.spark.sql.expressions.MutableAggregationBuffer;
导入org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
导入org.apache.spark.sql.types.DataType;
导入org.apache.spark.sql.types.DataTypes;
导入org.apache.spark.sql.types.StructField;
导入org.apache.spark.sql.types.StructType;
//$example off:非类型化\自定义\聚合$
公共类 JavaUserDefinedUntypedAggregation {
//$example on:untyped_custom_聚合$
公共静态类MyAverage扩展了UserDefinedAggregateFunction{
私有结构类型inputSchema;
私有结构类型缓冲模式;
公共构造函数 MyAverage() {
List inputFields=new ArrayList();
add(DataTypes.createStructField(“inputColumn”,DataTypes.LongType,true));
inputSchema=DataTypes.createStructType(inputFields);
List bufferFields=new ArrayList();
add(DataTypes.createStructField(“sum”,DataTypes.LongType,true));
add(DataTypes.createStructField(“count”,DataTypes.LongType,true));
bufferSchema=DataTypes.createStructType(bufferFields);
}
//此聚合函数的输入参数的数据类型
公共StructType inputSchema(){
返回输入模式;
}
//聚合缓冲区中值的数据类型
公共StructType bufferSchema(){
返回缓冲区模式;
}
//返回值的数据类型
公共数据类型数据类型(){
返回DataTypes.DoubleType;
}
//此函数是否总是在相同的输入上返回相同的输出
公共布尔确定性(){
返回true;
}
//初始化给定的聚合缓冲区。缓冲区本身是除
//标准方法,如在索引处检索值(例如get()、getBoolean()),提供
//更新其值的机会。请注意,缓冲区内的数组和映射仍然是
//不变的。
公共无效初始化(MutableAggregationBuffer buffer) {
buffer.update(0, 0L);
buffer.update(1, 0L);
}
//使用来自“input”的新输入数据更新给定聚合缓冲区“buffer”`
公共无效更新(MutableAggregationBuffer,行输入){
如果(!input.isNullAt(0)){
long updatedSum=buffer.getLong(0)+input.getLong(0);
long updatedCount=buffer.getLong(1)+1;
update(0,updatedSum);
buffer.update(1,updatedCount);
}
}
//合并两个聚合缓冲区,并将更新的缓冲区值存储回'buffer1'`
公共无效合并(MutableAggregationBuffer1,行buffer2){
long mergedSum=buffer1.getLong(0)+buffer2.getLong(0);
long mergedCount=buffer1.getLong(1)+buffer2.getLong(1);
buffer1.update(0,mergedSum);
buffer1.update(1,mergedCount);
}
//计算最终结果
公共双计算(行缓冲区){
返回((双精度)buffer.getLong(0))/buffer.getLong(1);
}
}
//$example off:非类型化\自定义\聚合$
公共静态void main(字符串[]args){
火花会话火花=火花会话
.builder()
.appName(“Java Spark SQL用户定义数据帧聚合示例”)
.getOrCreate();
//$example on:untyped_custom_聚合$
//注册函数以访问它
spark.udf().register("myAverage", new MyAverage());
Dataset df=spark.read().json(“examples/src/main/resources/employees.json”);
df.createOrReplaceTempView(“员工”);
df.show();
// +-------+------+
//|姓名|工资|
// +-------+------+
//|迈克尔| 3000|
//|安迪| 4500|
//|贾斯汀| 3500|
//|贝尔塔| 4000|
// +-------+------+
Dataset result = spark.sql("SELECT myAverage(salary) as average_salary FROM employees");
result.show();
// +--------------+
//|平均工资|
// +--------------+
// | 3750.0|
// +--------------+
//$example off:非类型化\自定义\聚合$
spark.stop();
}
}
我对上述代码的疑问如下:
- 每当我想要创建一个 UDAF 时,是否都必须实现 initialize、update 和 merge 这几个函数?
- 变量 inputSchema 和 bufferSchema 的意义是什么?我很惊讶它们的存在,因为它们从未被用于创建任何 DataFrame。它们应该出现在每个 UDAF 中吗?如果是,名称必须与示例中完全相同吗?
- 为什么 inputSchema 和 bufferSchema 的 getter 没有被命名为 getInputSchema() 和 getBufferSchema()?为什么这些变量没有 setter?
- 名为 deterministic() 的函数的意义是什么?请给出一个该函数派上用场的场景。
initialize、update 和 merge 这些方法属于用户定义聚合函数(User-Defined Aggregate Function,简称 UDAF)的契约。
UDF 是一种作用于单行并(通常)生成一行输出的函数(例如 upper 函数):
UDAF 则是一种作用于多行、将它们聚合为(通常)一行输出的函数(例如 avg 函数):
// Scala snippets quoted from the answer; fragments of a UDAF class, not a complete program.
// A plain UDF: transforms one value per row (contrast with the UDAF members below).
val myUpper = udf { (s: String) => s.toUpperCase }
// Data types of input arguments of this aggregate function
def inputSchema: StructType = StructType(StructField("inputColumn", LongType) :: Nil)
// Data types of values in the aggregation buffer
def bufferSchema: StructType = {
StructType(StructField("sum", LongType) :: StructField("count", LongType) :: Nil)
}
// Whether this function always returns the same output on the identical input
def deterministic: Boolean = true