Java 如何编写用户定义的聚合函数?

Java 如何编写用户定义的聚合函数?,java,apache-spark,apache-spark-sql,Java,Apache Spark,Apache Spark Sql,我试图理解 Java Spark 文档。有一个名为"非类型化用户定义聚合函数"的部分,其中有一些示例代码我无法理解。代码如下:

我试图理解JavaSpark文档。有一个名为非类型化用户定义聚合函数的部分,其中有一些示例代码我无法理解。代码如下:

package org.apache.spark.examples.sql;

// $example on:untyped_custom_aggregation$
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.MutableAggregationBuffer;
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off:untyped_custom_aggregation$

public class JavaUserDefinedUntypedAggregation {

  // $example on:untyped_custom_aggregation$
  public static class MyAverage extends UserDefinedAggregateFunction {

    private StructType inputSchema;
    private StructType bufferSchema;

    public MyAverage() {
      List<StructField> inputFields = new ArrayList<>();
      inputFields.add(DataTypes.createStructField("inputColumn", DataTypes.LongType, true));
      inputSchema = DataTypes.createStructType(inputFields);

      List<StructField> bufferFields = new ArrayList<>();
      bufferFields.add(DataTypes.createStructField("sum", DataTypes.LongType, true));
      bufferFields.add(DataTypes.createStructField("count", DataTypes.LongType, true));
      bufferSchema = DataTypes.createStructType(bufferFields);
    }
    // Data types of input arguments of this aggregate function
    public StructType inputSchema() {
      return inputSchema;
    }
    // Data types of values in the aggregation buffer
    public StructType bufferSchema() {
      return bufferSchema;
    }
    // The data type of the returned value
    public DataType dataType() {
      return DataTypes.DoubleType;
    }
    // Whether this function always returns the same output on the identical input
    public boolean deterministic() {
      return true;
    }
    // Initializes the given aggregation buffer. The buffer itself is a `Row` that in addition to
    // standard methods like retrieving a value at an index (e.g., get(), getBoolean()), provides
    // the opportunity to update its values. Note that arrays and maps inside the buffer are still
    // immutable.
    public void initialize(MutableAggregationBuffer buffer) {
      buffer.update(0, 0L);
      buffer.update(1, 0L);
    }
    // Updates the given aggregation buffer `buffer` with new input data from `input`
    public void update(MutableAggregationBuffer buffer, Row input) {
      if (!input.isNullAt(0)) {
        long updatedSum = buffer.getLong(0) + input.getLong(0);
        long updatedCount = buffer.getLong(1) + 1;
        buffer.update(0, updatedSum);
        buffer.update(1, updatedCount);
      }
    }
    // Merges two aggregation buffers and stores the updated buffer values back to `buffer1`
    public void merge(MutableAggregationBuffer buffer1, Row buffer2) {
      long mergedSum = buffer1.getLong(0) + buffer2.getLong(0);
      long mergedCount = buffer1.getLong(1) + buffer2.getLong(1);
      buffer1.update(0, mergedSum);
      buffer1.update(1, mergedCount);
    }
    // Calculates the final result
    public Double evaluate(Row buffer) {
      return ((double) buffer.getLong(0)) / buffer.getLong(1);
    }
  }
  // $example off:untyped_custom_aggregation$

  public static void main(String[] args) {
    SparkSession spark = SparkSession
      .builder()
      .appName("Java Spark SQL user-defined DataFrames aggregation example")
      .getOrCreate();

    // $example on:untyped_custom_aggregation$
    // Register the function to access it
    spark.udf().register("myAverage", new MyAverage());

    Dataset<Row> df = spark.read().json("examples/src/main/resources/employees.json");
    df.createOrReplaceTempView("employees");
    df.show();
    // +-------+------+
    // |   name|salary|
    // +-------+------+
    // |Michael|  3000|
    // |   Andy|  4500|
    // | Justin|  3500|
    // |  Berta|  4000|
    // +-------+------+

    Dataset<Row> result = spark.sql("SELECT myAverage(salary) as average_salary FROM employees");
    result.show();
    // +--------------+
    // |average_salary|
    // +--------------+
    // |        3750.0|
    // +--------------+
    // $example off:untyped_custom_aggregation$

    spark.stop();
  }
}
package org.apache.spark.examples.sql;
//$example on:untyped_custom_聚合$
导入java.util.ArrayList;
导入java.util.List;
导入org.apache.spark.sql.Dataset;
导入org.apache.spark.sql.Row;
导入org.apache.spark.sql.SparkSession;
导入org.apache.spark.sql.expressions.MutableAggregationBuffer;
导入org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
导入org.apache.spark.sql.types.DataType;
导入org.apache.spark.sql.types.DataTypes;
导入org.apache.spark.sql.types.StructField;
导入org.apache.spark.sql.types.StructType;
//$example off:非类型化\自定义\聚合$
公共类 JavaUserDefinedUntypedAggregation {
//$example on:untyped_custom_聚合$
公共静态类MyAverage扩展了UserDefinedAggregateFunction{
私有结构类型inputSchema;
私有结构类型缓冲模式;
公共收入平均数(){
List inputFields=new ArrayList();
add(DataTypes.createStructField(“inputColumn”,DataTypes.LongType,true));
inputSchema=DataTypes.createStructType(inputFields);
List bufferFields=new ArrayList();
add(DataTypes.createStructField(“sum”,DataTypes.LongType,true));
add(DataTypes.createStructField(“count”,DataTypes.LongType,true));
bufferSchema=DataTypes.createStructType(bufferFields);
}
//此聚合函数的输入参数的数据类型
公共StructType inputSchema(){
返回输入模式;
}
//聚合缓冲区中值的数据类型
公共StructType bufferSchema(){
返回缓冲区模式;
}
//返回值的数据类型
公共数据类型数据类型(){
返回DataTypes.DoubleType;
}
//此函数是否总是在相同的输入上返回相同的输出
公共布尔确定性(){
返回true;
}
//初始化给定的聚合缓冲区。缓冲区本身是除
//标准方法,如在索引处检索值(例如get()、getBoolean()),提供
//更新其值的机会。请注意,缓冲区内的数组和映射仍然是
//不变的。
公共无效初始化(MutableAggregationBuffer){
缓冲区更新(0,0L);
缓冲区更新(1,0L);
}
//使用来自“input”的新输入数据更新给定聚合缓冲区“buffer”`
公共无效更新(MutableAggregationBuffer,行输入){
如果(!input.isNullAt(0)){
long updatedSum=buffer.getLong(0)+input.getLong(0);
long updatedCount=buffer.getLong(1)+1;
update(0,updatedSum);
buffer.update(1,updatedCount);
}
}
//合并两个聚合缓冲区,并将更新的缓冲区值存储回'buffer1'`
公共无效合并(MutableAggregationBuffer1,行buffer2){
long mergedSum=buffer1.getLong(0)+buffer2.getLong(0);
long mergedCount=buffer1.getLong(1)+buffer2.getLong(1);
buffer1.update(0,mergedSum);
buffer1.update(1,mergedCount);
}
//计算最终结果
公共双计算(行缓冲区){
返回((双精度)buffer.getLong(0))/buffer.getLong(1);
}
}
//$example off:非类型化\自定义\聚合$
公共静态void main(字符串[]args){
火花会话火花=火花会话
.builder()
.appName(“Java Spark SQL用户定义数据帧聚合示例”)
.getOrCreate();
//$example on:untyped_custom_聚合$
//注册函数以访问它
spark.udf().register(“myAverage”, new MyAverage());
Dataset df=spark.read().json(“examples/src/main/resources/employees.json”);
df.createOrReplaceTempView(“员工”);
df.show();
// +-------+------+
//|姓名|工资|
// +-------+------+
//|迈克尔| 3000|
//|安迪| 4500|
//|贾斯汀| 3500|
//|贝尔塔| 4000|
// +-------+------+
Dataset result=spark.sql(“SELECT myAverage(salary) as average_salary FROM employees”);
result.show();
// +--------------+
//|平均工资|
// +--------------+
// |        3750.0|
// +--------------+
//$example off:非类型化\自定义\聚合$
spark.stop();
}
}
我对上述代码的疑问如下:

  • 每当我想要创建一个UDF时,我应该使用函数
    初始化
    更新
    合并
  • 变量
    inputSchema
    bufferSchema
    的意义是什么?我很惊讶它们的存在,因为它们从未被用于创建任何数据帧。他们应该出现在每个UDF中吗?如果是,那么它们应该是完全相同的名称吗
  • 为什么
    inputSchema
    bufferSchema
    的getter没有命名为
    getInputSchema()
    getBufferSchema()
    ?为什么这些变量没有设置器
  • 这里称为
    deterministic()
    的函数的意义是什么?请给出调用此函数有用的场景
一般来说,我想知道如何在Spark中编写用户定义的聚合函数

每当我想要创建一个UDF时,我应该让函数初始化、更新和合并吗

UDF代表用户定义函数,而
初始化
更新
合并
方法则代表用户定义聚合函数(又称 UDAF)。

UDF是一种与单行一起工作以(通常)生成一行的函数(例如
upper
函数)

UDAF 则是一种与多行一起工作以(通常)生成一行结果的函数(例如聚合函数 sum 或 avg)。
// A plain (non-aggregate) UDF: maps one input value to one output value.
val myUpper = udf((s: String) => s.toUpperCase)

// Data types of the input arguments of this aggregate function
def inputSchema: StructType =
  StructType(Seq(StructField("inputColumn", LongType)))

// Data types of the values held in the aggregation buffer
def bufferSchema: StructType =
  StructType(Seq(
    StructField("sum", LongType),
    StructField("count", LongType)
  ))

// Whether this function always returns the same output for the identical input
def deterministic: Boolean = true