How do I use a JSON array as a Kafka record in a streaming query (Java)?

Tags: java, json, apache-spark, apache-kafka, spark-structured-streaming

I have gone through many examples of reading JSON data from a Kafka topic. I am able to do this successfully if I read a single record from the topic per connection, for example:

{"customer_id": "8d267162-1478-11ea-8d71-362b9e155667",
 "product": "Super widget",
 "price": 10,
 "bought_date": "2019-01-01"
}
The following code works for the above use case:

package io.example;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class Stackoverflow {

    public static void main(String[] args) throws StreamingQueryException {

        StructType schema = new StructType(new StructField[]{
                new StructField("customer_id", DataTypes.StringType, false, Metadata.empty()),  
                new StructField("product", DataTypes.StringType, false, Metadata.empty()),          
                new StructField("price", DataTypes.IntegerType, false, Metadata.empty()),               
                new StructField("bought_date", DataTypes.StringType, false, Metadata.empty()),
            });

        SparkSession spark = SparkSession
                .builder()
                .appName("SimpleExample")
                .getOrCreate();

        // Create a DataSet representing the stream of input lines from Kafka
        Dataset<Row> dataset = spark
                        .readStream()
                        .format("kafka")                
                        .option("kafka.bootstrap.servers", "localhost:9092")
                        .option("subscribe", "utilization")
                        .load()
                        .selectExpr("CAST(value AS STRING) as json");

        dataset.printSchema();

        Column col = new Column("json");

        Dataset<Row> customers = dataset.select(functions.from_json(col,schema).as("data")).select("data.*");           
        customers.printSchema();

        customers.writeStream()      
        .format("console")
        .start()
        .awaitTermination();

    }

}
The problem is that I am unable to unpack a JSON array of such objects and process it. In that case the value of a single Kafka record is a JSON array, for example (values are illustrative):
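[{"customer_id": "d6315a00", "product": "Super widget",  "price": 10, "bought_date": "2019-01-01"},
 {"customer_id": "d6315cd0", "product": "Food widget",   "price": 4,  "bought_date": "2019-01-01"}]

The following code fails: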

package io.example;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class Stackoverflow {

    public static void main(String[] args) throws StreamingQueryException {

        StructType schema = new StructType(new StructField[]{
                new StructField("customer_id", DataTypes.StringType, false, Metadata.empty()),  
                new StructField("product", DataTypes.StringType, false, Metadata.empty()),          
                new StructField("price", DataTypes.IntegerType, false, Metadata.empty()),               
                new StructField("bought_date", DataTypes.StringType, false, Metadata.empty()),
            });

        SparkSession spark = SparkSession
                .builder()
                .appName("SimpleExample")
                .getOrCreate();

        // Create a DataSet representing the stream of input lines from Kafka
        Dataset<Row> dataset = spark
                        .readStream()
                        .format("kafka")                
                        .option("kafka.bootstrap.servers", "localhost:9092")
                        .option("subscribe", "utilization")
                        .load()
                        .selectExpr("CAST(value AS STRING) as json");

        dataset.printSchema();

        Column col = new Column("json");

        Dataset<Row> customers = dataset.select(functions.from_json(col,schema).as("data"));            


        // This is the line that fails: from_json with a StructType produces a struct column,
        // and explode/explode_outer only accepts an array or map column
        Dataset<Row> data = customers.select(functions.explode_outer(functions.explode_outer(new Column("data"))));
        data.printSchema();

         data.writeStream()      
        .format("console")
        .start()
        .awaitTermination();
    }

}




Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot resolve 'explode(`data`)' due to data type mismatch: input to function explode should be array or map type, not struct<customer_id:string,product:string,price:int,bought_date:string>;;
Questions:

1) How do I write code that correctly unpacks the JSON array? I doubt that the approach I took in the failing code above is the best one, but I was trying to follow the many examples I have seen around functions.explode() etc.


2) If the failing code somehow does turn out to be the right approach, how do I convert the struct into an array or map?

Spark does not pull one record per connection. The Kafka API polls a batch of records at once.

As far as Kafka best practices go, multiple events should be split into multiple objects rather than stuffed into a single array, unless they really do need to be associated, e.g. a "cart" record containing a list of order "items".
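For example, such a "cart" record might look like the following (the field names here are purely illustrative):

{"cart_id": "c-001",
 "items": [
   {"product": "Super widget", "price": 10},
   {"product": "Food widget",  "price": 4}
 ]
}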

To make your code work, the schema must be an array (not a struct or a map), as shown below.
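That is, wrap the per-record struct schema in an ArrayType:

StructType schema = new StructType(new StructField[]{
        new StructField("customer_id", DataTypes.StringType, false, Metadata.empty()),
        new StructField("product", DataTypes.StringType, false, Metadata.empty()),
        new StructField("price", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("bought_date", DataTypes.StringType, false, Metadata.empty()),
    });

ArrayType arrSchema = new ArrayType(schema, false);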


Then use that array schema when calling from_json.
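In condensed form, the key lines are something like the following (the complete listing below does the same, with the array schema named arrayType):

Column data = functions.from_json(new Column("json"), arrSchema).as("data");
Dataset<Row> customers = dataset.select(functions.explode(data)).select("col.*");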

For completeness, the following code uses the above suggestion to achieve the desired result:

package io.example;


import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class Stackoverflow {

    public static void main(String[] args) throws StreamingQueryException {

        StructType schema = new StructType(new StructField[]{
                new StructField("customer_id", DataTypes.StringType, false, Metadata.empty()),  
                new StructField("product", DataTypes.StringType, false, Metadata.empty()),          
                new StructField("price", DataTypes.IntegerType, false, Metadata.empty()),               
                new StructField("bought_date", DataTypes.StringType, false, Metadata.empty())
            });

        ArrayType  arrayType = new ArrayType(schema, false);

        SparkSession spark = SparkSession
                .builder()
                .appName("SimpleExample")
                .getOrCreate();

        // Create a DataSet representing the stream of input lines from Kafka
        Dataset<Row> dataset = spark
                        .readStream()
                        .format("kafka")                
                        .option("kafka.bootstrap.servers", "localhost:9092")
                        .option("subscribe", "utilization")
                        .load()
                        .selectExpr("CAST(value AS STRING) as json");

        dataset.printSchema();

        Column col = new Column("json");
        Column data = functions.from_json(col, arrayType).as("data");
        // explode() turns each element of the JSON array into its own row; the generated
        // column is named "col" by default, so "col.*" flattens its struct fields out
        Column explode = functions.explode(data);
        Dataset<Row> customers = dataset.select(explode).select("col.*");
        customers.printSchema();

        customers.writeStream()      
        .format("console")
        .start()
        .awaitTermination();


    }

}


Batch: 77
-------------------------------------------
+-----------+-------------+-----+-----------+
|customer_id|      product|price|bought_date|
+-----------+-------------+-----+-----------+
|   d6315a00| Super widget|   10|2019-01-01 |
|   d6315cd0|  Food widget|    4| 2019-01-01|
|   d6315e2e|  Bike widget|   10| 2019-01-01|
|   d631614e|Garage widget|    4| 2019-01-01|
+-----------+-------------+-----+-----------+