Spark Java：使用 java.util.Map 类型创建 Row 时出现问题
标签：java, apache-spark, apache-spark-sql
使用 Spark 2.1，我创建了一个包含 MapType 列的数据集：
// Schema with three scalar columns (id, words, label) plus a non-nullable
// map<string,int> column named "features".
StructType schema = new StructType(new StructField[]{
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("words", DataTypes.StringType, false, Metadata.empty()),
new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("features", DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType), false, Metadata.empty())
});
// Build a single row whose fourth value is a plain java.util.HashMap.
Map<String,Integer> abc = new HashMap<String,Integer>();
abc.put("abc", 1);
Row r = RowFactory.create(0, "Hi these are words ", 1, abc);
List<Row> data = Arrays.asList(r);
// Create the DataFrame directly from the row list and the schema, then print it.
Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);
wordDataFrame.show();
StructType架构=新的StructType(新的StructField[]{
新建StructField(“id”,DataTypes.IntegerType,false,Metadata.empty()),
new StructField(“words”,DataTypes.StringType,false,Metadata.empty()),
新建StructField(“label”,DataTypes.IntegerType,false,Metadata.empty()),
新建StructField(“features”、DataTypes.createMapType(DataTypes.StringType、DataTypes.IntegerType)、false、Metadata.empty()
});
Map abc=新的HashMap();
abc.put(“abc”,1);
行r=行工厂。创建(0,“您好,这些是单词”,1,abc);
列表数据=数组.asList(r);
Dataset wordDataFrame=spark.createDataFrame(数据,模式);
show();
上述代码工作正常。
但是，当我对这个数据集调用 map 函数（用一个新的 HashMap 替换 MapType 列的值）时，出现了以下错误：
// Output schema: same scalar columns, with the map<string,int> column named "featuresNew".
StructType schema = new StructType(new StructField[]{
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("words", DataTypes.StringType, false, Metadata.empty()),
new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("featuresNew", DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType), false, Metadata.empty())
});
// Row encoder derived from the schema; it is handed to Dataset.map below.
ExpressionEncoder<Row> encoder = RowEncoder.apply(schema);
Dataset<Row> output = input.map(new MapFunction<Row, Row>() {
@Override
public Row call(Row row) throws Exception {
// The map column is filled with a java.util.HashMap. This is the snippet
// reported to fail with "java.util.HashMap is not a valid external type
// for schema of map<string,int>".
Map<String, Integer> newMap = new HashMap<String, Integer>();
newMap.put("Transformed string", 1);
return RowFactory.create(row.getInt(0), row.getString(1), row.getInt(2), newMap);
}
}, encoder);
return output;
StructType架构=新的StructType(新的StructField[]{
新建StructField(“id”,DataTypes.IntegerType,false,Metadata.empty()),
new StructField(“words”,DataTypes.StringType,false,Metadata.empty()),
新建StructField(“label”,DataTypes.IntegerType,false,Metadata.empty()),
新建StructField(“featuresNew”、DataTypes.createMapType(DataTypes.StringType、DataTypes.IntegerType)、false、Metadata.empty()
});
expressionEncoderEncoder=RowEncoder.apply(模式);
数据集输出=input.map(新的MapFunction(){
@凌驾
公用行调用(行)引发异常{
Map newMap=newhashmap();
newMap.put(“转换字符串”,1);
返回RowFactory.create(row.getInt(0)、row.getString(1)、row.getInt(2)、newMap);
}
},编码器);
返回输出;
错误堆栈:
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.RuntimeException: java.util.HashMap is not a valid external type for schema of map<string,int>
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(generated.java:410)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
线程“main”org.apache.spark.sparkeexception中的异常:作业因阶段失败而中止:阶段0.0中的任务0失败1次,最近的失败:阶段0.0中的任务0.0丢失(TID 0,localhost,executor driver):java.lang.RuntimeException:java.util.HashMap不是映射架构的有效外部类型
位于org.apache.spark.sql.catalyst.expressions.GeneratedClass$GenerateEditor.processNext(generated.java:410)
位于org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
位于org.apache.spark.sql.execution.whisttagecodegenexec$$anonfun$8$$anon$1.hasNext(whisttagecodegenexec.scala:377)
位于org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
位于org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
位于org.apache.spark.rdd.rdd$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(rdd.scala:826)
位于org.apache.spark.rdd.rdd$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(rdd.scala:826)
在org.apache.spark.rdd.MapPartitionsRDD.compute上(MapPartitionsRDD.scala:38)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:323)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:287)
位于org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
位于org.apache.spark.scheduler.Task.run(Task.scala:99)
位于org.apache.spark.executor.executor$TaskRunner.run(executor.scala:282)
位于java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
位于java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
运行(Thread.java:745)
我遗漏了什么？为什么会出现 “java.util.HashMap is not a valid external type for schema of map<string,int>”（java.util.HashMap 不是 map<string,int> 模式的有效外部类型）这个错误？
编辑:
我还尝试了 java.util.List 类型（对应 array<string> 列）：
// Output schema: the "featuresNew" column is now a non-nullable array<string>.
StructType schema = new StructType(new StructField[]{
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("words", DataTypes.StringType, false, Metadata.empty()),
new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("featuresNew", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
});
// Row encoder derived from the schema; it is handed to Dataset.map below.
ExpressionEncoder<Row> encoder = RowEncoder.apply(schema);
Dataset<Row> output = input.map(new MapFunction<Row, Row>() {
@Override
public Row call(Row row) throws Exception {
// The array column is filled with the list returned by Arrays.asList. This is
// the snippet reported to fail with "java.util.Arrays$ArrayList is not a valid
// external type for schema of array<string>".
List<String> xyz = Arrays.asList("Hi", "how", "now");
return RowFactory.create(row.getInt(0), row.getString(1), row.getInt(2), xyz);
}
}, encoder);
StructType架构=新的StructType(新的StructField[]{
新建StructField(“id”,DataTypes.IntegerType,false,Metadata.empty()),
new StructField(“words”,DataTypes.StringType,false,Metadata.empty()),
新建StructField(“label”,DataTypes.IntegerType,false,Metadata.empty()),
新建StructField(“featuresNew”、DataTypes.createArrayType(DataTypes.StringType)、false、Metadata.empty()
});
expressionEncoderEncoder=RowEncoder.apply(模式);
数据集输出=input.map(新的MapFunction(){
@凌驾
公用行调用(行)引发异常{
List xyz=Arrays.asList(“Hi”、“how”、“now”);
返回RowFactory.create(row.getInt(0)、row.getString(1)、row.getInt(2)、xyz);
}
},编码器);
我得到了类似的错误信息：
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.RuntimeException: java.util.Arrays$ArrayList is not a valid external type for schema of array<string>
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(generated.java:221)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
线程“main”org.apache.spark.sparkeexception中的异常:作业因阶段失败而中止:阶段0.0中的任务0失败1次,最近的失败:阶段0.0中的任务0.0丢失(TID 0,localhost,executor driver):java.lang.RuntimeException:java.util.Arrays$ArrayList不是数组架构的有效外部类型
位于org.apache.spark.sql.catalyst.expressions.GeneratedClass$GenerateEditor.processNext(generated.java:221)
位于org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
位于org.apache.spark.sql.execution.whisttagecodegenexec$$anonfun$8$$anon$1.hasNext(whisttagecodegenexec.scala:377)
位于org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
位于org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
位于org.apache.spark.rdd.rdd$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(rdd.scala:826)
位于org.apache.spark.rdd.rdd$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(rdd.scala:826)
在org.apache.spark.rdd.MapPartitionsRDD.compute上(MapPartitionsRDD.scala:38)
在org.apache.spark.rdd.rdd.computeOrReadCheckpoint(rdd.scala:323)上
位于org.apache.spark.rdd.rdd.iterator(rdd.scala:287)
// Variant where the "featuresNew" column is declared as a plain string.
// NOTE(review): presumably shown as a workaround/contrast to the failing map and
// array variants; the accompanying explanation is missing from this page - confirm.
StructType schema = new StructType(new StructField[]{
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("words", DataTypes.StringType, false, Metadata.empty()),
new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("featuresNew", DataTypes.StringType, false, Metadata.empty())
});
// Row encoder derived from the schema; it is handed to Dataset.map below.
ExpressionEncoder<Row> encoder = RowEncoder.apply(schema);
Dataset<Row> output = input.map(new MapFunction<Row, Row>() {
@Override
public Row call(Row row) throws Exception {
// The list is flattened to its toString() form so the value is a String,
// matching the StringType column.
String xyz = Arrays.asList("Please", "work", "now").toString();
return RowFactory.create(row.getInt(0), row.getString(1), row.getInt(2), xyz);
}
}, encoder);
// Output schema: id, words, label, plus a non-nullable map<string,int> column "featuresNew".
StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("words", DataTypes.StringType, false, Metadata.empty()),
    new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("featuresNew", DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType), false, Metadata.empty())
});
// Row encoder derived from the schema; it is handed to Dataset.map below.
ExpressionEncoder<Row> encoder = RowEncoder.apply(schema);
Dataset<Row> output = input.map(new MapFunction<Row, Row>() {
    @Override
    public Row call(Row row) throws Exception {
        // Fixed: the original read `new HashMap<String,Integer();` (missing the
        // closing '>' of the type arguments), which does not compile.
        HashMap<String, Integer> newMap = new HashMap<String, Integer>();
        newMap.put("Transformed string", 1);
        // The map column receives a converted Scala map instead of the
        // java.util.HashMap, which the encoder rejects with
        // "java.util.HashMap is not a valid external type for schema of map<string,int>".
        // NOTE(review): ToScalaExample.toScalaMap is not defined on this page;
        // presumably it wraps JavaConverters.mapAsScalaMapConverter(...).asScala() - confirm.
        return RowFactory.create(row.getInt(0), row.getString(1), row.getInt(2), ToScalaExample.toScalaMap(newMap));
    }
}, encoder);
return output;
// Reading side: Row.getMap hands back a scala.collection.Map, not a java.util.Map.
scala.collection.Map<Object, Object> map = row.getMap(3);
// Writing side: convert a Java map to its Scala counterpart via JavaConverters.
// The converted value is discarded on this line; in real code it would be the
// value passed to RowFactory.create for the map column.
// NOTE(review): `newMap` is not declared in these two lines - presumably the
// java.util.Map built inside the map function; confirm.
JavaConverters.mapAsScalaMapConverter(newMap).asScala();