
Java Spark: filtering failed records from a Dataset and writing them to S3 fails with an error


I have an incoming Dataset<Row>. Each row is used to make an API call, and if the requested resource is not found in the API I want to catch the exception and collect all the failed rows.

Here is the code:

```java
Dataset<FailedModel> failedDataset = dataset.map(
    (MapFunction<Row, FailedModel>) row -> {
        FailedModel failedModel = new FailedModel();
        try {
            // per-row API call (elided in the original post)
        } catch (Exception e) {
            // resource not found or other failure: keep the row and the root cause
            failedModel.setRow(row);
            failedModel.setFailureRootCause(e.getMessage());
        }
        return failedModel;
    }, Encoders.bean(FailedModel.class))
    .filter(Objects::nonNull)
    .persist(StorageLevel.MEMORY_ONLY());
```
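
Note (my observation, not part of the original post): the map above returns a FailedModel for every input row, so filter(Objects::nonNull) keeps everything; successfully processed rows flow through as empty FailedModels and are only weeded out later by the getFailureRootCause() checks. A minimal sketch of dropping the successful rows up front instead:

```java
import org.apache.spark.api.java.function.FilterFunction;

// Keep only models whose API call actually failed; successful rows
// never had setFailureRootCause(...) called, so the field is null.
Dataset<FailedModel> onlyFailures = failedDataset
    .filter((FilterFunction<FailedModel>) m -> m.getFailureRootCause() != null);
```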
Now I want to write the failed records to S3:

```java
long failedRecs = failedDataset.distinct()
    .filter(x -> StringUtils.isNotBlank(x.getFailureRootCause()) && x.getRow() != null)
    .count();

Dataset<Row> failedRowsDf = failedDataset.filter(
        x -> x != null && x.getRow() != null && !StringUtils.isBlank(x.getFailureRootCause()))
    .map((MapFunction<FailedModel, Row>) value -> value.getRow(), Encoders.bean(Row.class))
    .persist(StorageLevel.MEMORY_ONLY());

Dataset<Row> failedRecordsDF = session.sqlContext()
    .createDataFrame(failedRowsDf.rdd(), getSchema())
    .persist(StorageLevel.MEMORY_ONLY());
failedRecordsDF.na().drop().write().mode(SaveMode.Overwrite).save("s3://databucket/failedRecords/");

Dataset<String> rootcauseDF = failedDataset.filter(
        x -> x != null && x.getRow() != null && !StringUtils.isBlank(x.getFailureRootCause()))
    .map((MapFunction<FailedModel, String>) value -> value.getFailureRootCause(), Encoders.STRING())
    .persist(StorageLevel.MEMORY_ONLY());

rootcauseDF.na().drop().write().mode(SaveMode.Overwrite).csv("s3://databucket/reports_rootcause/");
```

This fails with:

```
ERROR org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 152, Column 8: Cannot instantiate "org.apache.spark.sql.Row"
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 152, Column 8: Cannot instantiate "org.apache.spark.sql.Row"
```
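
The failing call appears to be the .map(..., Encoders.bean(Row.class)) step: Row is an interface, so Spark's generated code cannot instantiate it as a JavaBean. The usual workaround is to supply an explicit row encoder built from the schema. A minimal sketch, assuming a Spark version where org.apache.spark.sql.catalyst.encoders.RowEncoder.apply(StructType) is callable from Java (Spark 3.5+ also exposes Encoders.row(schema)):

```java
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;

// Encode the extracted rows with an explicit schema instead of Encoders.bean(Row.class).
Dataset<Row> failedRowsDf = failedDataset
    .filter((FilterFunction<FailedModel>) x ->
        x != null && x.getRow() != null && StringUtils.isNotBlank(x.getFailureRootCause()))
    .map((MapFunction<FailedModel, Row>) FailedModel::getRow, RowEncoder.apply(getSchema()))
    .persist(StorageLevel.MEMORY_ONLY());

// The dataset now carries the schema itself, so the extra
// createDataFrame(failedRowsDf.rdd(), getSchema()) round-trip is no longer needed.
failedRowsDf.na().drop().write().mode(SaveMode.Overwrite).save("s3://databucket/failedRecords/");
```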



```java
private StructType getSchema() {
    return new StructType()
        .add("Person_Name", "string", true)
        .add("Person_Age", "string", true);
}
```


```java
public class FailedModel implements Serializable {

  private Row row;
  private String failureRootCause;

  public void setRow(Row row) {
    this.row = row;
  }

  public Row getRow() {
    return row;
  }

  public String getFailureRootCause() {
    return failureRootCause;
  }

  public void setFailureRootCause(String failureRootCause) {
    this.failureRootCause = failureRootCause;
  }
}
```
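
A related caveat (my reading, not confirmed by the original post): Encoders.bean(FailedModel.class) must also encode the Row field inside the bean, and Row is not a property type the bean encoder can instantiate, so the first map can trip over the same limitation. One hypothetical way around it is to keep only plain String fields in the bean, serializing the row before it leaves the mapper:

```java
import java.io.Serializable;

public class FailedModel implements Serializable {

  // Hypothetical variant: hold the failed row as a String rather than a Row,
  // so the bean encoder only sees field types it can instantiate.
  // Populate with row.json() (Spark 3.x) or row.mkString(",").
  private String rowData;
  private String failureRootCause;

  public String getRowData() { return rowData; }
  public void setRowData(String rowData) { this.rowData = rowData; }

  public String getFailureRootCause() { return failureRootCause; }
  public void setFailureRootCause(String failureRootCause) { this.failureRootCause = failureRootCause; }
}
```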