
Java: How to read Avro files in Spark using newAPIHadoopFile?


I am trying to read an Avro file in a Spark job. My Spark version is 1.6.0 (spark-core_2.10-1.6.0-cdh5.7.1).

Here is my Java code:

JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("ReadAvro"));
JavaPairRDD<NullWritable, Text> lines = sc.newAPIHadoopFile(args[0], AvroKeyValueInputFormat.class, AvroKey.class, AvroValue.class, new Configuration());
But I get a compile-time error:

The method newAPIHadoopFile(String, Class<F>, Class<K>, Class<V>, Configuration) in the type JavaSparkContext is not applicable for the arguments (String, Class<AvroKeyValueInputFormat>, Class<AvroKey>, Class<AvroValue>, Configuration)

So what is the correct way to use JavaSparkContext.newAPIHadoopFile() from Java?
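
For context (this explanation and sketch are editorial, not from the original thread): newAPIHadoopFile is declared with the bound <K, V, F extends InputFormat<K, V>>, and the raw class literals AvroKeyValueInputFormat.class, AvroKey.class and AvroValue.class do not satisfy that parameterized bound, which is what the compiler is complaining about. One common workaround is to cast the class tokens to the parameterized types you expect; a minimal sketch, assuming CharSequence-typed keys and values:

import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroKeyValueInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReadAvroSketch {
  @SuppressWarnings("unchecked")
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("ReadAvro"));
    // Double-cast the raw class literals so the inferred K, V and F line up
    // with the bound <K, V, F extends InputFormat<K, V>>; the casts are
    // unchecked but resolve the "not applicable for the arguments" error.
    JavaPairRDD<AvroKey<CharSequence>, AvroValue<CharSequence>> lines =
        sc.newAPIHadoopFile(
            args[0],
            (Class<AvroKeyValueInputFormat<CharSequence, CharSequence>>)
                (Class<?>) AvroKeyValueInputFormat.class,
            (Class<AvroKey<CharSequence>>) (Class<?>) AvroKey.class,
            (Class<AvroValue<CharSequence>>) (Class<?>) AvroValue.class,
            new Configuration());
    System.out.println("records: " + lines.count());
    sc.stop();
  }
}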

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class Utils {

  // Unwraps each AvroKey's datum and pairs up the record's "key" and "value" fields.
  public static <T> JavaPairRDD<String, T> loadAvroFile(JavaSparkContext sc, String avroPath) {
    JavaPairRDD<AvroKey, NullWritable> records = sc.newAPIHadoopFile(avroPath, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, sc.hadoopConfiguration());
    return records.keys()
        .map(x -> (GenericRecord) x.datum())
        .mapToPair(pair -> new Tuple2<>((String) pair.get("key"), (T) pair.get("value")));
  }
}
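
Note that loadAvroFile assumes every record carries fields literally named "key" and "value". A minimal sketch of a record matching that shape (the schema here is hypothetical, not from the thread):

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

public class KvRecordExample {
  public static void main(String[] args) {
    // Hypothetical schema with string fields named "key" and "value",
    // i.e. the shape loadAvroFile's mapToPair step expects.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"KV\",\"fields\":["
        + "{\"name\":\"key\",\"type\":\"string\"},"
        + "{\"name\":\"value\",\"type\":\"string\"}]}");
    GenericRecord record = new GenericData.Record(schema);
    record.put("key", "k1");
    record.put("value", "v1");
    System.out.println(record);
  }
}

Also be aware that generic Avro decoding returns string fields as org.apache.avro.util.Utf8 by default, so the (String) cast may need to become pair.get("key").toString() at runtime.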
Use the utility like this:

JavaPairRDD<String, YourAvroClassName> records = Utils.<YourAvroClassName>loadAvroFile(sc, inputDir);
You may also need to use the KryoSerializer and register a custom KryoRegistrator:

sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
sparkConf.set("spark.kryo.registrator", "com.test.avro.MyKryoRegistrator");

import java.util.ArrayList;
import java.util.Collection;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.serializers.CollectionSerializer;
import org.apache.avro.generic.GenericData;
import org.apache.spark.serializer.KryoRegistrator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MyKryoRegistrator implements KryoRegistrator {

  public static class SpecificInstanceCollectionSerializer<T extends Collection> extends CollectionSerializer {
    Class<T> type;
    public SpecificInstanceCollectionSerializer(Class<T> type) {
      this.type = type;
    }

    @Override
    protected Collection create(Kryo kryo, Input input, Class<Collection> type) {
      return kryo.newInstance(this.type);
    }

    @Override
    protected Collection createCopy(Kryo kryo, Collection original) {
      return kryo.newInstance(this.type);
    }
  }

  Logger logger = LoggerFactory.getLogger(this.getClass());

  @Override
  public void registerClasses(Kryo kryo) {
    // Avro POJOs contain java.util.List which have GenericData.Array as their runtime type
    // because Kryo is not able to serialize them properly, we use this serializer for them
    kryo.register(GenericData.Array.class, new SpecificInstanceCollectionSerializer<>(ArrayList.class));
    kryo.register(YourAvroClassName.class);
  }
}
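
Putting the pieces together, both Kryo settings have to be on the SparkConf before the JavaSparkContext is constructed; a minimal sketch (the package com.test.avro and the YourAvroClassName placeholder are taken from the snippets above):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReadAvroApp {
  public static void main(String[] args) {
    // Serializer settings must be applied before the context is created.
    SparkConf sparkConf = new SparkConf()
        .setAppName("ReadAvro")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .set("spark.kryo.registrator", "com.test.avro.MyKryoRegistrator");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // YourAvroClassName is the thread's placeholder for your generated Avro class.
    JavaPairRDD<String, YourAvroClassName> records =
        Utils.<YourAvroClassName>loadAvroFile(sc, args[0]);
    System.out.println(records.count());
    sc.stop();
  }
}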
Hope this helps.


I still get the same compile-time error in the Utils class.
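
If the Utils version still fails to compile, as this comment reports, the class-token casts shown earlier can be applied inside the helper as well; a hedged sketch (loadAvroRecords is a hypothetical name, and GenericRecord is assumed as the datum type):

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class UtilsWithCasts {

  @SuppressWarnings("unchecked")
  public static JavaPairRDD<AvroKey<GenericRecord>, NullWritable> loadAvroRecords(
      JavaSparkContext sc, String avroPath) {
    // Cast the raw class literals so the inferred K, V and F satisfy
    // the bound <K, V, F extends InputFormat<K, V>> on newAPIHadoopFile.
    return sc.newAPIHadoopFile(
        avroPath,
        (Class<AvroKeyInputFormat<GenericRecord>>) (Class<?>) AvroKeyInputFormat.class,
        (Class<AvroKey<GenericRecord>>) (Class<?>) AvroKey.class,
        NullWritable.class,
        sc.hadoopConfiguration());
  }
}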