Scala: how to use a class like a variable


Is it possible to reference a different class on each iteration?

I have a large number of Hadoop Hive tables that I will be processing with Spark. Each table has an auto-generated class, and I would like to iterate over the tables rather than stick with the tedious, zero-reuse copy/paste/hand-edit-the-individual-table-class-names approach I started with:

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.parquet.avro.AvroReadSupport
import org.apache.parquet.hadoop.ParquetInputFormat

import myJavaProject.myTable0Class
import myJavaProject.myTable1Class

/* conf (a Hadoop Job) and sc (the SparkContext) are assumed to be defined elsewhere;
   the parquet imports assume the org.apache.parquet packages -- older releases used parquet.* */
object rawMaxValueSniffer extends Logging {
    /* tedious sequential:  it works, and sometimes a programmer's gotta do... */
    def tedious(args: Array[String]): Unit = {
        val tablePaths = List("path0_string_here","path1_string")
        var maxIds = ArrayBuffer[Long]()

        FileInputFormat.setInputPaths(conf, tablePaths(0))
        AvroReadSupport.setAvroReadSchema(conf.getConfiguration, myTable0Class.getClassSchema)
        ParquetInputFormat.setReadSupportClass(conf, classOf[AvroReadSupport[myTable0Class]])
        val records0 = sc.newAPIHadoopRDD(conf.getConfiguration,
            classOf[ParquetInputFormat[myTable0Class]],
            classOf[Void],
            classOf[myTable0Class]).map(x => x._2)
        maxIds += records0.map(_.getId).collect().max

        FileInputFormat.setInputPaths(conf, tablePaths(1))
        AvroReadSupport.setAvroReadSchema(conf.getConfiguration, myTable1Class.getClassSchema)
        ParquetInputFormat.setReadSupportClass(conf, classOf[AvroReadSupport[myTable1Class]])
        val records1 = sc.newAPIHadoopRDD(conf.getConfiguration,
            classOf[ParquetInputFormat[myTable1Class]],
            classOf[Void],
            classOf[myTable1Class]).map(x => x._2)
        maxIds += records1.map(_.getId).collect().max
    }

    /* class as variable, used in a loop.      I have seen the mountain... */
    def hopedFor(args: Array[String]): Unit = { 
        val tablePaths = List("path0_string_here","path1_string")
        var maxIds = ArrayBuffer[Long]()

        val tableClasses = List(classOf[myTable0Class],classOf[myTable1Class]) /* error free, but does not get me where I'm trying to go */
        var counter = 0
        tableClasses.foreach { tc =>
            FileInputFormat.setInputPaths(conf, tablePaths(counter))
            AvroReadSupport.setAvroReadSchema(conf.getConfiguration, tc.getClassSchema)  /* does not compile: Class[_] has no getClassSchema */
            ParquetInputFormat.setReadSupportClass(conf, classOf[AvroReadSupport[tc]])   /* does not compile: tc is a value, not a type */
            val records = sc.newAPIHadoopRDD(conf.getConfiguration,
                classOf[ParquetInputFormat[tc]],
                classOf[Void],
                classOf[tc]).map(x => x._2)
            maxIds += records.map(_.getId).collect().max     /* all the myTableXXX classes have getId() */
            counter += 1
        }
    }
}       

/* the classes being referenced... */
@org.apache.avro.specific.AvroGenerated
public class myTable0Class extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
  public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"rsivr_surveyquestiontypes\",\"namespace\":\"myJavaProject\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"},{\"name\":\"description\",\"type\":\"string\"},{\"name\":\"scale_range\",\"type\":\"int\"}]}");
  public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; }
  @Deprecated public int id;

  yada.yada.yada0
}

@org.apache.avro.specific.AvroGenerated
public class myTable1Class extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
  public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"rsivr_surveyresultdetails\",\"namespace\":\"myJavaProject\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"},{\"name\":\"survey_dts\",\"type\":\"string\"},{\"name\":\"survey_id\",\"type\":\"int\"},{\"name\":\"question\",\"type\":\"int\"},{\"name\":\"caller_id\",\"type\":\"string\"},{\"name\":\"rec_msg\",\"type\":\"string\"},{\"name\":\"note\",\"type\":\"string\"},{\"name\":\"lang\",\"type\":\"string\"},{\"name\":\"result\",\"type\":\"string\"}]}");
  public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; }
  @Deprecated public int id;

    yada.yada.yada1
}

Perhaps something like this:

def doStuff[T <: SpecificRecordBase : ClassTag](index: Int, schema: () => Schema, clazz: Class[T]) = {
    FileInputFormat.setInputPaths(conf, tablePaths(index))
    AvroReadSupport.setAvroReadSchema(conf.getConfiguration, schema())
    ParquetInputFormat.setReadSupportClass(conf, classOf[AvroReadSupport[T]])
    val records = sc.newAPIHadoopRDD(conf.getConfiguration,
        classOf[ParquetInputFormat[T]],
        classOf[Void],
        clazz).map(x => x._2)
    maxIds += records.map(_.getId).collect().max   /* getId isn't on SpecificRecordBase; see the comments below for the reflective fix */
}

Seq(
  (classOf[myTable0Class], myTable0Class.getClassSchema _),
  (classOf[myTable1Class], myTable1Class.getClassSchema _)
).zipWithIndex
.foreach { case ((clazz, schema), index) => doStuff(index, schema, clazz) }
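
The key point is that the hoped-for classOf[tc] can never work: classOf needs a type known at compile time, while tc is just a runtime value. Passing the Class[T] object as an ordinary parameter (plus a ClassTag so Spark can construct the RDD) moves that information into a value, which is what lets the per-table boilerplate collapse into a single parameterized method.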

You could use reflection to call getClassSchema (clazz.getMethod("getClassSchema").invoke(null).asInstanceOf[Schema]); then you wouldn't need to pass the schema as a parameter at all, clazz alone would be enough. But that's a bit of a cheat... I prefer the explicit approach.
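
For illustration, a minimal sketch of that reflective variant (schemaOf is a hypothetical helper name, not something from the answer above):

import org.apache.avro.Schema

/* Invokes the static getClassSchema() that Avro generates on every
   @AvroGenerated class; the receiver is null because the method is static. */
def schemaOf(clazz: Class[_]): Schema =
    clazz.getMethod("getClassSchema").invoke(null).asInstanceOf[Schema]

With that helper, the Seq would only need to carry the classes, and doStuff could call schemaOf(clazz) instead of taking a schema parameter.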

The map at the end of records gives two errors: "No ClassTag available for T" and "not enough arguments for method map: (implicit evidence$3: scala.reflect.ClassTag[T]) org.apache.spark.rdd.RDD[T]. Unspecified value parameter evidence$3."

OK, that should be fixed now (I added : ClassTag to the type parameter declaration); you will also need import scala.reflect.ClassTag.

To get SpecificRecordBase, Schema, and ClassTag working I needed three imports: import org.apache.avro.specific.SpecificRecordBase, import org.apache.avro.Schema, and import scala.reflect.ClassTag. One last change was needed: maxIds += records.map(z => clazz.getMethod("getId").invoke(z).asInstanceOf[Int]).collect().max
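
Pulling the comment-thread fixes together, a minimal sketch of the finished helper might look like this (conf, sc, tablePaths, and maxIds are assumed to be in scope exactly as in the question):

import org.apache.avro.Schema
import org.apache.avro.specific.SpecificRecordBase
import scala.reflect.ClassTag

def doStuff[T <: SpecificRecordBase : ClassTag](index: Int, schema: () => Schema, clazz: Class[T]): Unit = {
    FileInputFormat.setInputPaths(conf, tablePaths(index))
    AvroReadSupport.setAvroReadSchema(conf.getConfiguration, schema())
    ParquetInputFormat.setReadSupportClass(conf, classOf[AvroReadSupport[T]])
    val records = sc.newAPIHadoopRDD(conf.getConfiguration,
        classOf[ParquetInputFormat[T]],
        classOf[Void],
        clazz).map(_._2)
    /* T is only statically known to be a SpecificRecordBase, so getId is invoked reflectively */
    maxIds += records.map(z => clazz.getMethod("getId").invoke(z).asInstanceOf[Int]).collect().max
}

Seq(
    (classOf[myTable0Class], myTable0Class.getClassSchema _),
    (classOf[myTable1Class], myTable1Class.getClassSchema _)
).zipWithIndex
 .foreach { case ((clazz, schema), index) => doStuff(index, schema, clazz) }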