Error encountered while extending a Scala class with the Product interface to overcome the 22-field limit in the Spark shell
I need to create a class schema that supports 29 fields. Since case classes are limited to 22 fields, I tried extending my class sdp_d with the Product interface, as follows:
class sdp_d(
  WID :Option[Int], BATCH_ID :Option[Int], SRC_ID :Option[String], ORG_ID :Option[Int],
  CLASS_WID :Option[Int], DESC_TEXT :Option[String], PREMISE_WID :Option[Int], FEED_LOC :Option[String],
  GPS_LAT :Option[Double], GPS_LONG :Option[Double], PULSE_OUTPUT_BLOCK :Option[String], UDC_ID :Option[String],
  UNIVERSAL_ID :Option[String], IS_VIRTUAL_FLG :Option[String], SEAL_INFO :Option[String], ACCESS_INFO :Option[String],
  ALT_ACCESS_INFO :Option[String], LOC_INFO :Option[String], ALT_LOC_INFO :Option[String], TYPE :Option[String],
  SUB_TYPE :Option[String], TIMEZONE_ID :Option[Int], GIS_ID :Option[String], BILLED_UPTO_TIME :Option[java.sql.Timestamp],
  POWER_STATUS :Option[String], LOAD_STATUS :Option[String], BILLING_HOLD_STATUS :Option[String],
  INSERT_TIME :Option[java.sql.Timestamp], LAST_UPD_TIME :Option[java.sql.Timestamp]) extends Product {
@throws(classOf[IndexOutOfBoundsException])
override def productElement(n: Int) = n match
{
case 0 => WID
case 1 => BATCH_ID
case 2 => SRC_ID
case 3 => ORG_ID
case 4 => CLASS_WID
case 5 => DESC_TEXT
case 6 => PREMISE_WID
case 7 => FEED_LOC
case 8 => GPS_LAT
case 9 => GPS_LONG
case 10 => PULSE_OUTPUT_BLOCK
case 11 => UDC_ID
case 12 => UNIVERSAL_ID
case 13 => IS_VIRTUAL_FLG
case 14 => SEAL_INFO
case 15 => ACCESS_INFO
case 16 => ALT_ACCESS_INFO
case 17 => LOC_INFO
case 18 => ALT_LOC_INFO
case 19 => TYPE
case 20 => SUB_TYPE
case 21 => TIMEZONE_ID
case 22 => GIS_ID
case 23 => BILLED_UPTO_TIME
case 24 => POWER_STATUS
case 25 => LOAD_STATUS
case 26 => BILLING_HOLD_STATUS
case 27 => INSERT_TIME
case 28 => LAST_UPD_TIME
case _ => throw new IndexOutOfBoundsException(n.toString())
}
override def productArity: Int = 29
override def canEqual(that: Any): Boolean = that.isInstanceOf[sdp_d]
}
This defines the sdp_d class. However, when I try to load CSV data into it using this predefined schema and register it as a table, I get an error:
> scala> import java.text.SimpleDateFormat; val sdf = new SimpleDateFormat("yyyy-mm-dd hh:mm:ss.S"); import java.util.Calendar; import java.util.Date; val calendar = Calendar.getInstance()
import java.text.SimpleDateFormat
sdf: java.text.SimpleDateFormat = java.text.SimpleDateFormat@cce61785
import java.util.Calendar
import java.util.Date
calendar: java.util.Calendar = java.util.GregorianCalendar[time=1424687963209,areFieldsSet=true,areAllFieldsSet=true,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Asia/Kolkata",offset=19800000,dstSavings=0,useDaylight=false,transitions=6,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=1,YEAR=2015,MONTH=1,WEEK_OF_YEAR=9,WEEK_OF_MONTH=4,DAY_OF_MONTH=23,DAY_OF_YEAR=54,DAY_OF_WEEK=2,DAY_OF_WEEK_IN_MONTH=4,AM_PM=1,HOUR=4,HOUR_OF_DAY=16,MINUTE=9,SECOND=23,MILLISECOND=209,ZONE_OFFSET=19800000,DST_OFFSET=0]
> scala> sc.textFile("hdfs://CDH-Master-1.cdhcluster/user/spark/Sdp_d.csv").map(_.split(",")).map { r =>
| val upto_time = sdf.parse(r(23).trim);
| calendar.setTime(upto_time);
| val r23 = new java.sql.Timestamp(upto_time.getTime);
|
| val insert_time = sdf.parse(r(26).trim);
| calendar.setTime(insert_time);
| val r26 = new java.sql.Timestamp(insert_time.getTime);
|
| val last_upd_time = sdf.parse(r(27).trim);
| calendar.setTime(last_upd_time);
| val r27 = new java.sql.Timestamp(last_upd_time.getTime);
|
| sdp_d(r(0).trim.toInt, r(1).trim.toInt, r(2).trim, r(3).trim.toInt, r(4).trim.toInt, r(5).trim, r(6).trim.toInt, r(7).trim, r(8).trim.toDouble, r(9).trim.toDouble, r(10).trim, r(11).trim, r(12).trim, r(13).trim, r(14).trim, r(15).trim, r(16).trim, r(17).trim, r(18).trim, r(19).trim, r(20).trim, r(21).trim.toInt, r(22).trim, r23, r(24).trim, r(25).trim, r26, r27, r(28).trim)
| }.registerAsTable("sdp")
<console>:36: error: not found: value sdp_d
sdp_d(r(0).trim.toInt, r(1).trim.toInt, r(2).trim, r(3).trim.toInt, r(4).trim.toInt, r(5).trim, r(6).trim.toInt, r(7).trim, r(8).trim.toDouble, r(9).trim.toDouble, r(10).trim, r(11).trim, r(12).trim, r(13).trim, r(14).trim, r(15).trim, r(16).trim, r(17).trim, r(18).trim, r(19).trim, r(20).trim, r(21).trim.toInt, r(22).trim, r23, r(24).trim, r(25).trim, r26, r27, r(28).trim)
^
I am working in the Spark shell, with Spark version 1.1.0 and Scala version 2.10.4.
I don't understand why I get the error not found: value sdp_d.
How am I supposed to register the table when I define my own class extending the Product interface?
Please help me resolve this error.

The error appears because, unlike a case class, a plain class has no auto-generated companion object with an apply method, so the call sdp_d(...) does not resolve to anything. You just need to instantiate the class with new:
new sdp_d(r(0).trim.toInt, r(1).trim.toInt, ...
Also, since every parameter of sdp_d is declared as an Option, rather than calling
new sdp_d(…)
with the raw parsed values, you can wrap each value so that unparsable input simply becomes None:
new sdp_d(Try(r(0).trim.toInt).toOption, Try(r(1).trim.toInt).toOption, Option(r(2).trim), …)
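Putting the two points together, here is a minimal sketch of what the corrected shell session could look like. It is only an illustration: it assumes the sdp_d class above is already defined in the same shell session, that the CSV columns are laid out in the same order as the constructor parameters (so the three timestamps sit in columns 23, 27 and 28; the original snippet read them from columns 23, 26 and 27, so adjust the indices to your file), and that sqlContext is the Spark 1.1 SQLContext whose createSchemaRDD implicit turns an RDD of Products into a queryable SchemaRDD.

import scala.util.Try
import sqlContext.createSchemaRDD   // implicit RDD[A <: Product] => SchemaRDD conversion in Spark 1.x

// "yyyy-MM-dd HH:mm:ss.S" is probably the intended pattern; lowercase "mm" means minutes, not months.
val sdf = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S")
def ts(s: String): Option[java.sql.Timestamp] =
  Try(new java.sql.Timestamp(sdf.parse(s.trim).getTime)).toOption

val sdp = sc.textFile("hdfs://CDH-Master-1.cdhcluster/user/spark/Sdp_d.csv")
  .map(_.split(","))
  .map { r =>
    new sdp_d(
      Try(r(0).trim.toInt).toOption, Try(r(1).trim.toInt).toOption, Option(r(2).trim),
      Try(r(3).trim.toInt).toOption, Try(r(4).trim.toInt).toOption, Option(r(5).trim),
      Try(r(6).trim.toInt).toOption, Option(r(7).trim),
      Try(r(8).trim.toDouble).toOption, Try(r(9).trim.toDouble).toOption,
      Option(r(10).trim), Option(r(11).trim), Option(r(12).trim), Option(r(13).trim),
      Option(r(14).trim), Option(r(15).trim), Option(r(16).trim), Option(r(17).trim),
      Option(r(18).trim), Option(r(19).trim), Option(r(20).trim),
      Try(r(21).trim.toInt).toOption, Option(r(22).trim),
      ts(r(23)),                                        // BILLED_UPTO_TIME
      Option(r(24).trim), Option(r(25).trim), Option(r(26).trim),
      ts(r(27)), ts(r(28)))                             // INSERT_TIME, LAST_UPD_TIME
  }
sdp.registerTempTable("sdp")   // registerAsTable on older 1.x releases

The AirTraffic example below applies the same pattern to another 29-field class.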
//AirTraffic.scala
class AirTraffic(Year:Option[Int], Month:Option[Int], DayOfMonth:Option[Int], DayOfWeek:Option[Int],
DepTime:Option[Int], CRSDepTime:Option[Int], ArrTime:Option[Int], CRSArrTime:Option[Int],
UniqueCarrier:String, FlightNum:Option[Int], TailNum:String, ActualElapsedTime:Option[Int],
CRSElapsedTime:Option[Int], AirTime:Option[Int], ArrDelay:Option[Int], DepDelay:Option[Int],
Origin:String, Dest:String, Distance:Option[Int], TaxiIn:Option[Int], TaxiOut:Option[Int],
Cancelled:Option[Boolean], CancellationCode:String, Diverted:Option[Boolean], CarrierDelay:Option[Int],
WeatherDelay:Option[Int], NASDelay:Option[Int], SecurityDelay:Option[Int], LateAircraftDelay:Option[Int]) extends Product {
// Fields are declared as Option[T] so the corresponding columns are nullable.
override def productElement(n: Int): Any =
n match {
case 0 => Year
case 1 => Month
case 2 => DayOfMonth
case 3 => DayOfWeek
case 4 => DepTime
case 5 => CRSDepTime
case 6 => ArrTime
case 7 => CRSArrTime
case 8 => UniqueCarrier
case 9 => FlightNum
case 10 => TailNum
case 11 => ActualElapsedTime
case 12 => CRSElapsedTime
case 13 => AirTime
case 14 => ArrDelay
case 15 => DepDelay
case 16 => Origin
case 17 => Dest
case 18 => Distance
case 19 => TaxiIn
case 20 => TaxiOut
case 21 => Cancelled
case 22 => CancellationCode
case 23 => Diverted
case 24 => CarrierDelay
case 25 => WeatherDelay
case 26 => NASDelay
case 27 => SecurityDelay
case 28 => LateAircraftDelay
case _ => throw new IndexOutOfBoundsException(n.toString)
}
override def productArity: Int = 29
override def canEqual(that: Any): Boolean = that.isInstanceOf[AirTraffic]
}
//main.scala
// Assumes a SparkContext named sparkContext and a SQLContext named sqlContext already exist.
import scala.util.Try
import sqlContext.implicits._   // enables .toDF() on an RDD of Products (Spark 1.3+)

val data = sparkContext.textFile("local-input/AIRLINE/2008.csv").map(_.split(","))
.map(l => new AirTraffic(Try(l(0).trim.toInt).toOption, Try(l(1).trim.toInt).toOption, Try(l(2).trim.toInt).toOption, Try(l(3).trim.toInt).toOption,
Try(l(4).trim.toInt).toOption, Try(l(5).trim.toInt).toOption, Try(l(6).trim.toInt).toOption, Try(l(7).trim.toInt).toOption,
l(8).trim, Try(l(9).trim.toInt).toOption, l(10).trim, Try(l(11).trim.toInt).toOption,
Try(l(12).trim.toInt).toOption, Try(l(13).trim.toInt).toOption, Try(l(14).trim.toInt).toOption, Try(l(15).trim.toInt).toOption,
l(16).trim, l(17).trim, Try(l(18).trim.toInt).toOption, Try(l(19).trim.toInt).toOption, Try(l(20).trim.toInt).toOption,
Try(l(21).trim.toBoolean).toOption, l(22).trim, Try(l(23).trim.toBoolean).toOption, Try(l(24).trim.toInt).toOption,
Try(l(25).trim.toInt).toOption, Try(l(26).trim.toInt).toOption, Try(l(27).trim.toInt).toOption, Try(l(28).trim.toInt).toOption)).toDF()
// register table with SQLContext
data.registerTempTable("AirTraffic")
val count = sqlContext.sql("SELECT COUNT(*) FROM AirTraffic").collect()
count.foreach(print)
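The repeated Try(...).toOption calls are what make this loader tolerant of dirty input: a cell that fails to parse becomes None instead of throwing a NumberFormatException partway through the job, and each Option[T] field shows up as a nullable column (a NULL value) in the registered table.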
If you think this is still ugly, we can go a bit further with an implicit helper:
implicit class StringConverter(val s: String) extends AnyVal {
  def tryGetInt = Try(s.trim.toInt).toOption
  def tryGetString = {
    val res = s.trim
    if (res.isEmpty) None else Some(res)   // wrap in Some so the result is an Option[String]
  }
  def tryGetBoolean = Try(s.trim.toBoolean).toOption
}
Then:
val data = sparkContext.textFile("local-input/AIRLINE/2008.csv").map(_.split(","))
.map(l => new AirTraffic(l(0).tryGetInt, l(1).tryGetInt, l(2).tryGetInt, l(3).tryGetInt,
l(4).tryGetInt, l(5).tryGetInt, l(6).tryGetInt, l(7).tryGetInt,
l(8).trim, l(9).tryGetInt, l(10).trim, l(11).tryGetInt,
l(12).tryGetInt, l(13).tryGetInt, l(14).tryGetInt, l(15).tryGetInt,
l(16).trim, l(17).trim, l(18).tryGetInt, l(19).tryGetInt, l(20).tryGetInt,
l(21).tryGetBoolean, l(22).trim, l(23).tryGetBoolean, l(24).tryGetInt,
l(25).tryGetInt, l(26).tryGetInt, l(27).tryGetInt, l(28).tryGetInt)).toDF()