Error when extending a Scala class with the Product interface to overcome the 22-field limit in the Spark shell


I need to create a class schema that supports 29 fields. Since case classes are limited to 22 fields, I tried extending my class "sdp_d" with the Product interface, as shown below:

class sdp_d( WID :Option[Int], BATCH_ID :Option[Int], SRC_ID :Option[String], ORG_ID :Option[Int], CLASS_WID :Option[Int],
             DESC_TEXT :Option[String], PREMISE_WID :Option[Int], FEED_LOC :Option[String], GPS_LAT :Option[Double], GPS_LONG :Option[Double],
             PULSE_OUTPUT_BLOCK :Option[String], UDC_ID :Option[String], UNIVERSAL_ID :Option[String], IS_VIRTUAL_FLG :Option[String], SEAL_INFO :Option[String],
             ACCESS_INFO :Option[String], ALT_ACCESS_INFO :Option[String], LOC_INFO :Option[String], ALT_LOC_INFO :Option[String], TYPE :Option[String],
             SUB_TYPE :Option[String], TIMEZONE_ID :Option[Int], GIS_ID :Option[String], BILLED_UPTO_TIME :Option[java.sql.Timestamp], POWER_STATUS :Option[String],
             LOAD_STATUS :Option[String], BILLING_HOLD_STATUS :Option[String], INSERT_TIME :Option[java.sql.Timestamp], LAST_UPD_TIME :Option[java.sql.Timestamp]) extends Product {

@throws(classOf[IndexOutOfBoundsException])
override def productElement(n: Int): Any = n match {
    case 0 => WID
    case 1 => BATCH_ID
    case 2 => SRC_ID
    case 3 => ORG_ID
    case 4 => CLASS_WID
    case 5 => DESC_TEXT
    case 6 => PREMISE_WID
    case 7 => FEED_LOC
    case 8 => GPS_LAT
    case 9 => GPS_LONG
    case 10 => PULSE_OUTPUT_BLOCK
    case 11 => UDC_ID
    case 12 => UNIVERSAL_ID
    case 13 => IS_VIRTUAL_FLG
    case 14 => SEAL_INFO
    case 15 => ACCESS_INFO
    case 16 => ALT_ACCESS_INFO
    case 17 => LOC_INFO
    case 18 => ALT_LOC_INFO
    case 19 => TYPE
    case 20 => SUB_TYPE
    case 21 => TIMEZONE_ID
    case 22 => GIS_ID
    case 23 => BILLED_UPTO_TIME
    case 24 => POWER_STATUS
    case 25 => LOAD_STATUS
    case 26 => BILLING_HOLD_STATUS
    case 27 => INSERT_TIME
    case 28 => LAST_UPD_TIME
    case _ => throw new IndexOutOfBoundsException(n.toString())
}

override def productArity: Int = 29
override def canEqual(that: Any): Boolean = that.isInstanceOf[sdp_d]
}

This defines the class sdp_d. However, when I try to load CSV data using this predefined schema and register it as a table, I get an error:

> scala> import java.text.SimpleDateFormat; val sdf = new SimpleDateFormat("yyyy-mm-dd hh:mm:ss.S"); import java.util.Calendar; import java.util.Date; val calendar = Calendar.getInstance()
import java.text.SimpleDateFormat
sdf: java.text.SimpleDateFormat = java.text.SimpleDateFormat@cce61785
import java.util.Calendar
import java.util.Date
calendar: java.util.Calendar = java.util.GregorianCalendar[time=1424687963209,areFieldsSet=true,areAllFieldsSet=true,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Asia/Kolkata",offset=19800000,dstSavings=0,useDaylight=false,transitions=6,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=1,YEAR=2015,MONTH=1,WEEK_OF_YEAR=9,WEEK_OF_MONTH=4,DAY_OF_MONTH=23,DAY_OF_YEAR=54,DAY_OF_WEEK=2,DAY_OF_WEEK_IN_MONTH=4,AM_PM=1,HOUR=4,HOUR_OF_DAY=16,MINUTE=9,SECOND=23,MILLISECOND=209,ZONE_OFFSET=19800000,DST_OFFSET=0]

    > scala> sc.textFile("hdfs://CDH-Master-1.cdhcluster/user/spark/Sdp_d.csv").map(_.split(",")).map { r =>
         | val upto_time = sdf.parse(r(23).trim);
         | calendar.setTime(upto_time); 
         | val r23 = new java.sql.Timestamp(upto_time.getTime); 
         | 
         | val insert_time = sdf.parse(r(26).trim); 
         | calendar.setTime(insert_time); 
         | val r26 = new java.sql.Timestamp(insert_time.getTime); 
         | 
         | val last_upd_time = sdf.parse(r(27).trim);
         | calendar.setTime(last_upd_time); 
         | val r27 = new java.sql.Timestamp(last_upd_time.getTime); 
         | 
         | sdp_d(r(0).trim.toInt, r(1).trim.toInt, r(2).trim, r(3).trim.toInt, r(4).trim.toInt, r(5).trim, r(6).trim.toInt, r(7).trim, r(8).trim.toDouble, r(9).trim.toDouble, r(10).trim, r(11).trim, r(12).trim, r(13).trim, r(14).trim, r(15).trim, r(16).trim, r(17).trim, r(18).trim, r(19).trim, r(20).trim, r(21).trim.toInt, r(22).trim, r23, r(24).trim, r(25).trim, r26, r27, r(28).trim)
         | }.registerAsTable("sdp")
    <console>:36: error: not found: value sdp_d
                  sdp_d(r(0).trim.toInt, r(1).trim.toInt, r(2).trim, r(3).trim.toInt, r(4).trim.toInt, r(5).trim, r(6).trim.toInt, r(7).trim, r(8).trim.toDouble, r(9).trim.toDouble, r(10).trim, r(11).trim, r(12).trim, r(13).trim, r(14).trim, r(15).trim, r(16).trim, r(17).trim, r(18).trim, r(19).trim, r(20).trim, r(21).trim.toInt, r(22).trim, r23, r(24).trim, r(25).trim, r26, r27, r(28).trim)
                  ^

I am working in the spark-shell, with Spark version 1.1.0 and Scala version 2.10.4.

I don't understand why I am getting the error: not found: value sdp_d.

How am I supposed to register it as a table when I create my own class that extends the Product interface?


Please help me resolve this error.

You just need to instantiate the class with new:

new sdp_d(r(0).trim.toInt, r(1).trim.toInt, ...
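
For context: unlike a case class, a plain class gets no compiler-generated companion apply method, so calling sdp_d(...) without new resolves to nothing, which is exactly the "not found: value sdp_d" error. A minimal sketch (with hypothetical class names, not from the original post) illustrating the difference:

    class Plain(x: Int)
    case class Cased(x: Int)

    Cased(1)      // compiles: the compiler generates a companion object with an apply method
    // Plain(1)   // error: not found: value Plain -- no companion apply is generated
    new Plain(1)  // a plain class must be constructed with `new`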

Did you happen to take a look at ...?

You can:

  • Instantiate it with the new keyword:
    new sdp_d(…)
  • Since you declared the fields as Option[T] (for example Option[Int]), you need to pass Option[T] values as arguments (Some or None):
    new sdp_d(Try(r(0).trim.toInt).toOption, Try(r(1).trim.toInt).toOption, Option(r(2).trim), …)
  • This works for me:

    //AirTraffic.scala
            class AirTraffic(Year:Option[Int], Month:Option[Int], DayOfMonth:Option[Int], DayOfWeek:Option[Int],
                         DepTime:Option[Int], CRSDepTime:Option[Int], ArrTime:Option[Int], CRSArrTime:Option[Int],
                         UniqueCarrier:String, FlightNum:Option[Int], TailNum:String, ActualElapsedTime:Option[Int],
                         CRSElapsedTime:Option[Int], AirTime:Option[Int], ArrDelay:Option[Int], DepDelay:Option[Int],
                         Origin:String, Dest:String, Distance:Option[Int], TaxiIn:Option[Int], TaxiOut:Option[Int],
                         Cancelled:Option[Boolean], CancellationCode:String, Diverted:Option[Boolean], CarrierDelay:Option[Int],
                         WeatherDelay:Option[Int], NASDelay:Option[Int], SecurityDelay:Option[Int], LateAircraftDelay:Option[Int]) extends Product {
    
          // We declare field with Option[T] type to make that field null-able.
    
          override def productElement(n: Int): Any =
            n match {
              case 0 => Year
              case 1 => Month
              case 2 => DayOfMonth
              case 3 => DayOfWeek
              case 4 => DepTime
              case 5 => CRSDepTime
              case 6 => ArrTime
              case 7 => CRSArrTime
              case 8 => UniqueCarrier
              case 9 => FlightNum
              case 10 => TailNum
              case 11 => ActualElapsedTime
              case 12 => CRSElapsedTime
              case 13 => AirTime
              case 14 => ArrDelay
              case 15 => DepDelay
              case 16 => Origin
              case 17 => Dest
              case 18 => Distance
              case 19 => TaxiIn
              case 20 => TaxiOut
              case 21 => Cancelled
              case 22 => CancellationCode
              case 23 => Diverted
              case 24 => CarrierDelay
              case 25 => WeatherDelay
              case 26 => NASDelay
              case 27 => SecurityDelay
              case 28 => LateAircraftDelay
              case _ => throw new IndexOutOfBoundsException(n.toString)
            }
    
          override def productArity: Int = 29
    
          override def canEqual(that: Any): Boolean = that.isInstanceOf[AirTraffic]
        }
    
    //main.scala
        import scala.util.Try               // Try(...).toOption turns parse failures into None
        // Note: .toDF() also needs `import sqlContext.implicits._` on Spark 1.3+

        val data = sparkContext.textFile("local-input/AIRLINE/2008.csv").map(_.split(","))
              .map(l => new AirTraffic(Try(l(0).trim.toInt).toOption, Try(l(1).trim.toInt).toOption, Try(l(2).trim.toInt).toOption, Try(l(3).trim.toInt).toOption,
              Try(l(4).trim.toInt).toOption, Try(l(5).trim.toInt).toOption, Try(l(6).trim.toInt).toOption, Try(l(7).trim.toInt).toOption,
              l(8).trim, Try(l(9).trim.toInt).toOption, l(10).trim, Try(l(11).trim.toInt).toOption,
              Try(l(12).trim.toInt).toOption, Try(l(13).trim.toInt).toOption, Try(l(14).trim.toInt).toOption, Try(l(15).trim.toInt).toOption,
              l(16).trim, l(17).trim, Try(l(18).trim.toInt).toOption, Try(l(19).trim.toInt).toOption, Try(l(20).trim.toInt).toOption,
              Try(l(21).trim.toBoolean).toOption, l(22).trim, Try(l(23).trim.toBoolean).toOption, Try(l(24).trim.toInt).toOption,
              Try(l(25).trim.toInt).toOption, Try(l(26).trim.toInt).toOption, Try(l(27).trim.toInt).toOption, Try(l(28).trim.toInt).toOption)).toDF()
    
            // register table with SQLContext
            data.registerTempTable("AirTraffic")
    
        val count = sqlContext.sql("SELECT COUNT(*) FROM AirTraffic").collect()
            count.foreach(print)
    
    If you think this still looks ugly, we can go a step further with the following:

    import scala.util.Try   // needed for Try(...).toOption below

    implicit class StringConverter(val s: String) extends AnyVal {
        def tryGetInt = Try(s.trim.toInt).toOption
    
        def tryGetString = {
          val res = s.trim
          if (res.isEmpty) None else Some(res)   // wrap in Some so the result is Option[String]
        }
    
        def tryGetBoolean = Try(s.trim.toBoolean).toOption
      }
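
    For reference, assuming StringConverter is in scope, the extension methods behave roughly like this (illustrative values, not part of the original answer):

    "42".tryGetInt          // Some(42)
    "abc".tryGetInt         // None
    " true ".tryGetBoolean  // Some(true)
    "   ".tryGetString      // None -- blank fields become None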
    
    Then:

    val data = sparkContext.textFile("local-input/AIRLINE/2008.csv").map(_.split(","))
          .map(l => new AirTraffic(l(0).tryGetInt, l(1).tryGetInt, l(2).tryGetInt, l(3).tryGetInt,
          l(4).tryGetInt, l(5).tryGetInt, l(6).tryGetInt, l(7).tryGetInt,
          l(8).trim, l(9).tryGetInt, l(10).trim, l(11).tryGetInt,
          l(12).tryGetInt, l(13).tryGetInt, l(14).tryGetInt, l(15).tryGetInt,
          l(16).trim, l(17).trim, l(18).tryGetInt, l(19).tryGetInt, l(20).tryGetInt,
          l(21).tryGetBoolean, l(22).trim, l(23).tryGetBoolean, l(24).tryGetInt,
          l(25).tryGetInt, l(26).tryGetInt, l(27).tryGetInt, l(28).tryGetInt)).toDF()