创建数据帧时如何解决scala.MatchError

创建数据帧时如何解决scala.MatchError,scala,dataframe,rdd,case-class,Scala,Dataframe,Rdd,Case Class,我有一个文本文件,它有复杂的结构化行。我正在使用客户转换器,它将给定的字符串(行)转换为Pojo类(countryInfo)。转换后,我正在构建DF。POJO类有一个字段,它是自定义类型的列表(GlobalizedPlayTimeWindows)。我创建了一个与此GlobalizedPlayTimeWindows匹配的结构,并尝试将现有的自定义类型转换为该结构,但不断出现错误 我创建的结构类型: import org.apache.spark.sql.types._ val PlayTim

我有一个文本文件,其中每一行都是结构复杂的记录。我使用一个自定义转换器,把给定的字符串(行)转换为POJO类(countryInfo)。转换之后,我用这些对象构建DataFrame。该POJO类有一个字段,是自定义类型(GlobalizedPlayTimeWindows)的列表。我创建了一个与GlobalizedPlayTimeWindows对应的StructType,并尝试把现有的自定义类型转换为该结构,但不断出现错误。

我创建的结构类型:

import org.apache.spark.sql.types._

  // Struct for one play-time window: a nullable start date and a nullable end date.
  val PlayTimeWindow = StructType(Seq(
    StructField("startTime", DateType, nullable = true),
    StructField("endTime", DateType, nullable = true)
  ))

  // Struct describing one globalized play-time-window entry: a list of country
  // strings, six arrays of PlayTimeWindow structs, and a string-keyed map of
  // PlayTimeWindow arrays. Every field and every element is declared nullable.
  val globalizedPlayTimeWindows = StructType(Seq(
    StructField("countries", ArrayType(StringType, containsNull = true), nullable = true),
    StructField("purchase", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
    StructField("rental", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
    StructField("free", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
    StructField("download", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
    StructField("advertisement", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
    StructField("playTypeIds", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
    StructField(
      "benefitIds",
      MapType(StringType, ArrayType(PlayTimeWindow, containsNull = true), valueContainsNull = true),
      nullable = true)
  ))

  // Top-level row schema: an id, four integer counters, the music type and an
  // array of globalizedPlayTimeWindows structs.
  val schema = StructType(Seq(
    StructField("id", StringType, nullable = true),
    StructField("jazzCount", IntegerType, nullable = true),
    StructField("rockCount", IntegerType, nullable = true),
    StructField("classicCount", IntegerType, nullable = true),
    StructField("nonclassicCount", IntegerType, nullable = true),
    StructField("musicType", StringType, nullable = true),
    StructField("playType", ArrayType(globalizedPlayTimeWindows, containsNull = true), nullable = true)
  ))
数据帧创建:

// Raw mapping lines, one record per line.
val mappingFile = sc.textFile("s3://input.....")

// Parses each line with MappingUtils.getCountryInfo and flattens the result into a
// seven-column Row: id, the four 0/1 counters, the music type and the play-time
// windows. The resulting RDD is persisted on disk only.
val inputData = mappingFile.map { line =>
  val countryInfo = MappingUtils.getCountryInfo(line)
  val id = countryInfo.getId

  // A null or blank music type is replaced by the sentinel string.
  val musicType =
    if (countryInfo.getmusicType == null || !StringUtils.isNotBlank(countryInfo.getmusicType)) "UNKOWN_TYPE"
    else countryInfo.getmusicType

  // Exactly one of classicCount / nonclassicCount is 1 for every record.
  val classicWestern = countryInfo.getClassic() != null && countryInfo.getClassic.size() > 0
  val (classicCount, nonclassicCount) = if (classicWestern) (1, 0) else (0, 1)

  // NOTE(review): jazzCount becomes 1 when the music type is non-null and NOT "JAZZ";
  // everything else (including null) counts as rock. Confirm this is the intended rule.
  val jazzrock = countryInfo.getmusicType() != null && countryInfo.getmusicType != "JAZZ"
  val (jazzCount, rockCount) = if (jazzrock) (1, 0) else (0, 1)

  // The Java list is copied into a Scala List; an absent or empty list becomes null.
  val playType =
    if (countryInfo.getPlayTimeWindows != null && countryInfo.getPlayTimeWindows.size > 0)
      countryInfo.getPlayTimeWindows.asScala.toList
    else
      null

  Row(id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, playType)
}.persist(DISK_ONLY)

// Pairs the Row RDD with the hand-written schema.
val inputDataDF = sqlContext.createDataFrame(inputData, schema)
inputDataDF.printSchema:

root 
|-- id: string (nullable = true) 
|-- jazzCount: integer (nullable = true) 
|-- rockCount: integer (nullable = true) 
|-- classicCount: integer (nullable = true) 
|-- nonclassicCount: integer (nullable = true) 
|-- musicType: string (nullable = true) 
|-- playType: array (nullable = true) 
| |-- element: struct (containsNull = true) 
| | |-- countries: array (nullable = true) 
| | | |-- element: string (containsNull = true) 
| | |-- purchase: array (nullable = true) 
| | | |-- element: struct (containsNull = true) 
| | | | |-- startTime: date (nullable = true) 
| | | | |-- endTime: date (nullable = true) 
| | |-- rental: array (nullable = true) 
| | | |-- element: struct (containsNull = true) 
| | | | |-- startTime: date (nullable = true) 
| | | | |-- endTime: date (nullable = true) 
| | |-- free: array (nullable = true) 
| | | |-- element: struct (containsNull = true) 
| | | | |-- startTime: date (nullable = true) 
| | | | |-- endTime: date (nullable = true) 
| | |-- download: array (nullable = true) 
| | | |-- element: struct (containsNull = true) 
| | | | |-- startTime: date (nullable = true) 
| | | | |-- endTime: date (nullable = true) 
| | |-- advertisement: array (nullable = true) 
| | | |-- element: struct (containsNull = true) 
| | | | |-- startTime: date (nullable = true) 
| | | | |-- endTime: date (nullable = true) 
| | |-- playTypeIds: array (nullable = true) 
| | | |-- element: struct (containsNull = true) 
| | | | |-- startTime: date (nullable = true) 
| | | | |-- endTime: date (nullable = true) 
| | |-- benefitIds: map (nullable = true) 
| | | |-- key: string 
| | | |-- value: array (valueContainsNull = true) 
| | | | |-- element: struct (containsNull = true) 
| | | | | |-- startTime: date (nullable = true) 
| | | | | |-- endTime: date (nullable = true) 
结构的等效POJO:

@Data
public GlobalizedPlayTimeWindows(

    private final List<String> countries;

    private final List<PlayTimeWindow> purchase;

    private final List<PlayTimeWindow> rental;

    private final List<PlayTimeWindow> free;

    private final List<PlayTimeWindow> download;

    private final List<PlayTimeWindow> advertisement;

    private final List<PlayTimeWindow> preorderExclusive;

    private final Map<String, List<PlayTimeWindow>> playTypeIds;

}

/**
 * Immutable start/end pair describing one play-time window.
 *
 * <p>Both fields are {@code private final}; accessors are presumably generated by
 * Lombok's {@code @Data} (the annotation's import is not shown here).
 * NOTE(review): {@code Date} is most likely {@code java.util.Date} - confirm against the imports.
 */
@Data
public class PlayTimeWindow {

    /** Start of the window. */
    private final Date startTime;

    /** End of the window. */
    private final Date endTime;
}

好的-为了缩短长时间的讨论,这里有一个有效的解决方案。基本上,这里有两个不同的问题:

  • 您希望Spark能够将任意Java类解析为数据帧——事实并非如此。Spark只能解析特定的类型,通常是:Scala集合;基本类型(原语);
    java.sql.Date
    ;以及
    scala.Product
    的任何子类,例如所有case类和元组。正如评论中所讨论的,首先要做的是把现有结构转换成这些受支持的类型。

  • 您的
    架构
    也与您的Java类不匹配-存在一些差异:

    • 模式的
      playType
      是一个
      GlobalizedPlayTimeWindows
      数组,而您的代码创建的是单个项而不是数组
    • globalizedPlayTimeWindows
      schema包含的
      benefitIds
      字段在Java类中不存在
    • playTypeIds
      schema是一个数组,而Java类中同名的字段是一个
      Map
  • 因此-我更正了所有这些(更改了模式以匹配数据,您可以选择以不同的方式修复它们,只要它们匹配),并完成了Java类到case类的转换:

    // Corrected schemas.

    // One play-time window: nullable start and end dates.
    val PlayTimeWindow = StructType(Seq(
      StructField("startTime", DateType, nullable = true),
      StructField("endTime", DateType, nullable = true)
    ))

    // One globalized entry: country strings, six arrays of PlayTimeWindow structs
    // and a string-keyed map of PlayTimeWindow arrays. Everything is nullable.
    val globalizedPlayTimeWindows = StructType(Seq(
      StructField("countries", ArrayType(StringType, containsNull = true), nullable = true),
      StructField("purchase", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("rental", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("free", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("download", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("advertisement", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("preorderExclusive", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField(
        "playTypeIds",
        MapType(StringType, ArrayType(PlayTimeWindow, containsNull = true), valueContainsNull = true),
        nullable = true)
    ))

    // Row schema: id, four integer counters, music type and a single
    // globalizedPlayTimeWindows struct (not an array) in the playType column.
    val schema = StructType(Seq(
      StructField("id", StringType, nullable = true),
      StructField("jazzCount", IntegerType, nullable = true),
      StructField("rockCount", IntegerType, nullable = true),
      StructField("classicCount", IntegerType, nullable = true),
      StructField("nonclassicCount", IntegerType, nullable = true),
      StructField("musicType", StringType, nullable = true),
      StructField("playType", globalizedPlayTimeWindows, nullable = true)
    ))
    
    // Scala mirror of one play-time window.
    // Note: the fields are java.sql.Date on purpose - java.util.Date is not supported.
    case class PlayTimeWindowScala(startTime: java.sql.Date, endTime: java.sql.Date)
    
    // Scala mirror of the globalized play-time windows. The field names and their
    // order follow the globalizedPlayTimeWindows struct: countries, the six window
    // lists, then the string-keyed map of window lists.
    case class GlobalizedPlayTimeWindowsScala (countries: List[String],
                                               purchase: List[PlayTimeWindowScala],
                                               rental: List[PlayTimeWindowScala],
                                               free: List[PlayTimeWindowScala],
                                               download: List[PlayTimeWindowScala],
                                               advertisement: List[PlayTimeWindowScala],
                                               preorderExclusive: List[PlayTimeWindowScala],
                                               playTypeIds: Map[String, List[PlayTimeWindowScala]])
    
    // some conversion methods:
    /**
     * Converts a `java.util.Date` into a `java.sql.Date` carrying the same epoch
     * milliseconds.
     *
     * @param jDate the date to convert; may be null
     * @return a new `java.sql.Date`, or null when `jDate` is null, so that an absent
     *         date stays absent instead of raising a NullPointerException
     */
    def toSqlDate(jDate: java.util.Date): java.sql.Date =
      Option(jDate).map(d => new java.sql.Date(d.getTime)).orNull
    
    import scala.collection.JavaConverters._
    
    /**
     * Converts a Java list of `PlayTimeWindow` objects into an immutable Scala list of
     * `PlayTimeWindowScala`.
     *
     * The Java fields are `private final`, so the values are read through the
     * `getStartTime` / `getEndTime` accessors rather than by direct field access.
     *
     * @param l the Java list; may be null
     * @return the converted list, or null when `l` is null (the matching schema
     *         fields are nullable, so an absent list stays absent)
     */
    def toScalaWindowList(l: java.util.List[PlayTimeWindow]): List[PlayTimeWindowScala] =
      Option(l).map { windows =>
        windows.asScala.map { window =>
          PlayTimeWindowScala(toSqlDate(window.getStartTime), toSqlDate(window.getEndTime))
        }.toList
      }.orNull
    
    /**
     * Converts a Java `GlobalizedPlayTimeWindows` into its Scala case-class mirror.
     *
     * The Java fields are `private final`, so every value is read through its getter.
     * The map is rebuilt eagerly with `map` + `toMap` instead of `mapValues`, which
     * only produces a lazy view.
     *
     * @param javaObj the Java object; may be null
     * @return the converted value, or null when `javaObj` is null; a null `countries`
     *         list or `playTypeIds` map is likewise passed through as null
     */
    def toScalaGlobalizedWindows(javaObj: GlobalizedPlayTimeWindows): GlobalizedPlayTimeWindowsScala =
      if (javaObj == null) null
      else GlobalizedPlayTimeWindowsScala(
        countries = Option(javaObj.getCountries).map(_.asScala.toList).orNull,
        purchase = toScalaWindowList(javaObj.getPurchase),
        rental = toScalaWindowList(javaObj.getRental),
        free = toScalaWindowList(javaObj.getFree),
        download = toScalaWindowList(javaObj.getDownload),
        advertisement = toScalaWindowList(javaObj.getAdvertisement),
        preorderExclusive = toScalaWindowList(javaObj.getPreorderExclusive),
        playTypeIds = Option(javaObj.getPlayTypeIds)
          .map(_.asScala.map { case (key, windows) => key -> toScalaWindowList(windows) }.toMap)
          .orNull
      )
    
    // Step 1: parse every input line into a 7-tuple whose last element is still the
    // Java GlobalizedPlayTimeWindows object.
    val parsedJavaData: RDD[(String, Int, Int, Int, Int, String, GlobalizedPlayTimeWindows)] = mappingFile.map(x => {
       // Placeholder: the parsing code is elided here; it must yield the tuple
       // declared in the type annotation above.
    })
    
    // Step 2: convert the Java object into its Scala case-class mirror and wrap the
    // seven values in a Row, in the same order as the fields of `schema`.
    val inputData = parsedJavaData.map{
      case (id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, javaPlayType) =>
        val scalaPlayType = toScalaGlobalizedWindows(javaPlayType)
        Row(id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, scalaPlayType)
    }
    
    // Step 3: build the DataFrame from the Row RDD and the explicit schema.
    val inputDataDF = sqlContext.createDataFrame(inputData, schema)
    

    很难遵循代码中的类型-没有显式的类型注释,
    countryInfo
    的代码缺失。。。尝试调用
    inputData.toDF().printSchema()
    (导入
    sqlContext.implicits._
    )以查看代码正在创建的实际架构,并查找该架构与预期架构之间的差异。我可以使用printSchema查看架构,我在尝试执行show()时遇到此错误您自己创建的模式可能与实际模式不匹配-我建议您不要使用自己的模式,而是让Spark推断出来(可以使用
    toDF
    ),然后比较这两个模式。@TzachZohar Yeah就像您建议的那样尝试过(inputData.toDF().printSchema())但是我得到了:java.lang.UnsupportedOperationException:org.apache.spark.sql.catalyst.ScalaReflection$class.schemaFor(ScalaReflection.scala:718)org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:30)不支持com.model.GlobalizedPlayTimeWindows类型的模式是的,它成功了,你刚刚找到了根本问题!据我所知,Spark只能解析原语、集合和Scala
    Product
    s,它们基本上是case类和元组。它不能解析任意的Java类。如果
    countryInfo.getPlayTimeWindows
    返回
    GlobalizedPlayTimeWindows
    的列表,Spark将无法将其解析为数据帧。尝试将Java POJO转换为Scala case类,非常感谢您的简要解释。
    java.lang.UnsupportedOperationException: Schema for type com.model.global.GlobalizedPlayTimeWindows is not supported at org.apache.spark.sql.catalyst.ScalaReflection$class.schemaFor(ScalaReflection.scala:718) at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:30) at org.apache.spark.sql.catalyst.ScalaReflection$class.schemaFor(ScalaReflection.scala:667) at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:30) at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$schemaFor$1.apply(ScalaReflection.scala:693) at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$schemaFor$1.apply(ScalaReflection.scala:691) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) at 
    
    // Corrected schemas.

    // One play-time window: nullable start and end dates.
    val PlayTimeWindow = StructType(Seq(
      StructField("startTime", DateType, nullable = true),
      StructField("endTime", DateType, nullable = true)
    ))

    // One globalized entry: country strings, six arrays of PlayTimeWindow structs
    // and a string-keyed map of PlayTimeWindow arrays. Everything is nullable.
    val globalizedPlayTimeWindows = StructType(Seq(
      StructField("countries", ArrayType(StringType, containsNull = true), nullable = true),
      StructField("purchase", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("rental", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("free", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("download", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("advertisement", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField("preorderExclusive", ArrayType(PlayTimeWindow, containsNull = true), nullable = true),
      StructField(
        "playTypeIds",
        MapType(StringType, ArrayType(PlayTimeWindow, containsNull = true), valueContainsNull = true),
        nullable = true)
    ))

    // Row schema: id, four integer counters, music type and a single
    // globalizedPlayTimeWindows struct (not an array) in the playType column.
    val schema = StructType(Seq(
      StructField("id", StringType, nullable = true),
      StructField("jazzCount", IntegerType, nullable = true),
      StructField("rockCount", IntegerType, nullable = true),
      StructField("classicCount", IntegerType, nullable = true),
      StructField("nonclassicCount", IntegerType, nullable = true),
      StructField("musicType", StringType, nullable = true),
      StructField("playType", globalizedPlayTimeWindows, nullable = true)
    ))
    
    // Scala mirror of one play-time window.
    // Note: the fields are java.sql.Date on purpose - java.util.Date is not supported.
    case class PlayTimeWindowScala(startTime: java.sql.Date, endTime: java.sql.Date)
    
    // Scala mirror of the globalized play-time windows. The field names and their
    // order follow the globalizedPlayTimeWindows struct: countries, the six window
    // lists, then the string-keyed map of window lists.
    case class GlobalizedPlayTimeWindowsScala (countries: List[String],
                                               purchase: List[PlayTimeWindowScala],
                                               rental: List[PlayTimeWindowScala],
                                               free: List[PlayTimeWindowScala],
                                               download: List[PlayTimeWindowScala],
                                               advertisement: List[PlayTimeWindowScala],
                                               preorderExclusive: List[PlayTimeWindowScala],
                                               playTypeIds: Map[String, List[PlayTimeWindowScala]])
    
    // some conversion methods:
    /**
     * Converts a `java.util.Date` into a `java.sql.Date` carrying the same epoch
     * milliseconds.
     *
     * @param jDate the date to convert; may be null
     * @return a new `java.sql.Date`, or null when `jDate` is null, so that an absent
     *         date stays absent instead of raising a NullPointerException
     */
    def toSqlDate(jDate: java.util.Date): java.sql.Date =
      Option(jDate).map(d => new java.sql.Date(d.getTime)).orNull
    
    import scala.collection.JavaConverters._
    
    /**
     * Converts a Java list of `PlayTimeWindow` objects into an immutable Scala list of
     * `PlayTimeWindowScala`.
     *
     * The Java fields are `private final`, so the values are read through the
     * `getStartTime` / `getEndTime` accessors rather than by direct field access.
     *
     * @param l the Java list; may be null
     * @return the converted list, or null when `l` is null (the matching schema
     *         fields are nullable, so an absent list stays absent)
     */
    def toScalaWindowList(l: java.util.List[PlayTimeWindow]): List[PlayTimeWindowScala] =
      Option(l).map { windows =>
        windows.asScala.map { window =>
          PlayTimeWindowScala(toSqlDate(window.getStartTime), toSqlDate(window.getEndTime))
        }.toList
      }.orNull
    
    /**
     * Converts a Java `GlobalizedPlayTimeWindows` into its Scala case-class mirror.
     *
     * The Java fields are `private final`, so every value is read through its getter.
     * The map is rebuilt eagerly with `map` + `toMap` instead of `mapValues`, which
     * only produces a lazy view.
     *
     * @param javaObj the Java object; may be null
     * @return the converted value, or null when `javaObj` is null; a null `countries`
     *         list or `playTypeIds` map is likewise passed through as null
     */
    def toScalaGlobalizedWindows(javaObj: GlobalizedPlayTimeWindows): GlobalizedPlayTimeWindowsScala =
      if (javaObj == null) null
      else GlobalizedPlayTimeWindowsScala(
        countries = Option(javaObj.getCountries).map(_.asScala.toList).orNull,
        purchase = toScalaWindowList(javaObj.getPurchase),
        rental = toScalaWindowList(javaObj.getRental),
        free = toScalaWindowList(javaObj.getFree),
        download = toScalaWindowList(javaObj.getDownload),
        advertisement = toScalaWindowList(javaObj.getAdvertisement),
        preorderExclusive = toScalaWindowList(javaObj.getPreorderExclusive),
        playTypeIds = Option(javaObj.getPlayTypeIds)
          .map(_.asScala.map { case (key, windows) => key -> toScalaWindowList(windows) }.toMap)
          .orNull
      )
    
    // Step 1: parse every input line into a 7-tuple whose last element is still the
    // Java GlobalizedPlayTimeWindows object.
    val parsedJavaData: RDD[(String, Int, Int, Int, Int, String, GlobalizedPlayTimeWindows)] = mappingFile.map(x => {
       // Placeholder: the parsing code is elided here; it must yield the tuple
       // declared in the type annotation above.
    })
    
    // Step 2: convert the Java object into its Scala case-class mirror and wrap the
    // seven values in a Row, in the same order as the fields of `schema`.
    val inputData = parsedJavaData.map{
      case (id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, javaPlayType) =>
        val scalaPlayType = toScalaGlobalizedWindows(javaPlayType)
        Row(id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, scalaPlayType)
    }
    
    // Step 3: build the DataFrame from the Row RDD and the explicit schema.
    val inputDataDF = sqlContext.createDataFrame(inputData, schema)