创建数据帧时如何解决scala.MatchError
标签:scala, dataframe, rdd, case-class
我有一个文本文件,其中每一行都是结构复杂的记录。我使用一个自定义转换器,把给定的字符串(行)转换为 POJO 类(countryInfo)。转换之后,我用结果构建数据帧(DataFrame)。该 POJO 类有一个字段,它是自定义类型(GlobalizedPlayTimeWindows)的列表。我创建了一个与 GlobalizedPlayTimeWindows 对应的结构(StructType),并尝试把现有的自定义类型转换为该结构,但不断出现 scala.MatchError 错误。
我创建的结构类型:
import org.apache.spark.sql.types._
// Struct for a single play-time window: two nullable date columns.
val PlayTimeWindow = StructType(Seq(
  StructField("startTime", DateType, nullable = true),
  StructField("endTime", DateType, nullable = true)
))

// Struct for one GlobalizedPlayTimeWindows entry, exactly as declared in the question.
val globalizedPlayTimeWindows = {
  val windowList = ArrayType(PlayTimeWindow, containsNull = true)

  // Every one of these columns is a nullable array of PlayTimeWindow structs.
  val windowListFields =
    Seq("purchase", "rental", "free", "download", "advertisement", "playTypeIds")
      .map(name => StructField(name, windowList, nullable = true))

  StructType(
    Seq(StructField("countries", ArrayType(StringType, containsNull = true), nullable = true)) ++
      windowListFields ++
      Seq(StructField("benefitIds", MapType(StringType, windowList, valueContainsNull = true), nullable = true))
  )
}

// Row schema: an id, four integer counters, the music type and an array of the struct above.
val schema = {
  val counterFields =
    Seq("jazzCount", "rockCount", "classicCount", "nonclassicCount")
      .map(name => StructField(name, IntegerType, nullable = true))

  StructType(
    Seq(StructField("id", StringType, nullable = true)) ++
      counterFields ++
      Seq(
        StructField("musicType", StringType, nullable = true),
        StructField("playType", ArrayType(globalizedPlayTimeWindows, containsNull = true), nullable = true)
      )
  )
}
数据帧创建:
val mappingFile = sc.textFile("s3://input.....")

// Parse each input line into a flat tuple holding one value per column of `schema`.
val inputData = mappingFile.map { line =>
  val countryInfo = MappingUtils.getCountryInfo(line)
  val id = countryInfo.getId

  // Fall back to a fixed marker when the music type is null or blank.
  val musicType =
    if (countryInfo.getmusicType != null && StringUtils.isNotBlank(countryInfo.getmusicType)) {
      countryInfo.getmusicType
    } else {
      "UNKOWN_TYPE"
    }

  // Exactly one of the two counters is set to 1, depending on whether a classic list is present.
  val classicWestern = countryInfo.getClassic() != null && countryInfo.getClassic.size() > 0
  val (classicCount, nonclassicCount) = if (classicWestern) (1, 0) else (0, 1)

  // jazzCount is 1 when the music type is non-null and differs from "JAZZ"; otherwise rockCount is 1.
  val jazzrock = countryInfo.getmusicType() != null && countryInfo.getmusicType != "JAZZ"
  val (jazzCount, rockCount) = if (jazzrock) (1, 0) else (0, 1)

  // A missing or empty window list is represented as null.
  val playType =
    if (countryInfo.getPlayTimeWindows != null && countryInfo.getPlayTimeWindows.size > 0) {
      countryInfo.getPlayTimeWindows.asScala.toList
    } else {
      null
    }

  (id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, playType)
}.map { columns =>
  // Wrap the tuple into a generic Row, keeping the column order.
  val (id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, playType) = columns
  Row(id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, playType)
}.persist(DISK_ONLY)

val inputDataDF = sqlContext.createDataFrame(inputData, schema)
inputDataDF.printSchema:
root
|-- id: string (nullable = true)
|-- jazzCount: integer (nullable = true)
|-- rockCount: integer (nullable = true)
|-- classicCount: integer (nullable = true)
|-- nonclassicCount: integer (nullable = true)
|-- musicType: string (nullable = true)
|-- playType: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- countries: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- purchase: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- startTime: date (nullable = true)
| | | | |-- endTime: date (nullable = true)
| | |-- rental: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- startTime: date (nullable = true)
| | | | |-- endTime: date (nullable = true)
| | |-- free: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- startTime: date (nullable = true)
| | | | |-- endTime: date (nullable = true)
| | |-- download: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- startTime: date (nullable = true)
| | | | |-- endTime: date (nullable = true)
| | |-- advertisement: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- startTime: date (nullable = true)
| | | | |-- endTime: date (nullable = true)
| | |-- playTypeIds: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- startTime: date (nullable = true)
| | | | |-- endTime: date (nullable = true)
| | |-- benefitIds: map (nullable = true)
| | | |-- key: string
| | | |-- value: array (valueContainsNull = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- startTime: date (nullable = true)
| | | | | |-- endTime: date (nullable = true)
结构的等效POJO:
@Data
public GlobalizedPlayTimeWindows(
private final List<String> countries;
private final List<PlayTimeWindow> purchase;
private final List<PlayTimeWindow> rental;
private final List<PlayTimeWindow> free;
private final List<PlayTimeWindow> download;
private final List<PlayTimeWindow> advertisement;
private final List<PlayTimeWindow> preorderExclusive;
private final Map<String, List<PlayTimeWindow>> playTypeIds;
}
/**
 * A single play-time window delimited by a start and an end date.
 *
 * Both fields are private and final; accessors are presumably generated by
 * the {@code @Data} annotation (looks like Lombok - confirm against the build).
 */
@Data
public class PlayTimeWindow {
// Start of the window.
private final Date startTime;
// End of the window.
private final Date endTime;
}
好的——长话短说,下面是一个可行的解决方案。基本上,这里有两个不同的问题:
1. 您期望 Spark 能把任意 Java 类解析成数据帧,但事实并非如此。Spark 只能解析特定的类型,大致包括:基本类型;Scala 集合;java.sql.Date;以及 scala.Product 的任何子类,例如所有 case 类和元组。因此,正如评论中所讨论的,首先要做的是把现有结构转换成这些类型。
2. 您的 schema 也与您的 Java 类不匹配,存在以下几处差异:
- schema 中的 playType 是一个 GlobalizedPlayTimeWindows 数组,而您的代码创建的是单个项而不是数组;
- globalizedPlayTimeWindows 的 schema 包含 benefitIds,而它在 Java 类中并不存在;
- schema 中的 playTypeIds 是一个数组,而 Java 类中同名的字段是一个 Map。
// corrected schemas:
// Struct for a single play-time window: two nullable date columns.
val PlayTimeWindow = StructType(Seq(
  StructField("startTime", DateType, nullable = true),
  StructField("endTime", DateType, nullable = true)
))

// Struct mirroring the Java GlobalizedPlayTimeWindows class field by field.
val globalizedPlayTimeWindows = {
  val windowList = ArrayType(PlayTimeWindow, containsNull = true)

  // Every one of these columns is a nullable array of PlayTimeWindow structs.
  val windowListFields =
    Seq("purchase", "rental", "free", "download", "advertisement", "preorderExclusive")
      .map(name => StructField(name, windowList, nullable = true))

  StructType(
    Seq(StructField("countries", ArrayType(StringType, containsNull = true), nullable = true)) ++
      windowListFields ++
      Seq(StructField("playTypeIds", MapType(StringType, windowList, valueContainsNull = true), nullable = true))
  )
}

// Row schema: playType is a single struct here, not an array of structs.
val schema = {
  val counterFields =
    Seq("jazzCount", "rockCount", "classicCount", "nonclassicCount")
      .map(name => StructField(name, IntegerType, nullable = true))

  StructType(
    Seq(StructField("id", StringType, nullable = true)) ++
      counterFields ++
      Seq(
        StructField("musicType", StringType, nullable = true),
        StructField("playType", globalizedPlayTimeWindows, nullable = true)
      )
  )
}
/**
 * Scala counterpart of the Java `PlayTimeWindow` POJO.
 *
 * Note the use of `java.sql.Date`: `java.util.Date` is not supported by Spark SQL,
 * so both bounds are stored as `java.sql.Date`.
 */
case class PlayTimeWindowScala(startTime: java.sql.Date, endTime: java.sql.Date)
/**
 * Scala counterpart of the Java `GlobalizedPlayTimeWindows` POJO.
 *
 * Field names and order follow the `globalizedPlayTimeWindows` struct schema:
 * one list of countries, six lists of windows and a map from a string key to a list of windows.
 */
case class GlobalizedPlayTimeWindowsScala (countries: List[String],
purchase: List[PlayTimeWindowScala],
rental: List[PlayTimeWindowScala],
free: List[PlayTimeWindowScala],
download: List[PlayTimeWindowScala],
advertisement: List[PlayTimeWindowScala],
preorderExclusive: List[PlayTimeWindowScala],
playTypeIds: Map[String, List[PlayTimeWindowScala]])
// some conversion methods:
/**
 * Converts a `java.util.Date` into a `java.sql.Date` carrying the same epoch milliseconds.
 *
 * The date columns of the schema are nullable, so a missing date is passed through
 * as `null` instead of failing with a NullPointerException.
 *
 * @param jDate the date to convert; may be `null`
 * @return a `java.sql.Date` for the same instant, or `null` when `jDate` is `null`
 */
def toSqlDate(jDate: java.util.Date): java.sql.Date =
  Option(jDate).map(d => new java.sql.Date(d.getTime)).orNull
import scala.collection.JavaConverters._
/**
 * Converts a Java list of `PlayTimeWindow` POJOs into an immutable Scala list of
 * `PlayTimeWindowScala` case-class instances.
 *
 * The POJO declares its fields as private, so the values are read through the
 * `getStartTime` / `getEndTime` accessors (assumed to be generated by the POJO's
 * `@Data` annotation) rather than by direct field access.
 *
 * @param l the Java list to convert; may be `null`
 * @return the converted list, or `null` when `l` is `null` so that a missing list
 *         stays a null (nullable) column value rather than becoming an empty array
 */
def toScalaWindowList(l: java.util.List[PlayTimeWindow]): List[PlayTimeWindowScala] =
  Option(l).map { windows =>
    windows.asScala.map { window =>
      PlayTimeWindowScala(toSqlDate(window.getStartTime), toSqlDate(window.getEndTime))
    }.toList
  }.orNull
/**
 * Converts a Java `GlobalizedPlayTimeWindows` POJO into its Scala case-class counterpart.
 *
 * The POJO declares its fields as private, so the values are read through getters
 * (assumed to be generated by the POJO's `@Data` annotation) rather than by direct
 * field access. Missing values are passed through as `null`, which the nullable
 * schema columns accept.
 *
 * @param javaObj the POJO to convert; may be `null`
 * @return the converted value, or `null` when `javaObj` is `null`
 */
def toScalaGlobalizedWindows(javaObj: GlobalizedPlayTimeWindows): GlobalizedPlayTimeWindowsScala =
  Option(javaObj).map { src =>
    GlobalizedPlayTimeWindowsScala(
      countries = Option(src.getCountries).map(names => names.asScala.toList).orNull,
      purchase = toScalaWindowList(src.getPurchase),
      rental = toScalaWindowList(src.getRental),
      free = toScalaWindowList(src.getFree),
      download = toScalaWindowList(src.getDownload),
      advertisement = toScalaWindowList(src.getAdvertisement),
      preorderExclusive = toScalaWindowList(src.getPreorderExclusive),
      // A strict map + toMap builds an immutable Map directly instead of going
      // through the lazy view returned by mapValues.
      playTypeIds = Option(src.getPlayTypeIds).map { byId =>
        byId.asScala.map { case (key, windows) => key -> toScalaWindowList(windows) }.toMap
      }.orNull
    )
  }.orNull
// One tuple per input line; the last element is still the Java POJO.
val parsedJavaData: RDD[(String, Int, Int, Int, Int, String, GlobalizedPlayTimeWindows)] = mappingFile.map(x => {
  // your code producing the tuple
})

// Swap the Java POJO for its Scala case-class counterpart and wrap each tuple in a Row.
val inputData = parsedJavaData.map { parsed =>
  val (id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, javaPlayType) = parsed
  Row(id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, toScalaGlobalizedWindows(javaPlayType))
}

// With Spark-compatible values in every column, the explicit schema can now be applied.
val inputDataDF = sqlContext.createDataFrame(inputData, schema)
评论:
- 很难看清代码中的类型——没有显式的类型注解,countryInfo 的代码也缺失……请尝试调用 inputData.toDF().printSchema()(需要先 import sqlContext.implicits._),查看代码实际生成的 schema,并找出它与预期 schema 之间的差异。
- 我可以用 printSchema 查看 schema,但在尝试执行 show() 时遇到了这个错误。
- 您自己创建的 schema 可能与实际的 schema 不匹配——我建议您先不要使用自己的 schema,而是让 Spark 自行推断(可以使用 toDF),然后比较这两个 schema。
- @TzachZohar 是的,我按您的建议试过了(inputData.toDF().printSchema()),但得到的是:java.lang.UnsupportedOperationException: Schema for type com.model.GlobalizedPlayTimeWindows is not supported(不支持 com.model.GlobalizedPlayTimeWindows 类型的 schema),位于 org.apache.spark.sql.catalyst.ScalaReflection$class.schemaFor(ScalaReflection.scala:718)、org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:30)。
- 是的,它起作用了,您刚刚找到了根本问题!据我所知,Spark 只能解析基本类型、集合和 Scala Product(基本上就是 case 类和元组),不能解析任意的 Java 类。如果 countryInfo.getPlayTimeWindows 返回的是 GlobalizedPlayTimeWindows 的列表,Spark 就无法把它解析为数据帧。请尝试把 Java POJO 转换为 Scala case 类。
- 非常感谢您的简要解释。
java.lang.UnsupportedOperationException: Schema for type com.model.global.GlobalizedPlayTimeWindows is not supported at org.apache.spark.sql.catalyst.ScalaReflection$class.schemaFor(ScalaReflection.scala:718) at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:30) at org.apache.spark.sql.catalyst.ScalaReflection$class.schemaFor(ScalaReflection.scala:667) at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:30) at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$schemaFor$1.apply(ScalaReflection.scala:693) at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$schemaFor$1.apply(ScalaReflection.scala:691) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) at
// corrected schemas:
// Struct for a single play-time window: two nullable date columns.
val PlayTimeWindow = StructType(Seq(
  StructField("startTime", DateType, nullable = true),
  StructField("endTime", DateType, nullable = true)
))

// Struct mirroring the Java GlobalizedPlayTimeWindows class field by field.
val globalizedPlayTimeWindows = {
  val windowList = ArrayType(PlayTimeWindow, containsNull = true)

  // Every one of these columns is a nullable array of PlayTimeWindow structs.
  val windowListFields =
    Seq("purchase", "rental", "free", "download", "advertisement", "preorderExclusive")
      .map(name => StructField(name, windowList, nullable = true))

  StructType(
    Seq(StructField("countries", ArrayType(StringType, containsNull = true), nullable = true)) ++
      windowListFields ++
      Seq(StructField("playTypeIds", MapType(StringType, windowList, valueContainsNull = true), nullable = true))
  )
}

// Row schema: playType is a single struct here, not an array of structs.
val schema = {
  val counterFields =
    Seq("jazzCount", "rockCount", "classicCount", "nonclassicCount")
      .map(name => StructField(name, IntegerType, nullable = true))

  StructType(
    Seq(StructField("id", StringType, nullable = true)) ++
      counterFields ++
      Seq(
        StructField("musicType", StringType, nullable = true),
        StructField("playType", globalizedPlayTimeWindows, nullable = true)
      )
  )
}
/**
 * Scala counterpart of the Java `PlayTimeWindow` POJO.
 *
 * Note the use of `java.sql.Date`: `java.util.Date` is not supported by Spark SQL,
 * so both bounds are stored as `java.sql.Date`.
 */
case class PlayTimeWindowScala(startTime: java.sql.Date, endTime: java.sql.Date)
/**
 * Scala counterpart of the Java `GlobalizedPlayTimeWindows` POJO.
 *
 * Field names and order follow the `globalizedPlayTimeWindows` struct schema:
 * one list of countries, six lists of windows and a map from a string key to a list of windows.
 */
case class GlobalizedPlayTimeWindowsScala (countries: List[String],
purchase: List[PlayTimeWindowScala],
rental: List[PlayTimeWindowScala],
free: List[PlayTimeWindowScala],
download: List[PlayTimeWindowScala],
advertisement: List[PlayTimeWindowScala],
preorderExclusive: List[PlayTimeWindowScala],
playTypeIds: Map[String, List[PlayTimeWindowScala]])
// some conversion methods:
/**
 * Converts a `java.util.Date` into a `java.sql.Date` carrying the same epoch milliseconds.
 *
 * The date columns of the schema are nullable, so a missing date is passed through
 * as `null` instead of failing with a NullPointerException.
 *
 * @param jDate the date to convert; may be `null`
 * @return a `java.sql.Date` for the same instant, or `null` when `jDate` is `null`
 */
def toSqlDate(jDate: java.util.Date): java.sql.Date =
  Option(jDate).map(d => new java.sql.Date(d.getTime)).orNull
import scala.collection.JavaConverters._
/**
 * Converts a Java list of `PlayTimeWindow` POJOs into an immutable Scala list of
 * `PlayTimeWindowScala` case-class instances.
 *
 * The POJO declares its fields as private, so the values are read through the
 * `getStartTime` / `getEndTime` accessors (assumed to be generated by the POJO's
 * `@Data` annotation) rather than by direct field access.
 *
 * @param l the Java list to convert; may be `null`
 * @return the converted list, or `null` when `l` is `null` so that a missing list
 *         stays a null (nullable) column value rather than becoming an empty array
 */
def toScalaWindowList(l: java.util.List[PlayTimeWindow]): List[PlayTimeWindowScala] =
  Option(l).map { windows =>
    windows.asScala.map { window =>
      PlayTimeWindowScala(toSqlDate(window.getStartTime), toSqlDate(window.getEndTime))
    }.toList
  }.orNull
/**
 * Converts a Java `GlobalizedPlayTimeWindows` POJO into its Scala case-class counterpart.
 *
 * The POJO declares its fields as private, so the values are read through getters
 * (assumed to be generated by the POJO's `@Data` annotation) rather than by direct
 * field access. Missing values are passed through as `null`, which the nullable
 * schema columns accept.
 *
 * @param javaObj the POJO to convert; may be `null`
 * @return the converted value, or `null` when `javaObj` is `null`
 */
def toScalaGlobalizedWindows(javaObj: GlobalizedPlayTimeWindows): GlobalizedPlayTimeWindowsScala =
  Option(javaObj).map { src =>
    GlobalizedPlayTimeWindowsScala(
      countries = Option(src.getCountries).map(names => names.asScala.toList).orNull,
      purchase = toScalaWindowList(src.getPurchase),
      rental = toScalaWindowList(src.getRental),
      free = toScalaWindowList(src.getFree),
      download = toScalaWindowList(src.getDownload),
      advertisement = toScalaWindowList(src.getAdvertisement),
      preorderExclusive = toScalaWindowList(src.getPreorderExclusive),
      // A strict map + toMap builds an immutable Map directly instead of going
      // through the lazy view returned by mapValues.
      playTypeIds = Option(src.getPlayTypeIds).map { byId =>
        byId.asScala.map { case (key, windows) => key -> toScalaWindowList(windows) }.toMap
      }.orNull
    )
  }.orNull
// One tuple per input line; the last element is still the Java POJO.
val parsedJavaData: RDD[(String, Int, Int, Int, Int, String, GlobalizedPlayTimeWindows)] = mappingFile.map(x => {
  // your code producing the tuple
})

// Swap the Java POJO for its Scala case-class counterpart and wrap each tuple in a Row.
val inputData = parsedJavaData.map { parsed =>
  val (id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, javaPlayType) = parsed
  Row(id, jazzCount, rockCount, classicCount, nonclassicCount, musicType, toScalaGlobalizedWindows(javaPlayType))
}

// With Spark-compatible values in every column, the explicit schema can now be applied.
val inputDataDF = sqlContext.createDataFrame(inputData, schema)