Scala: How do I convert Parquet data back into case classes with Spark?

Tags: scala, apache-spark, apache-spark-sql

I have been using a number of case classes in Spark to save data as Parquet, e.g.:

case class Person(userId: String,
                  technographic: Option[Technographic] = None,
                  geographic: Option[Geographic] = None)

case class Technographic(browsers: Seq[Browser],
                         devices: Seq[Device],
                         oss: Seq[Os])

case class Browser(family: String,
                   major: Option[String] = None,
                   language: String)

...
How do I convert the data on disk back into these case classes?

I need to be able to select multiple columns and explode them so that, for
each list (e.g. browsers), all of its sub-lists have the same length.

For example, given this original data:

Person(userId="1234",
  technographic=Some(Technographic(browsers=Seq(
    Browser(family=Some("IE"), major=Some(7), language=Some("en")),
    Browser(family=None, major=None, language=Some("en-us")),
    Browser(family=Some("Firefox), major=None, language=None)
  )),
  geographic=Some(Geographic(...))
)
I need the browser data, for example, to come out flattened per browser (while
still being able to select all of the other columns).

I could get there if Spark could explode each list item, but at the moment it
only explodes a single list at a time (and in any case, explode does not handle
multiple columns).
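
For reference, here is a minimal sketch of that single-column explode under Spark 1.5 (an assumption-laden illustration, not part of my pipeline; it presumes a SQLContext named sqlContext and the Parquet input path in inputPath):

// Sketch only: explode flattens one array column per call, so this handles
// `browsers` but leaves `devices` and `oss` untouched.
import org.apache.spark.sql.functions.explode

val df = sqlContext.read.parquet(inputPath)
df.select(df("userId"), explode(df("technographic.browsers")).as("browser"))
  .select("userId", "browser.family", "browser.major", "browser.language")
  .show()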

So how, using Spark 1.5.2, do I rebuild the user records (the entire collection of case classes that produced one row of data) from all of this nested, optional data?

One possible approach:

val df = sqlContext.read.parquet(inputPath)
df.registerTempTable("person")
val fields = sqlContext.sql("describe person").collect()
sqlContext.sql("select * from person").map { x =>
  ... // somehow zip `fields` with the values so that I can 
      // access values by column name instead of index 
      // (which is brittle), but how?
}
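
One way to do that zip, sketched as an aside (it simply pairs the schema's field names with the row's values; the accepted answer below resolves names via Row.fieldIndex instead):

// Sketch: pair each column name with the corresponding cell so that values can
// be looked up by name rather than by positional index.
val df = sqlContext.read.parquet(inputPath)
val fieldNames = df.schema.fieldNames          // column names, in order

df.map { row =>
  val byName: Map[String, Any] = fieldNames.zip(row.toSeq).toMap
  byName("userId")                             // name-based access
}.take(3).foreach(println)
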
Given the following type alias and this enrichment of org.apache.spark.sql.Row:

import org.apache.spark.sql.Row

type A[E] = collection.mutable.WrappedArray[E]

implicit class RichRow(val r: Row) {
  // Looks up columns by name; null cells become None.
  def getOpt[T](n: String): Option[T] = {
    if (isNullAt(n)) {
      None
    } else {
      Some(r.getAs[T](n))
    }
  }

  def getStringOpt(n: String) = getOpt[String](n)
  def getString(n: String) = getStringOpt(n).get

  def getIntOpt(n: String) = getOpt[Int](n)
  def getInt(n: String) = getIntOpt(n).get

  def getArray[T](n: String) = r.getAs[A[T]](n)

  def getRow(n: String) = r.getAs[Row](n)
  def getRows(n: String) = r.getAs[A[Row]](n)

  def isNullAt(n: String) = r.isNullAt(r.fieldIndex(n))
}
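
As a quick usage sketch of these helpers (assuming df has been read from the Parquet data above): columns are looked up by name, and getOpt turns null cells into None.

df.map { r =>
  (r.getString("userId"), r.getOpt[Row]("technographic").isDefined)
}.take(3).foreach(println)
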
The parsing can then be organized in a few functions:

def toBrowser(r: Row): Browser = {
  Browser(
    r.getString("family"),
    r.getIntOpt("major"),
    r.getString("language"))
}

def toBrowsers(rows: A[Row]): Seq[Browser] = {
  rows.map(toBrowser)
}

def toTech(r: Row): Tech = {
  Tech(
    toBrowsers(r.getRows("browsers")),
    r.getArray[String]("devices"),
    r.getArray[String]("oss"))
}

def toTechOpt(r: Row): Option[Tech] = {
  Option(r).map(toTech)
}

def toPerson(r: Row): Person = {
  Person(
    r.getString("userId"),
    toTechOpt(r.getRow("tech")),
    r.getStringOpt("geographic"))
}
With that in place you can write:

df.map(toPerson).collect().foreach(println)

  • I have organized the parse functions as "standalone" methods. I would
    normally put them either as an apply in the case class's companion object
    or as implicit value classes for Row; plain functions were simply easier
    to paste into the spark-shell (a sketch of the companion-object variant
    follows this list).

  • Each parse function handles plain columns and arrays directly, but
    delegates to another function when it encounters a collection (Seq and
    Option represent the next nesting level).

  • An implicit class should extend AnyVal, but that cannot be pasted into
    the spark-shell.

Elaborating on the accepted answer: it does not handle nulls correctly. You need to try casting the value to a String to find out whether it is null. That cast only succeeds when the value is null; if the value is non-null it results in a casting exception.

Confused? Here's the code:

import scala.util.{Success, Try}
import org.apache.spark.sql.Row

implicit class RichRow(val r: Row) extends AnyVal {

  def getBoolean(n: String) = r.getAs[Boolean](n)
  def getBooleanOpt(n: String) = Try(r.getString(n)) match {
    // the String cast only succeeds when the cell is null
    case Success(_) => None
    case _ => Option(r.getBoolean(n))
  }

  def getString(n: String) = r.getAs[String](n)
  def getStringOpt(n: String) = Option(r.getString(n))

  def getLong(n: String) = r.getAs[Long](n)
  def getLongOpt(n: String) = Try(r.getString(n)) match {
    case Success(_) => None
    case _ => Option(r.getLong(n))
  }

  def getInt(n: String) = r.getAs[Int](n)
  def getIntOpt(n: String) = Try(r.getString(n)) match {
    case Success(_) => None
    case _ => Option(r.getInt(n))
  }

  def getFloat(n: String) = r.getAs[Float](n)
  def getFloatOpt(n: String) = Try(r.getString(n)) match {
    case Success(_) => None
    case _ => Option(r.getFloat(n))
  }

  // reuses the `A` array alias from the accepted answer
  def getArray[T](n: String) = r.getAs[A[T]](n)

  def getRow(n: String) = r.getAs[Row](n)
  def getRows(n: String): A[Row] = r.getAs[A[Row]](n)
}
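
A brief usage sketch (assuming df and the toPerson/toTech/toBrowser functions from the accepted answer are in scope): with these null-safe getters a null cell simply becomes None instead of failing the whole row, and the end-to-end call is unchanged.

val people = df.map(toPerson)     // DataFrame.map yields an RDD[Person] in Spark 1.5
people.take(3).foreach(println)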

  • Can you post your expected results?

  • Possible duplicate.

  • Thank you. I'm surprised this isn't better documented, since loading the
    data back is such a crucial step and saving it from case classes is so
    simple.

  • You're welcome. I was surprised as well (especially for deeply nested
    optional values), and I didn't find a macro-based solution either.

  • @jbrown I have added RichRow.getOpt[T]: this version uses isNullAt and
    avoids getAs, since the latter uses asInstanceOf. In other words, there
    is only one asInstanceOf, and only when the value really is non-null.