Scala: splitting the contents of a DataFrame column on nested JSON data using Spark 1.4


I am having trouble splitting the contents of a DataFrame column using Spark 1.4. The DataFrame was created by reading a nested, complex JSON file. I used df.explode but keep getting an error message. The JSON file is formatted like this:

[   
    {   
        "neid":{  }, 
        "mi":{   
            "mts":"20100609071500Z", 
            "gp":"900", 
            "tMOID":"Aal2Ap", 
            "mt":[  ], 
            "mv":[   
                {   
                    "moid":"ManagedElement=1,TransportNetwork=1,Aal2Sp=1,Aal2Ap=r1552q", 
                    "r": 
                    [ 
                     1, 
                     2, 
                     5 
                     ] 
                }, 
                { 
                    "moid":"ManagedElement=1,TransportNetwork=1,Aal2Sp=1,Aal2Ap=r1542q", 
                    "r": 
                    [ 
                     1, 
                     2, 
                     5 
                     ] 
 } 
            ] 
        } 
    }, 
    {   
        "neid":{   
            "neun":"RC003", 
            "nedn":"SubNetwork=ONRM_RootMo_R,SubNetwork=RC003,MeContext=RC003", 
            "nesw":"CP90831_R9YC/11" 
        }, 
        "mi":{   
            "mts":"20100609071500Z", 
            "gp":"900", 
            "tMOID":"PlugInUnit", 
            "mt":"pmProcessorLoad", 
            "mv":[   
                {   
                    "moid":"ManagedElement=1,Equipment=1,Subrack=MS,Slot=6,PlugInUnit=1", 
                   "r": 
                     [ 1, 2, 5 
                     ] 
                }, 
                {   
                    "moid":"ManagedElement=1,Equipment=1,Subrack=ES-1,Slot=1,PlugInUnit=1", 
                   "r": 
                  [ 1, 2, 5 
                     ] 
                } 
            ] 
        } 
    } 
]
I load it into Spark 1.4 using the following code:

scala> val df = sqlContext.read.json("/Users/xx/target/statsfile.json") 

scala> df.show() 
+--------------------+--------------------+ 
|                  mi|                neid| 
+--------------------+--------------------+ 
|[900,["pmEs","pmS...|[SubNetwork=ONRM_...| 
|[900,["pmIcmpInEr...|[SubNetwork=ONRM_...| 
|[900,pmUnsuccessf...|[SubNetwork=ONRM_...| 
|[900,["pmBwErrBlo...|[SubNetwork=ONRM_...| 
|[900,["pmSctpStat...|[SubNetwork=ONRM_...| 
|[900,["pmLinkInSe...|[SubNetwork=ONRM_...| 
|[900,["pmGrFc","p...|[SubNetwork=ONRM_...| 
|[900,["pmReceived...|[SubNetwork=ONRM_...| 
|[900,["pmIvIma","...|[SubNetwork=ONRM_...| 
|[900,["pmEs","pmS...|[SubNetwork=ONRM_...| 
|[900,["pmEs","pmS...|[SubNetwork=ONRM_...| 
|[900,["pmExisOrig...|[SubNetwork=ONRM_...| 
|[900,["pmHDelayVa...|[SubNetwork=ONRM_...| 
|[900,["pmReceived...|[SubNetwork=ONRM_...| 
|[900,["pmReceived...|[SubNetwork=ONRM_...| 
|[900,["pmAverageR...|[SubNetwork=ONRM_...| 
|[900,["pmDchFrame...|[SubNetwork=ONRM_...| 
|[900,["pmReceived...|[SubNetwork=ONRM_...| 
|[900,["pmNegative...|[SubNetwork=ONRM_...| 
|[900,["pmUsedTbsQ...|[SubNetwork=ONRM_...| 
+--------------------+--------------------+ 
scala> df.printSchema() 
root 
 |-- mi: struct (nullable = true) 
 |    |-- gp: long (nullable = true) 
 |    |-- mt: string (nullable = true) 
 |    |-- mts: string (nullable = true) 
 |    |-- mv: string (nullable = true) 
 |-- neid: struct (nullable = true) 
 |    |-- nedn: string (nullable = true) 
 |    |-- nesw: string (nullable = true) 
 |    |-- neun: string (nullable = true) 

scala> val df1=df.select("mi.mv").show() 
+--------------------+ 
|                  mv| 
+--------------------+ 
|[{"r":[0,0,0],"mo...| 
|{"r":[0,4,0,4],"m...| 
|{"r":5,"moid":"Ma...| 
|[{"r":[2147483647...| 
|{"r":[225,1112986...| 
|[{"r":[83250,0,0,...| 
|[{"r":[1,2,529982...| 
|[{"r":[26998564,0...| 
|[{"r":[0,0,0,0,0,...| 
|[{"r":[0,0,0],"mo...| 
|[{"r":[0,0,0],"mo...| 
|{"r":[0,0,0,0,0,0...| 
|{"r":[0,0,1],"moi...| 
|{"r":[4587,4587],...| 
|[{"r":[180,180],"...| 
|[{"r":["0,0,0,0,0...| 
|{"r":[0,35101,0,0...| 
|[{"r":["0,0,0,0,0...| 
|[{"r":[0,1558],"m...| 
|[{"r":["7484,4870...| 
+--------------------+ 

scala> df1.explode("mv","mvnew")(mv: String => mv.split(",")) 
<console>:1: error: ')' expected but '(' found. 
       df1.explode("mv","mvnew")(mv: String => mv.split(",")) 
                                                       ^ 
<console>:1: error: ';' expected but ')' found. 
       df1.explode("mv","mvnew")(mv: String => mv.split(",")) 

Am I doing something wrong? I need to extract the data under mi.mv into separate columns so that I can apply some transformations.

Remove the String type annotation on mv, as follows:

df1.explode("mv","mvnew")(mv => mv.split(","))

since the typing is already done in the explode definition.
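For reference, a hedged sketch of what "the typing is already done in the explode definition" means. The signature below is quoted from memory of Spark 1.4's DataFrame API, and the explicit type arguments are just one way of pinning the element type if inference struggles, so treat both as approximations rather than an exact quote:

// Roughly the relevant overload in Spark 1.4 (from memory, approximate):
//   def explode[A, B: TypeTag](inputColumn: String, outputColumn: String)(f: A => TraversableOnce[B]): DataFrame
// The element type comes from the curried function argument, so it can also be
// supplied explicitly, assuming df1 is the DataFrame produced by df.select("mi.mv"):
df1.explode[String, String]("mv", "mvnew")(mv => mv.split(",").toSeq)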

UPDATE (see comments): You then get a different error, because df1 is of type Unit rather than DataFrame. You can fix this as follows:

val df1=df.select("mi.mv")
df1.show()

df1.explode...

That is because show() returns a value of type Unit, which is what you were previously trying to run explode on. The above ensures that you run explode on an actual DataFrame.
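In other words (a minimal sketch of the difference; the df1Wrong name is only for illustration):

// df.select(...).show() prints the rows and returns Unit, so this binds a Unit value:
val df1Wrong = df.select("mi.mv").show()
// Keep the DataFrame itself and call show() as a separate statement:
val df1 = df.select("mi.mv")
df1.show()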

I know this question is old, but I have a solution that may be useful to anyone searching for an answer to this problem (as I was). I have been using Spark 1.5 built with Scala 2.10.4.

It seems to be mostly a formatting issue. I reproduced all of the errors above, and what worked for me was

df1.explode("mv","mvnew"){mv: String => mv.asInstanceOf[String].split(",")} 
I don't entirely understand why I need to declare mv as a String twice, and I would be interested if someone cares to explain, but this should let you explode a DataFrame column.
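Putting the pieces together, a minimal end-to-end sketch under the same assumptions (Spark 1.5 built with Scala 2.10.4, running in the spark-shell, with the file path from the question):

val df = sqlContext.read.json("/Users/xx/target/statsfile.json")
val df1 = df.select("mi.mv")               // a DataFrame, not the Unit returned by show()
val exploded = df1.explode("mv", "mvnew") { mv: String =>
  mv.asInstanceOf[String].split(",")       // one new row per comma-separated element, as in the snippet above
}
exploded.show()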

One more gotcha: if you are splitting on a special character (such as "?"), you need to escape it twice. So in the above, splitting on "?" becomes:

df1.explode("mv","mvnew"){mv: String => mv.asInstanceOf[String].split("\\?")} 
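The double escape is needed because split takes a regular expression: "?" is a regex metacharacter, so it needs a backslash escape, and the backslash itself has to be escaped inside the Scala string literal. If you prefer not to hand-escape, java.util.regex.Pattern.quote can build the literal pattern for you (a sketch under the same assumptions as above):

import java.util.regex.Pattern
// Pattern.quote("?") produces a regex that matches the literal "?" character
df1.explode("mv","mvnew"){mv: String => mv.asInstanceOf[String].split(Pattern.quote("?"))}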

I hope this helps someone.

I tried removing the String type, but I still keep getting an error: scala> df1.explode("mv","mvnew")(mv => mv.split(",")) <console>:28: error: value explode is not a member of Unit df1.explode("mv","mvnew")(mv => mv.split(","))
Thanks. The correction above, which creates a valid DataFrame df1, ensures that I run explode on a valid DataFrame. However, I still get an error: scala> val df1=df.select("mi.mv") df1: org.apache.spark.sql.DataFrame = [mv: string] scala> df1.explode("mv","mvnew")(mv => mv.split(",")) <console>:28: error: value split is not a member of Nothing df1.explode("mv","mvnew")(mv => mv.split(","))