Reading a JSON file in Spark where each line starts with garbage characters

I have a file containing data like this:

<1>2019-03-20T20:59:59Z daily_report.txt[102852]: { "ts": "1553115599", "data": {"field1": "value11", "field21": "value12"} }
<2>2019-03-20T20:59:59Z daily_report.txt[102852]: { "ts": "1553115599", "data": {"field1": "value21", "field2": "value22"} }
<3>2019-03-20T20:59:59Z daily_report.txt[102852]: { "ts": "1553115599", "data": {"field1": "value31", "field2": "value32"} }

Normally in Spark I could just do spark.read.json("inputs.json"), but because each line is prefixed with that garbage, I can't. Is there a way to chop off the leading part, or better yet, to include the garbage as a column in my DataFrame?
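
For reference, a minimal sketch of why the direct read does not help here (assuming Spark's default PERMISSIVE JSON parsing, where lines that fail to parse only surface in the corrupt-record column):

// Hypothetical check: every line of this file starts with "<n>..." and is therefore
// not valid JSON, so Spark infers nothing but the corrupt-record column.
val raw = spark.read
  .option("mode", "PERMISSIVE")
  .option("columnNameOfCorruptRecord", "_corrupt_record")
  .json("inputs.json")

raw.printSchema()   // root |-- _corrupt_record: string (nullable = true)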

You'll have to read the data in as a Dataset[String] and parse the columns out yourself. Once that's done, create a schema for the json payload and use Spark's built-in from_json() function:

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.from_json
import spark.implicits._   // for createDataset, toDF and the $ column syntax

val ds = spark.createDataset(Seq(
    "<1>2019-03-20T20:59:59Z daily_report.txt[102852]: { \"ts\": \"1553115599\", \"data\": {\"field1\": \"value11\", \"field2\": \"value12\"} }",
    "<2>2019-03-20T20:59:59Z daily_report.txt[102852]: { \"ts\": \"1553115599\", \"data\": {\"field1\": \"value21\", \"field2\": \"value22\"} }",
    "<3>2019-03-20T20:59:59Z daily_report.txt[102852]: { \"ts\": \"1553115599\", \"data\": {\"field1\": \"value31\", \"field2\": \"value32\"} }"
))

//val ds = spark.read.text("inputs.txt").as[String]
val schema = StructType(List(StructField("ts", StringType), StructField("data", StructType(List(StructField("field1", StringType), StructField("field2", StringType))))))

val df = ds.map(r => {
    // split each line at the first '{': everything before it is the garbage prefix,
    // the rest is the raw JSON payload
    val j = r.indexOf("{") - 1
    (r.substring(0, j), r.substring(j, r.length))
}).toDF("garbage", "json")

df.withColumn("data", from_json($"json", schema)).select("garbage", "data").show(false)
With the schema:

root
 |-- garbage: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- ts: string (nullable = true)
 |    |-- data: struct (nullable = true)
 |    |    |-- field1: string (nullable = true)
 |    |    |-- field2: string (nullable = true)
If you really don't need the garbage data, you can use the spark.read.json() you're already used to and pass it the Dataset[String]. That doesn't require defining a schema, because it will be inferred:

val data = spark.read.json(df.select("json").as[String])
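
As a further sketch along the same lines, the schema that spark.read.json inferred above can be fed back into from_json, so you keep the garbage column without writing the schema out by hand:

// Sketch: reuse the inferred schema (data.schema) to parse the json column while
// keeping the garbage prefix alongside the parsed struct.
val combined = df.withColumn("parsed", from_json($"json", data.schema))
  .select("garbage", "parsed")

combined.printSchema()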

Another approach: get the schema dynamically from a sample JSON record, and use the regex function regexp_extract() to strip off the garbage string.

Check this out:

scala> val df = Seq(( """<1>2019-03-20T20:59:59Z daily_report.txt[102852]: { "ts": "1553115599", "data": {"field1": "value11", "field2": "value12"} }"""),
     | ("""<2>2019-03-20T20:59:59Z daily_report.txt[102852]: { "ts": "1553115599", "data": {"field1": "value21", "field2": "value22"} }"""),
     | ("""<3>2019-03-20T20:59:59Z daily_report.txt[102852]: { "ts": "1553115599", "data": {"field1": "value31", "field2": "value32"} }""")).toDF("data_garb")
df: org.apache.spark.sql.DataFrame = [data_garb: string]

scala> val json_str = """{ "ts": "1553115599", "data": {"field1": "value11", "field2": "value12"} }"""
json_str: String = { "ts": "1553115599", "data": {"field1": "value11", "field2": "value12"} }

scala> val dfj = spark.read.json(Seq(json_str).toDS)
dfj: org.apache.spark.sql.DataFrame = [data: struct<field1: string, field2: string>, ts: string]

scala> dfj.schema
res44: org.apache.spark.sql.types.StructType = StructType(StructField(data,StructType(StructField(field1,StringType,true), StructField(field2,StringType,true)),true), StructField(ts,StringType,true))

scala> val df2=df.withColumn("newc",regexp_extract('data_garb,""".*?(\{.*)""",1)).withColumn("newc",from_json('newc,dfj.schema)).drop("data_garb")
df2: org.apache.spark.sql.DataFrame = [newc: struct<data: struct<field1: string, field2: string>, ts: string>]

scala> df2.show(false)
+--------------------------------+
|newc                            |
+--------------------------------+
|[[value11, value12], 1553115599]|
|[[value21, value22], 1553115599]|
|[[value31, value32], 1553115599]|
+--------------------------------+
You can also query the nested fields by referring to them explicitly:

scala> df2.select($"newc.ts",$"newc.data.field1",$"newc.data.field2").show(false)
+----------+-------+-------+
|ts        |field1 |field2 |
+----------+-------+-------+
|1553115599|value11|value12|
|1553115599|value21|value22|
|1553115599|value31|value32|
+----------+-------+-------+
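
If the prefix is actually worth keeping, as the question asks, the same regexp_extract() idea extends to splitting it into columns of its own. A rough sketch continuing the spark-shell session above; the column names seq and event_time are made up for illustration:

val df3 = df
  .withColumn("seq",        regexp_extract($"data_garb", """^<(\d+)>""", 1))
  .withColumn("event_time", regexp_extract($"data_garb", """^<\d+>(\S+)""", 1))
  .withColumn("payload",    from_json(regexp_extract($"data_garb", """.*?(\{.*)""", 1), dfj.schema))
  .drop("data_garb")

df3.select($"seq", $"event_time", $"payload.ts", $"payload.data.field1").show(false)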


