Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/json/13.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
在Python的spark中将json字符串转换为数据帧_Json_Apache Spark_Dataframe_Pyspark_Databricks - Fatal编程技术网

在Python的spark中将json字符串转换为数据帧

在Python的spark中将json字符串转换为数据帧,json,apache-spark,dataframe,pyspark,databricks,Json,Apache Spark,Dataframe,Pyspark,Databricks,(Databricks上的Apache Spark版本2.3.1) 你好,我有一个JSON转储,看起来像这样 [{"standings": {"visitorteam_position": 1, "localteam_position": 1}, "season_id": 892, "pitch": null, "commentaries": null, "id": 10342083, "venue_id": 273277, "formations": {"localteam_formation

(Databricks上的Apache Spark版本2.3.1)

你好,我有一个JSON转储,看起来像这样

[{"standings": {"visitorteam_position": 1, "localteam_position": 1}, "season_id": 892, "pitch": null, "commentaries": null, "id": 10342083, "venue_id": 273277, "formations": {"localteam_formation": null, "visitorteam_formation": null}, "aggregate_id": null, "round_id": null, "visitorteam_id": 18647, "winning_odds_calculated": false, "deleted": false, "coaches": {"localteam_coach_id": 472158, "visitorteam_coach_id": 474616}, "attendance": null, "scores": {"ft_score": null, "visitorteam_score": 0, "et_score": null, "localteam_pen_score": null, "visitorteam_pen_score": null, "localteam_score": 0, "ht_score": null}, "referee_id": 18783, "stage_id": 1728, "weather_report": null, "league_id": 732, "localteam_id": 15251, "time": {"status": "NS", "starting_at": {"date": "2018-07-06", "date_time": "2018-07-06 14:00:00", "timezone": "UTC", "timestamp": 1530885600, "time": "14:00:00"}, "extra_minute": null, "injury_time": null, "second": null, "added_time": null, "minute": null}, "group_id": null}, {"standings": {"visitorteam_position": 1, "localteam_position": 1}, "season_id": 892, "pitch": null, "commentaries": null, "id": 10344350, "venue_id": 8869, "formations": {"localteam_formation": null, "visitorteam_formation": null}, "aggregate_id": null, "round_id": null, "visitorteam_id": 18743, "winning_odds_calculated": false, "deleted": false, "coaches": {"localteam_coach_id": 474720, "visitorteam_coach_id": 474796}, "attendance": null, "scores": {"ft_score": null, "visitorteam_score": 0, "et_score": null, "localteam_pen_score": null, "visitorteam_pen_score": null, "localteam_score": 0, "ht_score": null}, "referee_id": 16781, "stage_id": 1728, "weather_report": null, "league_id": 732, "localteam_id": 18704, "time": {"status": "NS", "starting_at": {"date": "2018-07-06", "date_time": "2018-07-06 18:00:00", "timezone": "UTC", "timestamp": 1530900000, "time": "18:00:00"}, "extra_minute": null, "injury_time": null, "second": null, "added_time": null, "minute": null}, "group_id": 
null}]
我试图将其直接从变量转换为数据帧,而不是JSON文件上传;主要是因为我从对API的get请求中获取JSON数据

这是我的转换代码-

countries = spark.read.option("multiline", "true").json(json.dumps(ts)).show(false)
这段代码给了我下面的错误,请给我指出正确的方向。我四处查看了一下,但只找到了Scala的解决方案,我想找对应的Python解决办法。

IllegalArgumentException:u'java.net.URISyntaxException:相对路径 在绝对URI中: “[{\”排名\"20%20%5%5%5%22%5%5%5%22%22%22%22%22%22%20%5%22%22%22%22%22%22%5%22%5%22%5%5%5%5%5%22%5%5%5%5%5%5%5%5%5%22%5%5%5%5%22%5%5%5%22%5%22%5%22%5%22%5%22%当地团队团队,当地团队,当地团队,当地团队,地方团队,位置,位置,位置,5%22%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%522%22%22%20%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%C%22%C%22圆形,id%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%C%22%C%22圆形C C C%22圆形C C C C C C C%22圆形,圆形,C C C C C C C C C C%22圆形,圆形,C C C C C C C C C C C C%22圆形,C C C C C C C C C C C C C C C%22圆形,圆形,C C C C C C C C C C C C C C C C C%22圆形,C C C C C C C C C C C C C C C,C C C C C C C C C C C C C C C C C C C C C C C 5C%220%7%5%22%22%22%22%22%20%20%7%7%7%5%22%22%22%20%20%5%5%5%5%22%20%5%5%5%5%5%5%22%5%5%5%22%22%22%22%22%20%20%7%7%7%7%7%5%5%5%5%22%22%22%22%22%22%22%20%22%22%22%22%22%20%20%20%20%22%22%22%22%20%22%22%22%22%22%22%20%20%20%20%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%22%5%22%22%22%22%22%22%22%22%22%22%22 20%5C%22阶段id%5%5%C C%22%22%22%5%5%C%22%22%22%5%5%5%C%22%22%5%5%C%22%22%5%5%C%22%5%5%C%22%22%5%5%5%5%5%5%5%22%22%22%5%5%5%5%5%5%22%5%5%22%5%22%5%22%22%22%22%22%22%20%20%5%5%5%5%5%5%5%5%5%22%5%22%22%22%22%7%22%以下以下以下以下以下以下以下以下以下以下,10%7%8%7%8%8%5%C C C C C C C C C C C%10%联盟联盟联盟联盟联盟联盟联盟联盟,身份身份身份身份身份身份身份身份身份身份身份身份id id id id id id id id%5%5%5%5%5%5 
00:00%5C%22,%20%5C%22时区%5C%22:%20%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%22时间%5%5%22%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%22时间%5%5%5%5%5%22%5%5%5%22时间%5%5%22时间%5%5%5%5%5%5%5%5%22时间%5%5%5%5%5%5%5%5%5%5%5%5%5时间%5%5%5%5%5%5%5%5%5%5时间%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5C%22:%20%7B%5C%22visitorteam_位置%5C%22:%201、%20%5C%22本地团队_位置%5C%22:%201%7D、%20%5C%22季节_id%5C%22:%20892、%20%5C%22音高%5C%22:%20null、%20%5C%22评论%5C%22:%20null、%20%5C%22id%5C%22:%2010344350、%20%5C%22场馆_id%5C%22:%208869、%20%5C%225c%22队形%22:%20%5C%22%225c队形%22%205c%22%205c队形:%20null、%,%20%5%5%5%5%5%5%5%5%5%5%5%5%5%22%22%22%22%22%20%5%22%20%5%5%5%5%C%22%5%5%22%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%22%22%22%22%22%C%22%22%22%5%5%22%5%5%5%5%5%5%5%22%22%5%22%5%5%22%22%22%22%22%20%20%5%5%5%20%22%22%22%5%5%5%5%5%5%5%5%5%5%5%5%8%5%5%22%22%22%22%22%22%22%22%22%22%22%22%22%22%22%22%22%20%20%22%2220%20%5C%5%5%5%5%5%5%5%5%5%C%22%C%22%C%22 ViViVisiToToWeWeC%22%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%C%C%22%22%22%22%5%C%C%C%22%ViViVis VisiToVis Vis Vis Vis Vis ViViVis)Vis旅游团队团队团队团队团队团队团队团队团队团队团队团队的学习学习学习学习学习学习成绩,团队团队学习学习分数分数分数分数分数,10%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5%5.团队团队团队团队团队5C%22:%201728,%20%522%5%5%5%5%C%22%5%5%C%C%C%C%C%C%22%C%C%22%22%20%5%5%C%22%22%20%5%C%22%5%C%22%C%C%22%C%C%C%C%C%C%C%C%C%C%C%22%C%C%22%C%C%C%22%C%C%C%C%C%C%22%C%C%C%C%C%C%C%C%5%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C%C C%22时区%5C%22:%20%5C%22UTC%5C%22%20%5C%22时间戳%5C%22:%201530900000、%20%5C%22时间%5C%22:%20%5C%2218:00:00%5C%22%7D、%20%5C%22额外分钟%5C%22:%20null、%20%5C%22受伤时间%5C%22:%20null、%20%5C%22秒%5C%22:%20null、%20%5C%22追加时间%5C%22:%20null、%20%5C%22%22%22minute%5C%22:%20null、%20%5C%22%7D、%20%5C%22%

输出

print(ts)

print(json.dumps(ts))

提前谢谢

注-这里是关于如何使用Scala的链接-

你说的

我试图将其直接从变量转换为数据帧,而不是JSON文件上载;主要是因为我从一个get请求到一个API获取JSON数据

所以我假设ts是一个变量

ts = """[{"standings": {"visitorteam_position": 1, "localteam_position": 1}, "season_id": 892, "pitch": null, "commentaries": null, "id": 10342083, "venue_id": 273277, "formations": {"localteam_formation": null, "visitorteam_formation": null}, "aggregate_id": null, "round_id": null, "visitorteam_id": 18647, "winning_odds_calculated": false, "deleted": false, "coaches": {"localteam_coach_id": 472158, "visitorteam_coach_id": 474616}, "attendance": null, "scores": {"ft_score": null, "visitorteam_score": 0, "et_score": null, "localteam_pen_score": null, "visitorteam_pen_score": null, "localteam_score": 0, "ht_score": null}, "referee_id": 18783, "stage_id": 1728, "weather_report": null, "league_id": 732, "localteam_id": 15251, "time": {"status": "NS", "starting_at": {"date": "2018-07-06", "date_time": "2018-07-06 14:00:00", "timezone": "UTC", "timestamp": 1530885600, "time": "14:00:00"}, "extra_minute": null, "injury_time": null, "second": null, "added_time": null, "minute": null}, "group_id": null}, {"standings": {"visitorteam_position": 1, "localteam_position": 1}, "season_id": 892, "pitch": null, "commentaries": null, "id": 10344350, "venue_id": 8869, "formations": {"localteam_formation": null, "visitorteam_formation": null}, "aggregate_id": null, "round_id": null, "visitorteam_id": 18743, "winning_odds_calculated": false, "deleted": false, "coaches": {"localteam_coach_id": 474720, "visitorteam_coach_id": 474796}, "attendance": null, "scores": {"ft_score": null, "visitorteam_score": 0, "et_score": null, "localteam_pen_score": null, "visitorteam_pen_score": null, "localteam_score": 0, "ht_score": null}, "referee_id": 16781, "stage_id": 1728, "weather_report": null, "league_id": 732, "localteam_id": 18704, "time": {"status": "NS", "starting_at": {"date": "2018-07-06", "date_time": "2018-07-06 18:00:00", "timezone": "UTC", "timestamp": 1530900000, "time": "18:00:00"}, "extra_minute": null, "injury_time": null, "second": null, "added_time": null, "minute": null}, 
"group_id": null}]"""
现在,
json.dumps(ts)
将给您一个字符串,而
.json(json.dumps(ts))
会把
json.dumps(ts)
当作一个路径,这正是错误消息所提示的

IllegalArgumentException: u'java.net.URISyntaxException: Relative path in absolute URI: "[{\"standings\":%20%7B%5C%22visitorteam_position%5C%22:%201,%20%5C%22localteam_position%5C%22:%201%7D,%20%5C%22season_id%5C%22:%20892,%20%5C

API文档说明如下

…:param path:字符串,表示JSON数据集的路径;或者路径列表;或者存储JSON对象的字符串RDD……

因此,如果您想使用变量
ts
,那么,正如API文档所说,您必须将字符串
json.dumps(ts)
转换为
RDD
,如下所示:

tsRDD = sc.parallelize([ts])
df = spark.read.option('multiline', "true").json(tsRDD)
这样应该能得到正确的数据帧

+------------+----------+----------------+------------+-------+----------+--------+--------+---------+------------+-----+----------+--------+------------+---------+--------+---------+------------------------------------------------------------------------+--------+--------------+--------------+-----------------------+
|aggregate_id|attendance|coaches         |commentaries|deleted|formations|group_id|id      |league_id|localteam_id|pitch|referee_id|round_id|scores      |season_id|stage_id|standings|time                                                                    |venue_id|visitorteam_id|weather_report|winning_odds_calculated|
+------------+----------+----------------+------------+-------+----------+--------+--------+---------+------------+-----+----------+--------+------------+---------+--------+---------+------------------------------------------------------------------------+--------+--------------+--------------+-----------------------+
|null        |null      |[472158, 474616]|null        |false  |[,]       |null    |10342083|732      |15251       |null |18783     |null    |[,,,, 0,, 0]|892      |1728    |[1, 1]   |[,,,,, [2018-07-06, 2018-07-06 14:00:00, 14:00:00, 1530885600, UTC], NS]|273277  |18647         |null          |false                  |
|null        |null      |[474720, 474796]|null        |false  |[,]       |null    |10344350|732      |18704       |null |16781     |null    |[,,,, 0,, 0]|892      |1728    |[1, 1]   |[,,,,, [2018-07-06, 2018-07-06 18:00:00, 18:00:00, 1530900000, UTC], NS]|8869    |18743         |null          |false                  |
+------------+----------+----------------+------------+-------+----------+--------+--------+---------+------------+-----+----------+--------+------------+---------+--------+---------+------------------------------------------------------------------------+--------+--------------+--------------+-----------------------+

root
 |-- aggregate_id: string (nullable = true)
 |-- attendance: string (nullable = true)
 |-- coaches: struct (nullable = true)
 |    |-- localteam_coach_id: long (nullable = true)
 |    |-- visitorteam_coach_id: long (nullable = true)
 |-- commentaries: string (nullable = true)
 |-- deleted: boolean (nullable = true)
 |-- formations: struct (nullable = true)
 |    |-- localteam_formation: string (nullable = true)
 |    |-- visitorteam_formation: string (nullable = true)
 |-- group_id: string (nullable = true)
 |-- id: long (nullable = true)
 |-- league_id: long (nullable = true)
 |-- localteam_id: long (nullable = true)
 |-- pitch: string (nullable = true)
 |-- referee_id: long (nullable = true)
 |-- round_id: string (nullable = true)
 |-- scores: struct (nullable = true)
 |    |-- et_score: string (nullable = true)
 |    |-- ft_score: string (nullable = true)
 |    |-- ht_score: string (nullable = true)
 |    |-- localteam_pen_score: string (nullable = true)
 |    |-- localteam_score: long (nullable = true)
 |    |-- visitorteam_pen_score: string (nullable = true)
 |    |-- visitorteam_score: long (nullable = true)
 |-- season_id: long (nullable = true)
 |-- stage_id: long (nullable = true)
 |-- standings: struct (nullable = true)
 |    |-- localteam_position: long (nullable = true)
 |    |-- visitorteam_position: long (nullable = true)
 |-- time: struct (nullable = true)
 |    |-- added_time: string (nullable = true)
 |    |-- extra_minute: string (nullable = true)
 |    |-- injury_time: string (nullable = true)
 |    |-- minute: string (nullable = true)
 |    |-- second: string (nullable = true)
 |    |-- starting_at: struct (nullable = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- date_time: string (nullable = true)
 |    |    |-- time: string (nullable = true)
 |    |    |-- timestamp: long (nullable = true)
 |    |    |-- timezone: string (nullable = true)
 |    |-- status: string (nullable = true)
 |-- venue_id: long (nullable = true)
 |-- visitorteam_id: long (nullable = true)
 |-- weather_report: string (nullable = true)
 |-- winning_odds_calculated: boolean (nullable = true)
或者您可以将变量保存在文件中并使用

df = spark.read.option('multiline', "true").json(path to the file)
这与上面的建议效果一样好


我希望答案是有帮助的

Hi Ramesh,非常感谢您的回复。我确实尝试了serialize函数,但它返回了损坏的数据,因为我没有在作为参数的ts两边加上方括号。考虑到ts已经是JSON对象的字符串化列表,我们这样做有什么原因吗?

我需要查看数据类型和数据。

我已经在上面的问题中添加了数据转储,您能检查一下吗?

如果ts的格式与您发布的格式相同,那么json.dumps(ts)得到的将是字符串形式的JSON
+------------+----------+----------------+------------+-------+----------+--------+--------+---------+------------+-----+----------+--------+------------+---------+--------+---------+------------------------------------------------------------------------+--------+--------------+--------------+-----------------------+
|aggregate_id|attendance|coaches         |commentaries|deleted|formations|group_id|id      |league_id|localteam_id|pitch|referee_id|round_id|scores      |season_id|stage_id|standings|time                                                                    |venue_id|visitorteam_id|weather_report|winning_odds_calculated|
+------------+----------+----------------+------------+-------+----------+--------+--------+---------+------------+-----+----------+--------+------------+---------+--------+---------+------------------------------------------------------------------------+--------+--------------+--------------+-----------------------+
|null        |null      |[472158, 474616]|null        |false  |[,]       |null    |10342083|732      |15251       |null |18783     |null    |[,,,, 0,, 0]|892      |1728    |[1, 1]   |[,,,,, [2018-07-06, 2018-07-06 14:00:00, 14:00:00, 1530885600, UTC], NS]|273277  |18647         |null          |false                  |
|null        |null      |[474720, 474796]|null        |false  |[,]       |null    |10344350|732      |18704       |null |16781     |null    |[,,,, 0,, 0]|892      |1728    |[1, 1]   |[,,,,, [2018-07-06, 2018-07-06 18:00:00, 18:00:00, 1530900000, UTC], NS]|8869    |18743         |null          |false                  |
+------------+----------+----------------+------------+-------+----------+--------+--------+---------+------------+-----+----------+--------+------------+---------+--------+---------+------------------------------------------------------------------------+--------+--------------+--------------+-----------------------+

root
 |-- aggregate_id: string (nullable = true)
 |-- attendance: string (nullable = true)
 |-- coaches: struct (nullable = true)
 |    |-- localteam_coach_id: long (nullable = true)
 |    |-- visitorteam_coach_id: long (nullable = true)
 |-- commentaries: string (nullable = true)
 |-- deleted: boolean (nullable = true)
 |-- formations: struct (nullable = true)
 |    |-- localteam_formation: string (nullable = true)
 |    |-- visitorteam_formation: string (nullable = true)
 |-- group_id: string (nullable = true)
 |-- id: long (nullable = true)
 |-- league_id: long (nullable = true)
 |-- localteam_id: long (nullable = true)
 |-- pitch: string (nullable = true)
 |-- referee_id: long (nullable = true)
 |-- round_id: string (nullable = true)
 |-- scores: struct (nullable = true)
 |    |-- et_score: string (nullable = true)
 |    |-- ft_score: string (nullable = true)
 |    |-- ht_score: string (nullable = true)
 |    |-- localteam_pen_score: string (nullable = true)
 |    |-- localteam_score: long (nullable = true)
 |    |-- visitorteam_pen_score: string (nullable = true)
 |    |-- visitorteam_score: long (nullable = true)
 |-- season_id: long (nullable = true)
 |-- stage_id: long (nullable = true)
 |-- standings: struct (nullable = true)
 |    |-- localteam_position: long (nullable = true)
 |    |-- visitorteam_position: long (nullable = true)
 |-- time: struct (nullable = true)
 |    |-- added_time: string (nullable = true)
 |    |-- extra_minute: string (nullable = true)
 |    |-- injury_time: string (nullable = true)
 |    |-- minute: string (nullable = true)
 |    |-- second: string (nullable = true)
 |    |-- starting_at: struct (nullable = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- date_time: string (nullable = true)
 |    |    |-- time: string (nullable = true)
 |    |    |-- timestamp: long (nullable = true)
 |    |    |-- timezone: string (nullable = true)
 |    |-- status: string (nullable = true)
 |-- venue_id: long (nullable = true)
 |-- visitorteam_id: long (nullable = true)
 |-- weather_report: string (nullable = true)
 |-- winning_odds_calculated: boolean (nullable = true)
df = spark.read.option('multiline', "true").json(path to the file)