Apache spark Pyspark在写入磁盘时丢失数据

Apache spark Pyspark在写入磁盘时丢失数据,apache-spark,hadoop,pyspark,apache-spark-sql,Apache Spark,Hadoop,Pyspark,Apache Spark Sql,下面是我的代码片段。将数据写入磁盘后。计数不匹配。我尝试过重新分区(1),但仍然丢失了大量数据 df_reddit_max_utc = df.select('subreddit', 'created_utc').groupby('subreddit').agg(f.max('created_utc').alias('max_utc_created')) df_reddit_max_utc.count() # 3519289 编辑: 原始数据集 {"all_awardings&quo

下面是我的代码片段。将数据写入磁盘后,计数不匹配。我尝试过 repartition(1),但仍然丢失了大量数据。

# For every subreddit, keep the largest created_utc value seen.
df_reddit_max_utc = (
    df.select('subreddit', 'created_utc')
    .groupby('subreddit')
    .agg(f.max('created_utc').alias('max_utc_created'))
)

df_reddit_max_utc.count()
# 3519289
编辑:

原始数据集

{"all_awardings":[],"allow_live_comments":false,"archived":false,"author":"yutacustoms","author_created_utc":1469348679,"author_flair_background_color":null,"author_flair_css_class":null,"author_flair_richtext":[],"author_flair_template_id":null,"author_flair_text":null,"author_flair_text_color":null,"author_flair_type":"text","author_fullname":"t2_zrj8k","author_patreon_flair":false,"can_gild":true,"can_mod_post":false,"category":null,"content_categories":null,"contest_mode":false,"created_utc":1564681493,"discussion_type":null,"distinguished":null,"domain":"etsy.com","edited":false,"event_end":1564686000.0,"event_is_live":false,"event_start":1564671600.0,"gilded":0,"gildings":{},"hidden":false,"id":"bvzcw9","is_crosspostable":true,"is_meta":false,"is_original_content":false,"is_reddit_media_domain":false,"is_robot_indexable":true,"is_self":false,"is_video":false,"link_flair_background_color":"","link_flair_css_class":null,"link_flair_richtext":[],"link_flair_text":null,"link_flair_text_color":"dark","link_flair_type":"text","locked":false,"media":null,"media_embed":{},"media_only":false,"no_follow":true,"num_comments":0,"num_crossposts":0,"over_18":false,"parent_whitelist_status":null,"permalink":"\/r\/yutacustoms\/comments\/bvzcw9\/end_eclipsedarksouls_art_original_art_oneofakind\/","pinned":false,"post_hint":"link","preview":{"enabled":false,"images":[{"id":"SfR2A2d_FDlAeCklFc64mCo7sBT_zvnnC7rgaaIHbQw","resolutions":[{"height":135,"url":"https:\/\/external-preview.redd.it\/Ef8yUA_EHwgpXsZ9CYIeaLJezE8aBRMjtXQRF3IFbLU.jpg?width=108&crop=smart&auto=webp&s=7daaffa01308287fc5049885fee97259383ed6a0","width":108},{"height":270,"url":"https:\/\/external-preview.redd.it\/Ef8yUA_EHwgpXsZ9CYIeaLJezE8aBRMjtXQRF3IFbLU.jpg?width=216&crop=smart&auto=webp&s=821c886a6bb6d2d5ee9e512e65685e6b9034df15","width":216},{"height":400,"url":"https:\/\/external-preview.redd.it\/Ef8yUA_EHwgpXsZ9CYIeaLJezE8aBRMjtXQRF3IFbLU.jpg?width=320&crop=smart&auto=webp&s=3f2a01a19976f8b809b4516c09743a0
c31c2f800","width":320}],"source":{"height":713,"url":"https:\/\/external-preview.redd.it\/Ef8yUA_EHwgpXsZ9CYIeaLJezE8aBRMjtXQRF3IFbLU.jpg?auto=webp&s=91fcda37e4fd9a63c70910de753f4857421742e0","width":570},"variants":{}}]},"pwls":null,"quarantine":false,"removal_reason":null,"retrieved_on":1565755310,"score":1,"secure_media":null,"secure_media_embed":{},"selftext":"","send_replies":false,"spoiler":false,"stickied":false,"subreddit":"yutacustoms","subreddit_id":"t5_4br13","subreddit_name_prefixed":"r\/yutacustoms","subreddit_subscribers":104,"subreddit_type":"public","suggested_sort":null,"thumbnail":"https:\/\/a.thumbs.redditmedia.com\/R9ZVe9Fxk6mGG0-U8wdLBt6XTm1HQQae7LmProxlod8.jpg","thumbnail_height":140,"thumbnail_width":140,"title":"End Eclipse-DarkSouls Art\/\/ original art\/\/ oneofakind\/\/","total_awards_received":0,"url":"https:\/\/www.etsy.com\/ca\/listing\/690795754\/end-eclipse-darksouls-art-original-art?ref=shop_home_active_24&frs=1","whitelist_status":null,"wls":null}
df_reddit_max_utc数据集:

如何读取数据集: 因为使用 spark.read.json 加载全部数据需要很多时间,所以我首先使用一个较小的文件 small_dataset.bz2 来获取完整目录的模式(schema),然后再使用该模式读取完整的目录。

# Read only the small sample archive so Spark can infer a schema from it.
df_schema_retrieve = spark.read.json('/corpus/small_dataset.bz2')

# Serialize the inferred schema to a JSON string and save it to a local file.
sch = df_schema_retrieve.schema.json()

with open('subreddit_schema.json', 'w') as schema_file:
    schema_file.write(sch)


你能提供你的数据集的样本吗?我认为您的数据集很有可能以某种方式损坏,或者混乱的模式可能导致出现空值和空字段,而这些值和字段没有被正确保存。

计数和写入之间有什么代码?

@Chris 计数和写入之间没有代码。我是照原样复制的。

@PhanChuong 我已经添加了示例数据集,请看一看。

@PhanChuong 我已经添加了从头开始的代码,以便您更好地了解情况。请看一看。
{"all_awardings":[],"allow_live_comments":false,"archived":false,"author":"yutacustoms","author_created_utc":1469348679,"author_flair_background_color":null,"author_flair_css_class":null,"author_flair_richtext":[],"author_flair_template_id":null,"author_flair_text":null,"author_flair_text_color":null,"author_flair_type":"text","author_fullname":"t2_zrj8k","author_patreon_flair":false,"can_gild":true,"can_mod_post":false,"category":null,"content_categories":null,"contest_mode":false,"created_utc":1564681493,"discussion_type":null,"distinguished":null,"domain":"etsy.com","edited":false,"event_end":1564686000.0,"event_is_live":false,"event_start":1564671600.0,"gilded":0,"gildings":{},"hidden":false,"id":"bvzcw9","is_crosspostable":true,"is_meta":false,"is_original_content":false,"is_reddit_media_domain":false,"is_robot_indexable":true,"is_self":false,"is_video":false,"link_flair_background_color":"","link_flair_css_class":null,"link_flair_richtext":[],"link_flair_text":null,"link_flair_text_color":"dark","link_flair_type":"text","locked":false,"media":null,"media_embed":{},"media_only":false,"no_follow":true,"num_comments":0,"num_crossposts":0,"over_18":false,"parent_whitelist_status":null,"permalink":"\/r\/yutacustoms\/comments\/bvzcw9\/end_eclipsedarksouls_art_original_art_oneofakind\/","pinned":false,"post_hint":"link","preview":{"enabled":false,"images":[{"id":"SfR2A2d_FDlAeCklFc64mCo7sBT_zvnnC7rgaaIHbQw","resolutions":[{"height":135,"url":"https:\/\/external-preview.redd.it\/Ef8yUA_EHwgpXsZ9CYIeaLJezE8aBRMjtXQRF3IFbLU.jpg?width=108&crop=smart&auto=webp&s=7daaffa01308287fc5049885fee97259383ed6a0","width":108},{"height":270,"url":"https:\/\/external-preview.redd.it\/Ef8yUA_EHwgpXsZ9CYIeaLJezE8aBRMjtXQRF3IFbLU.jpg?width=216&crop=smart&auto=webp&s=821c886a6bb6d2d5ee9e512e65685e6b9034df15","width":216},{"height":400,"url":"https:\/\/external-preview.redd.it\/Ef8yUA_EHwgpXsZ9CYIeaLJezE8aBRMjtXQRF3IFbLU.jpg?width=320&crop=smart&auto=webp&s=3f2a01a19976f8b809b4516c09743a0
c31c2f800","width":320}],"source":{"height":713,"url":"https:\/\/external-preview.redd.it\/Ef8yUA_EHwgpXsZ9CYIeaLJezE8aBRMjtXQRF3IFbLU.jpg?auto=webp&s=91fcda37e4fd9a63c70910de753f4857421742e0","width":570},"variants":{}}]},"pwls":null,"quarantine":false,"removal_reason":null,"retrieved_on":1565755310,"score":1,"secure_media":null,"secure_media_embed":{},"selftext":"","send_replies":false,"spoiler":false,"stickied":false,"subreddit":"yutacustoms","subreddit_id":"t5_4br13","subreddit_name_prefixed":"r\/yutacustoms","subreddit_subscribers":104,"subreddit_type":"public","suggested_sort":null,"thumbnail":"https:\/\/a.thumbs.redditmedia.com\/R9ZVe9Fxk6mGG0-U8wdLBt6XTm1HQQae7LmProxlod8.jpg","thumbnail_height":140,"thumbnail_width":140,"title":"End Eclipse-DarkSouls Art\/\/ original art\/\/ oneofakind\/\/","total_awards_received":0,"url":"https:\/\/www.etsy.com\/ca\/listing\/690795754\/end-eclipse-darksouls-art-original-art?ref=shop_home_active_24&frs=1","whitelist_status":null,"wls":null}
# Read only the small sample archive so Spark can infer a schema from it.
df_schema_retrieve = spark.read.json('/corpus/small_dataset.bz2')

# Serialize the inferred schema to a JSON string and save it to a local file.
sch = df_schema_retrieve.schema.json()

with open('subreddit_schema.json', 'w') as schema_out:
    schema_out.write(sch)

# Read the saved schema back. The handle is deliberately NOT named `f`:
# the aggregation that continues this flow calls f.max(...), so `f` must not
# be rebound to a file object here.
# NOTE(review): presumably `f` is pyspark.sql.functions imported as `f` --
# confirm against the (unseen) import lines.
with open('subreddit_schema.json', 'r') as schema_in:
    reddit_schema = schema_in.read()

# Rebuild the StructType from its JSON representation.
reddit_schema = StructType.fromJson(json.loads(reddit_schema))

# Read the whole corpus directory using the explicit schema instead of
# letting Spark infer it.
# NOTE(review): the schema was inferred from the sample file only; verify that
# it also describes the records in the rest of '/corpus/'.
df = spark.read.json('/corpus/', reddit_schema)

# see the first line of the post to continue the flow