在Python中将tweet数据解析为csv时出错

在Python中将tweet数据解析为csv时出错,python,json,csv,twitter,tweepy,Python,Json,Csv,Twitter,Tweepy,我目前正在做一个项目,我想提取tweet文本和创建时间,并将这些数据放在csv文件中。我正在分析的文件是包含JSON数据的大型文本文件(~800MB-1.5GB)。我使用了下面的程序来获取这些数据。我已经通过管道将其输出到一个文本文件中 import tweepy as tp import sys import pandas as pd #Variables that contains the user credentials to access Twitter API access_tok

我目前正在做一个项目,我想提取tweet文本和创建时间,并将这些数据放在csv文件中。我正在分析的文件是包含JSON数据的大型文本文件(~800MB-1.5GB)。我使用了下面的程序来获取这些数据。我已经通过管道将其输出到一个文本文件中

import tweepy as tp
import sys
import pandas as pd


#Variables that contains the user credentials to access Twitter API
access_token = "..."
access_token_secret = "..."
consumer_key = "..."
consumer_secret = "..."

tweets_data = []
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(tp.StreamListener):

  def on_data(self, data):
    print (data)

    return True



  def on_error(self, status):
    print (status)


if __name__ == '__main__':

    #This handles Twitter authentication and the connection to Twitter   Streaming API
    l = StdOutListener()
    auth = tp.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = tp.Stream(auth, l)

    stream.filter(track=['Manchester United'])
编辑:这是上述程序的输出示例

{"created_at":"Mon Feb 09 07:58:51 +0000 2015","id":564694906233307137,"id_str":"564694906233307137","text":"RT @ManUtd: Take an alternative look at United's starting line-up today, courtesy of #MUTV. #mufclive\nhttps:\/\/t.co\/m1n1JkgRYq","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":306297595,"id_str":"306297595","name":"Agus Wiratama","screen_name":"KunirKm","location":"Bali","url":null,"description":"girls that are uniqe and beautiful in their own way|| #GGMU #Libra #IG : @Kunirkm","protected":false,"verified":false,"followers_count":176,"friends_count":102,"listed_count":1,"favourites_count":39,"statuses_count":4810,"created_at":"Fri May 27 16:45:02 +0000 2011","utc_offset":-32400,"time_zone":"Alaska","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"A8C7F7","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/561223265025138688\/J3SFBWV4_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/561223265025138688\/J3SFBWV4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/306297595\/1400412027","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sun Feb 08 15:52:42 +0000 2015","id":564451764460474369,"id_str":"564451764460474369","text":"Take an alternative look at United's starting line-up today, courtesy of #MUTV. #mufclive\nhttps:\/\/t.co\/m1n1JkgRYq","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":558797310,"id_str":"558797310","name":"Manchester United","screen_name":"ManUtd","location":"#mufc","url":"http:\/\/www.manutd.com","description":"Official Twitter of Manchester United FC","protected":false,"verified":true,"followers_count":4388116,"friends_count":84,"listed_count":12006,"favourites_count":0,"statuses_count":11840,"created_at":"Fri Apr 20 15:17:43 +0000 2012","utc_offset":0,"time_zone":"Casablanca","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/491881264232677376\/VcPcDO7o.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/491881264232677376\/VcPcDO7o.jpeg","profile_background_tile":false,"profile_link_color":"B30000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/563854496074194947\/p74gErkN_normal.png","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/563854496074194947\/p74gErkN_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/558797310\/1423268331","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1338,"favorite_count":752,"entities":{"hashtags":[{"text":"MUTV","indices":[73,78]},{"text":"mufclive","indices":[80,89]}],"trends":[],"urls":[{"url":"https:\/\/t.co\/m1n1JkgRYq","expanded_url":"https:\/\/amp.twimg.com\/v\/c79db33a-7fa9-4993-be9d-12990ee17b6b","display_url":"amp.twimg.com\/v\/c79db33a-7fa\u2026","indices":[90,113]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"retweet_count":0,"favorite_count":0,"entities":
然后,我尝试读取此文件,以提取使用此程序所需的信息

import simplejson as json
from pandas import DataFrame as df
import time

if __name__ == "__main__":
    tweets_data_path = '/input.txt' #Input file path
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    tweets = df()
    tweets1 = df()
    i=0
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
            i+=1
            print(i)
            if i > 10000:
                i=0
                tweets['CreatedAt'] = [tweet["created_at"] for tweet in tweets_data]
                tweets['text'] = [tweet["text"] for tweet in tweets_data]
                print(tweets_data)
                timestr = time.strftime("%Y%m%d-%H%M%S")
                filename = 'Out' + timestr + '.csv'
                print(filename)
                tweets.to_csv(filename, index=True)
                tweets_data.clear()
                tweets.drop()
                print(tweets)

            #print('Here I am')
        except:
            continue

    try:
        #Creating a new data frame as the old one creates a conflict with the size of the index
        tweets1['CreatedAt'] = [tweet["created_at"] for tweet in tweets_data]
        tweets1['text'] = [tweet["text"] for tweet in tweets_data]
        timestr1 = time.strftime("%Y%m%d-%H%M%S")
        filename1 = 'Out' + timestr1 + '.csv'
        tweets1.to_csv(filename, index= True)
    except:
        print('Excepti')
问题是,当代码在JSON文件的整个长度上继续运行时,我无法保存多个csv文件。我是否在循环或管理异常时出错?
一般来说,我对Python和编程相当陌生。感谢您的帮助。

因为我没有资格证书,所以我很难复制。你能至少粘贴第一个程序的一些输出吗?将输出添加到问题中。你已经编辑了问题。谢谢你的帮助。嗯,你只发了一行,还不完整。我假设您实际上有很多行,并且您创建的文件创建正确。在这种情况下,问题可能是文件名,它只会在一整秒钟后更改(一秒钟对计算机来说太多了)。添加一个
filename=str(i)+filename
或类似的内容,然后重试。不,我认为问题不在于文件名。我可以在控制台中看到打印的数字(I),在再次从0开始之前,它运行了大约30秒。所以我认为这不是问题所在。但我会尝试做点什么改变这一点。