python3中的编码问题
我正在学习sentdex关于使用reddit注释制作聊天机器人的教程,并下载了一个月的注释作为.bz2文件 当我运行下面链接的代码时,我得到了错误python3中的编码问题,python,python-3.x,sqlite,encode,chatbot,Python,Python 3.x,Sqlite,Encode,Chatbot,我正在学习sentdex关于使用reddit注释制作聊天机器人的教程,并下载了一个月的注释作为.bz2文件 当我运行下面链接的代码时,我得到了错误 Traceback (most recent call last): File "/Users/my_user/Documents/ok.py", line 94, in <module> for row in x: File "/Library/Frameworks/Python.framework/Versions/3.8/l
Traceback (most recent call last):
File "/Users/my_user/Documents/ok.py", line 94, in <module>
for row in x:
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/codecs.py", line 322,
in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd9 in position 13: invalid continuation
byte
我的代码如下:
import sqlite3
import json
import datetime
timeframe = '2015-01'
sql_transaction = []
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table():
c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
def format_data(data):
data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
return data
def transaction_bldr(sql):
global sql_transaction
sql_transaction.append(sql)
if len(sql_transaction) > 1000:
c.execute('BEGIN TRANSACTION')
for s in sql_transaction:
try:
c.execute(s)
except:
pass
connection.commit()
sql_transaction = []
def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',e)
def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',e)
def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
try:
sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',e)
def acceptable(data):
if len(data.split(' ')) > 50 or len(data) < 1:
return False
elif len(data) > 1000:
return False
elif data == '[deleted]':
return False
elif data == '[removed]':
return False
else:
return True
def find_parent(pid):
try:
sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
def find_existing_score(pid):
try:
sql = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
if __name__ == '__main__':
create_table()
row_counter = 0
paired_rows = 0
with open("/Users/my_user/Downloads/reddit_data/{}/RC_{}.bz2".format(timeframe.split('-')[0],timeframe), buffering=1000) as x:
for row in x:
row_counter += 1
row = json.loads(row)
parent_id = row['parent_id']
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
comment_id = row['name']
subreddit = row['subreddit']
parent_data = find_parent(parent_id)
if score >= 2:
existing_comment_score = find_existing_score(parent_id)
if existing_comment_score:
if score > existing_comment_score:
if acceptable(body):
sql_insert_replace_comment(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
else:
if acceptable(body):
if parent_data:
sql_insert_has_parent(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
paired_rows += 1
else:
sql_insert_no_parent(comment_id,parent_id,body,subreddit,created_utc,score)
if row_counter % 100000 == 0:
print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))
谢谢你的时间
顺便说一句,如果我问了一个愚蠢的问题,我对sqlite和json非常陌生,您试图将bz2压缩文件读取为文本文件
import bz2
if __name__ == '__main__':
create_table()
paired_rows = 0
with bz2.open("/Users/my_user/Downloads/reddit_data/{}/RC_{}.bz2".format(timeframe.split('-')[0],timeframe)) as rows:
for row_counter, row in enumerate(rows, 1):
row = json.loads(row)
...
但这不是唯一的问题。不要将参数格式化为SQL语句。例如:
def find_parent(pid):
sql = "SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1"
c.execute(sql, [pid])
result = c.fetchone()
if result is not None:
return result[0]
return False
但是函数应该有一个更好的名称,比如fetch_comment,如果没有注释,它不应该返回False,没有更好的。注释id或父id是否正确
总之,这应该是这样的:
import sqlite3
import json
import datetime
import bz2
TIMEFRAME = '2015-01'
def create_table(cursor):
cursor.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
def format_data(data):
return data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
def sql_insert_replace_comment(cursor, commentid,parentid,parent,comment,subreddit,time,score):
cursor.execute("""UPDATE parent_reply SET comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""",
[commentid, parent, comment, subreddit, int(time), score, parentid])
def sql_insert(cursor, commentid,parentid,parent,comment,subreddit,time,score):
cursor.execute("""INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES (?,?,?,?,?,?,?);""",
[parentid, commentid, parent, comment, subreddit, int(time), score])
def acceptable(data):
return (
len(data.split(' ')) <= 50
and 1 <= len(data) <= 1000
and data not in ('[deleted]','[removed]')
)
def fetch_comment(cursor, pid):
# TODO: is this comment or parent data?
cursor.execute("SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1", [pid])
result = cursor.fetchone()
return result and result[0]
def fetch_score(cursor, pid):
cursor.execute("SELECT score FROM parent_reply WHERE parent_id = ? LIMIT 1", [pid])
result = cursor.fetchone()
return result and result[0]
def main():
connection = sqlite3.connect('{}.db'.format(TIMEFRAME))
cursor = connection.cursor()
create_table(cursor)
paired_rows = 0
with bz2.open("/Users/my_user/Downloads/reddit_data/{}/RC_{}.bz2".format(timeframe.split('-')[0],timeframe)) as rows:
for row_counter, row in enumerate(rows, 1):
row = json.loads(row)
parent_id = row['parent_id']
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
comment_id = row['name']
subreddit = row['subreddit']
parent_data = fetch_comment(cursor, parent_id) # or is this comment?
if score >= 2 and acceptable(body):
existing_comment_score = fetch_score(cursor, parent_id)
if existing_comment_score is not None:
if score > existing_comment_score:
sql_insert_replace_comment(cursor, comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
else:
if parent_data:
paired_rows += 1
sql_insert(cursor, comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
if row_counter % 100000 == 0:
print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))
if __name__ == '__main__':
main()
这是一个5gb的文件,所以我不认为我可以发送到这里。我如何纠正它以这种方式编码?您需要将正确的编码传递给open函数:opensomething.bz2,encoding=XXX,在这里替换XXX。默认情况下,它是utf-8,但reddit数据转储似乎在使用其他内容。您正在尝试将压缩的bz2文件读取为文本文件。我有一个关于函数fetch_score和fetch_comment的快速问题:它们都使用return result和result[0]。你能解释一下那句话背后的逻辑吗?据我所知,这是多余的,任何一个都做同样的工作。