Python:如何为程序提供格式正确的 JSON 输入文件
我尝试运行某个项目中的一条命令,并在 mailcurps.json 文件(代码实际读取的是 data/mailcorpus.json)中使用下面这段 JSON 数据作为输入:
[
{
"id":12,
"mailing_list_url":"12",
"type_of_recipient":"before",
"email_address":"test@test.org",
"message_body":"Here is one text to test sentiment and feel happy",
"is_response_of":"before"
},
{
"id":21,
"mailing_list_url":"21",
"type_of_recipient":"before",
"email_address":"test@gmail.com",
"message_body":"Here is one text to test sentiment and feel happy and feel fine",
"is_response_of":"before"
},
{
"id":21,
"mailing_list_url":"21",
"type_of_recipient":"after",
"email_address":"test@gmail.com",
"message_body":"Not feel so good for this code",
"is_response_of":"after"
}
]
该命令所涉及的第一部分代码如下:
import rpy2.robjects as robjects
from bs4 import BeautifulSoup as BS4
from rpy2.robjects.packages import importr
import json
from email_reply_parser import EmailReplyParser
'''
NLoN training
'''
def training_nlon():
    """Import the NLoN R package and build a model from the training data.

    Loads ``data/training_data.rda`` into the embedded R session and passes
    the R objects named ``text`` and ``rater`` to ``NLoNModel``.

    Returns:
        A ``(nlon_package, nlon_model)`` tuple.
    """
    nlon_package = importr('NLoN')
    # Path to NLoN training data
    load_rdata = robjects.r['load']
    load_rdata('data/training_data.rda')
    nlon_model = nlon_package.NLoNModel(robjects.r['text'], robjects.r['rater'])
    return nlon_package, nlon_model
'''
Gets mail corpus from email addresses
'''
def get_mail_corpus(nlon_cleaning=False, corpus_file='data/mailcorpus.json'):
    """Group cleaned e-mail bodies from a JSON mail corpus by e-mail address.

    The corpus is a JSON array of objects. Records whose
    ``type_of_recipient`` is not exactly ``'From'`` are skipped entirely, so
    a corpus without any ``'From'`` record yields an empty result.

    Args:
        nlon_cleaning: When true, strip HTML markup with BeautifulSoup and
            drop every line that the NLoN model labels ``'Not'``.
        corpus_file: Path of the JSON corpus to read. Defaults to the
            location the original code used.

    Returns:
        A dict mapping each ``email_address`` to the list of its non-empty
        cleaned message texts, in corpus order.
    """
    if nlon_cleaning:
        nlon, nlon_model = training_nlon()

    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(corpus_file, encoding='utf-8') as data_file:
        corpus = json.load(data_file)

    total = len(corpus)
    print('Reading and cleaning emails corpus. Number of emails: ' + str(total))
    mails_by_address = {}
    n = 0
    # Text cleaning
    for d in corpus:
        if d['type_of_recipient'] != 'From':
            continue

        # Turn literal backslash-n sequences into real newlines, then keep
        # only the ``reply`` part returned by EmailReplyParser.
        parsed = EmailReplyParser.read(d['message_body'].replace('\\n', '\n'))
        text = parsed.reply
        n += 1

        if nlon_cleaning:
            try:
                clean_message_body = BS4(text, 'html.parser').text
            except Exception as e:
                # Best effort: fall back to the raw text if parsing fails.
                print('Error with BS4 on text:\n\n%s\n\n' % text, str(e))
                clean_message_body = text.strip()
            # Filter the HTML-stripped text line by line. The previous code
            # split the raw ``text`` here, which threw away the BeautifulSoup
            # result computed just above.
            kept_lines = [
                line
                for line in clean_message_body.splitlines()
                if nlon.NLoNPredict(nlon_model, robjects.StrVector([line]))[0] != 'Not'
            ]
            text = '\n'.join(kept_lines)

        if text != '':
            mails_by_address.setdefault(d['email_address'], []).append(text)

        # Progress line every 50 processed mails.
        if n % 50 == 0:
            print(str(n) + '/' + str(total) + '\n', end='')

    print('Mails retrieved: ' + str(n))
    print('Email addresses: ' + str(len(mails_by_address)))
    return mails_by_address
这段代码可以从一个 GitHub 仓库获得,但该仓库并不归我所有。
第二部分是以下代码:
import MailCorpus as mc
import sys, getopt, os
import csv
def main(argv):
    """Write one text file per dataset entry into an output directory.

    Supported datasets (``-i`` / ``--inputdataset``):

    * ``Apache``: texts come from ``mc.get_mail_corpus``; each key is written
      to ``<output_dir>/<key>.txt`` with its messages joined by newlines.
    * ``LIWC``: rows are read from the CSV at ``-p`` / ``--datasetpath``
      (decoded as cp1252); column 0 is the file name and column 1 the text.

    Other options: ``-o`` / ``--outputdir`` (required), ``-nlon`` (also
    accepted as ``--nlon`` or ``-n``) to enable NLoN cleaning, and ``-h`` /
    ``--help``.

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: status 2 on invalid arguments, status 0 after ``-h``.
    """
    usage = 'create_text_folder.py -i Apache|LIWC [-nlon] [-p <dataset_path>] -o <output_dir>'
    input_dataset = ''
    output_dir = ''
    dataset_path = ''
    nlon_cleaning = False

    # getopt has no multi-character short options: a literal "-nlon" used to
    # be read as the cluster -n -l -o with "n" as the value of -o, silently
    # setting the output directory to "n". Map it to a real long option.
    argv = ['--nlon' if token == '-nlon' else token for token in argv]
    try:
        opts, _ = getopt.getopt(
            argv,
            "hi:o:p:n",
            ["help", "nlon", "inputdataset=", "outputdir=", "datasetpath="],
        )
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-n", "--nlon"):
            nlon_cleaning = True
        elif opt in ("-i", "--inputdataset"):
            input_dataset = arg
        elif opt in ("-o", "--outputdir"):
            output_dir = arg
        elif opt in ("-p", "--datasetpath"):
            dataset_path = arg

    print('Dataset: ' + str(input_dataset))
    print('NLoN: ' + str(nlon_cleaning))
    print('Dataset path: ' + dataset_path)
    print('Output directory: ' + output_dir)

    # Only 'Apache', or 'LIWC' together with a dataset path, is accepted.
    is_apache = input_dataset == 'Apache'
    is_liwc = input_dataset == 'LIWC' and dataset_path != ''
    if not (is_apache or is_liwc):
        print('Wrong input dataset')
        print(usage)
        sys.exit(2)

    # Fail early instead of crashing in os.makedirs('') after the work is done.
    if output_dir == '':
        print('Missing output directory')
        print(usage)
        sys.exit(2)

    if is_apache:
        corpus = mc.get_mail_corpus(nlon_cleaning)
        files = {
            str(address) + '.txt': '\n'.join(messages)
            for address, messages in corpus.items()
        }
    else:
        # Path to liwc gold standard
        # header is 'ID,text,cEXT,cNEU,cAGR,cCON,cOPN'
        with open(dataset_path, encoding='cp1252') as csv_file:
            # Rows with fewer than two columns (e.g. blank lines) are skipped.
            files = {
                row[0]: row[1]
                for row in csv.reader(csv_file, delimiter=',')
                if len(row) >= 2
            }

    os.makedirs(output_dir, exist_ok=True)
    for file_name, content in files.items():
        with open(os.path.join(output_dir, file_name), "w") as text_file:
            print(content, file=text_file)
if __name__ == "__main__":
    # Script entry point: forward the command-line arguments, without the
    # program name, to main().
    cli_arguments = sys.argv[1:]
    main(cli_arguments)
我认为 JSON 的格式有些问题:程序能识别出文件中有 3 个对象,但没有任何一个对象被处理,output_dir 中也没有生成任何文件。
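我该怎么办?

回答:我发现您的 JSON 中有一些小错误,并做了更正,这是您可以使用的结果:
[ { "id":12, "mailing_list_url":"12", "type_of_recipient":"before", "email_address":"test@test.org", "message_body":"Here is one text to test sentiment and feel happy", "is_response_of":"before" }, { "id":21, "mailing_list_url":"21", "type_of_recipient":"before", "email_address":"test@gmail.com", "message_body":"Here is one text to test sentiment and feel happy and feel fine", "is_response_of":"before" }, { "id":21, "mailing_list_url":"21", "type_of_recipient":"after", "email_address":"test@gmail.com", "message_body":"Not feel so good for this code", "is_response_of":"after" } ]
如果您的代码仍然不能达到预期效果,可以在下面留言,我会再回来查看您的代码。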
祝好!

另一个回答:从您的代码来看,只有 type_of_recipient 的值为 "From" 的记录才会被处理(代码中的判断是 d['type_of_recipient'] == 'From')。请试试这个:
[
{
"id":12,
"mailing_list_url":"12",
"type_of_recipient":"From",
"email_address":"test@test.org",
"message_body":"Here is one text to test sentiment and feel happy",
"is_response_of":"before"
},
{
"id":21,
"mailing_list_url":"21",
"type_of_recipient":"From",
"email_address":"test@gmail.com",
"message_body":"Here is one text to test sentiment and feel happy and feel fine",
"is_response_of":"before"
},
{
"id":21,
"mailing_list_url":"21",
"type_of_recipient":"From",
"email_address":"test@gmail.com",
"message_body":"Not feel so good for this code",
"is_response_of":"after"
}
]
从您的代码来看,字典中 type_of_recipient(收件人类型)的值应为 "From",该记录才会被处理。请试试这个:
[
{
"id":12,
"mailing_list_url":"12",
"type_of_recipient":"From",
"email_address":"test@test.org",
"message_body":"Here is one text to test sentiment and feel happy",
"is_response_of":"before"
},
{
"id":21,
"mailing_list_url":"21",
"type_of_recipient":"From",
"email_address":"test@gmail.com",
"message_body":"Here is one text to test sentiment and feel happy and feel fine",
"is_response_of":"before"
},
{
"id":21,
"mailing_list_url":"21",
"type_of_recipient":"From",
"email_address":"test@gmail.com",
"message_body":"Not feel so good for this code",
"is_response_of":"after"
}
]
评论:请复制该 JSON,并在在线 JSON 校验工具(原文中的链接已丢失)上校正格式后再使用。先谢谢了。
评论:我尝试了您的更正,但输出仍然相同。
[
{
"id":12,
"mailing_list_url":"12",
"type_of_recipient":"From",
"email_address":"test@test.org",
"message_body":"Here is one text to test sentiment and feel happy",
"is_response_of":"before"
},
{
"id":21,
"mailing_list_url":"21",
"type_of_recipient":"From",
"email_address":"test@gmail.com",
"message_body":"Here is one text to test sentiment and feel happy and feel fine",
"is_response_of":"before"
},
{
"id":21,
"mailing_list_url":"21",
"type_of_recipient":"From",
"email_address":"test@gmail.com",
"message_body":"Not feel so good for this code",
"is_response_of":"after"
}
]