Python 为json文件进行正确的输入_Python

Python 为json文件进行正确的输入

python

Python 为json文件进行正确的输入,python,Python,我尝试从project使用此命令我在mailcurps.json文件中使用这个json数据作为输入 [ { "id":12, "mailing_list_url":"12", "type_of_recipient":"before", "email_address":"test@test.org",

我尝试从project使用此命令

我在 mailcorpus.json 文件中使用下面这段 json 数据作为输入

[
   {
      "id":12,
      "mailing_list_url":"12",
      "type_of_recipient":"before",
      "email_address":"test@test.org",
      "message_body":"Here is one text to test sentiment and feel happy",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"before",
      "email_address":"test@gmail.com",
      "message_body":"Here is one text to test sentiment and feel happy and feel fine",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"after",
      "email_address":"test@gmail.com",
      "message_body":"Not feel so good for this code",
      "is_response_of":"after"
   }
]

该命令的第一部分是此代码

import rpy2.robjects as robjects
from bs4 import BeautifulSoup as BS4
from rpy2.robjects.packages import importr
import json
from email_reply_parser import EmailReplyParser

'''
NLoN training
'''
def training_nlon():
    """Load the NLoN R package and build a model from the training data file.

    The R ``load`` function is called on ``data/training_data.rda``; the R
    objects named ``text`` and ``rater`` are then handed to ``NLoNModel``
    (presumably both are provided by that .rda file -- confirm against the
    data set).

    Returns:
        A ``(nlon, model)`` tuple: the imported NLoN package handle and the
        object returned by ``NLoNModel``.
    """
    nlon_package = importr('NLoN')

    # Path to NLoN training data
    r_load = robjects.r['load']
    r_load('data/training_data.rda')

    training_text = robjects.r['text']
    training_rater = robjects.r['rater']
    model = nlon_package.NLoNModel(training_text, training_rater)
    return nlon_package, model
'''
Gets mail corpus from email addresses
'''
def get_mail_corpus(nlon_cleaning=False):
    """Group cleaned sender message bodies by e-mail address.

    Reads the JSON corpus at ``data/mailcorpus.json`` (a list of objects).
    Only records whose ``type_of_recipient`` is exactly ``'From'`` are
    processed; every other record is skipped. For each processed record the
    literal two-character sequence ``\\n`` in ``message_body`` is turned into
    a real newline and ``EmailReplyParser`` extracts the reply part.

    Args:
        nlon_cleaning: when true, the reply is additionally passed through
            BeautifulSoup to drop HTML markup, and every line for which the
            NLoN model predicts ``'Not'`` is removed.

    Returns:
        A dict mapping each ``email_address`` to the list of its non-empty
        cleaned texts, in corpus order.
    """
    if nlon_cleaning:
        nlon, nlon_model = training_nlon()

    # Path to mail's corpus. JSON text is UTF-8 by specification, so do not
    # depend on the platform's locale encoding.
    corpus_file = 'data/mailcorpus.json'
    with open(corpus_file, encoding='utf-8') as data_file:
        corpus = json.load(data_file)

    total = len(corpus)
    print('Reading and cleaning emails corpus. Number of emails: ' + str(total))
    mails_by_address = {}
    n = 0
    # Text cleaning
    for d in corpus:
        if d['type_of_recipient'] != 'From':
            continue
        n += 1

        res = EmailReplyParser.read(d['message_body'].replace('\\n', '\n'))
        text = res.reply

        if nlon_cleaning:
            try:
                clean_message_body = BS4(text, 'html.parser').text
            except Exception as e:
                print('Error with BS4 on text:\n\n%s\n\n' % text, str(e))
                clean_message_body = text.strip()
            # Filter the HTML-stripped text (the raw reply used to be split
            # here, which silently threw the BeautifulSoup result away).
            kept_lines = [
                line
                for line in clean_message_body.splitlines()
                if nlon.NLoNPredict(nlon_model, robjects.StrVector([line]))[0] != 'Not'
            ]
            text = '\n'.join(kept_lines)

        if text != '':
            mails_by_address.setdefault(d['email_address'], []).append(text)

        # Report progress once per 50 processed mails; skipped records no
        # longer re-print the same counter (including "0/<total>").
        if n % 50 == 0:
            print(str(n) + '/' + str(total))

    print('Mails retrieved: ' + str(n))
    print('Email addresses: ' + str(len(mails_by_address)))
    return mails_by_address

它可以从github存储库链接获得，但我不拥有它

第二部分是以下代码：

import MailCorpus as mc
import sys, getopt, os
import csv

def main(argv):
    """Write one text file per author (Apache) or per CSV row (LIWC).

    Args:
        argv: command-line arguments without the program name. Recognised
            options:

            * ``-i`` / ``--inputdataset``: ``Apache`` or ``LIWC``
            * ``-o`` / ``--outputdir``: directory the files are written to
            * ``-p`` / ``--datasetpath``: CSV file to read (needed for LIWC)
            * ``-nlon`` (also ``-n`` / ``--nlon``): enable NLoN cleaning
            * ``-h`` / ``--help``: print the usage line and exit

    Raises:
        SystemExit: status 2 on any usage error, status 0 after ``--help``.
    """
    usage = 'create_text_folder.py -i Apache|LIWC [-nlon] [-p <dataset_path>] -o <output_dir>'
    input_dataset = ''
    output_dir = ''
    dataset_path = ''

    # getopt has no multi-letter short options: a "-nlon" token would be read
    # as "-n -l -o n" and silently replace the output directory with "n".
    # Take the documented flag out of the argument list before parsing.
    remaining = [token for token in argv if token != '-nlon']
    nlon_cleaning = len(remaining) != len(argv)

    try:
        opts, _ = getopt.getopt(
            remaining,
            "hi:o:p:n",
            ["help", "nlon", "inputdataset=", "outputdir=", "datasetpath="],
        )
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-n", "--nlon"):
            nlon_cleaning = True
        elif opt in ("-i", "--inputdataset"):
            input_dataset = arg
        elif opt in ("-o", "--outputdir"):
            output_dir = arg
        elif opt in ("-p", "--datasetpath"):
            dataset_path = arg

    print('Dataset: ' + str(input_dataset))
    print('NLoN: ' + str(nlon_cleaning))
    print('Dataset path: ' + dataset_path)
    print('Output directory: ' + output_dir)

    # Apache reads its own fixed corpus; anything else needs a dataset path
    # and must be the LIWC data set.
    is_apache = input_dataset == 'Apache'
    is_liwc = input_dataset == 'LIWC' and dataset_path != ''
    if not (is_apache or is_liwc):
        print('Wrong input dataset')
        print(usage)
        sys.exit(2)

    # Fail with a usage message instead of crashing in os.makedirs('').
    if output_dir == '':
        print('Missing output directory')
        print(usage)
        sys.exit(2)

    if is_apache:
        corpus = mc.get_mail_corpus(nlon_cleaning)
        texts = {address: '\n'.join(mails) for address, mails in corpus.items()}
        suffix = '.txt'
    else:
        # Path to liwc gold standard
        # header is 'ID,text,cEXT,cNEU,cAGR,cCON,cOPN'
        # NOTE(review): a header row, if present, is exported like any other
        # row (a file named after its first column) -- confirm this is wanted.
        with open(dataset_path, encoding='cp1252', newline='') as csv_file:
            texts = {
                row[0]: row[1]
                for row in csv.reader(csv_file, delimiter=',')
                if len(row) >= 2  # blank or truncated lines have no text column
            }
        suffix = ''

    os.makedirs(output_dir, exist_ok=True)
    for name, text in texts.items():
        with open(output_dir + '/' + str(name) + suffix, "w") as text_file:
            print(text, file=text_file)


if __name__ == "__main__":
    main(sys.argv[1:])

我认为json格式有点问题，因为它知道有3个对象，但没有一个对象将进入进程，并且在output_dir中没有任何文件

我能做些什么？

我发现您的json中有一些小错误，并对它们进行了更正，这是您可以使用的结果：

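[
   { "id":12, "mailing_list_url":"12", "type_of_recipient":"before", "email_address":"test@test.org", "message_body":"Here is one text to test sentiment and feel happy", "is_response_of":"before" },
   { "id":21, "mailing_list_url":"21", "type_of_recipient":"before", "email_address":"test@gmail.com", "message_body":"Here is one text to test sentiment and feel happy and feel fine", "is_response_of":"before" },
   { "id":21, "mailing_list_url":"21", "type_of_recipient":"after", "email_address":"test@gmail.com", "message_body":"Not feel so good for this code", "is_response_of":"after" }
]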

如果你的代码仍然不能完成你想做的事情，你可以在下面添加一条评论，然后我会再回来看看你的代码。

干杯

我看到您的json中有一些小错误，请更正它们，这是您可以使用的结果：

如果你的代码仍然不能完成你想做的事情，你可以在下面添加一条评论，然后我会再回来看看你的代码。干杯

从您的代码中可以看出，每条记录的 type_of_recipient 必须是 "From"，该记录才会被处理（代码中的判断是 d['type_of_recipient'] == 'From'）。试试这个：

[
   {
      "id":12,
      "mailing_list_url":"12",
      "type_of_recipient":"From",
      "email_address":"test@test.org",
      "message_body":"Here is one text to test sentiment and feel happy",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"From",
      "email_address":"test@gmail.com",
      "message_body":"Here is one text to test sentiment and feel happy and feel fine",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"From",
      "email_address":"test@gmail.com",
      "message_body":"Not feel so good for this code",
      "is_response_of":"after"
   }
]

从您的代码中可以看出，dict 中的 type_of_recipient 必须是 "From"，该记录才会被处理。试试这个：

[
   {
      "id":12,
      "mailing_list_url":"12",
      "type_of_recipient":"From",
      "email_address":"test@test.org",
      "message_body":"Here is one text to test sentiment and feel happy",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"From",
      "email_address":"test@gmail.com",
      "message_body":"Here is one text to test sentiment and feel happy and feel fine",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"From",
      "email_address":"test@gmail.com",
      "message_body":"Not feel so good for this code",
      "is_response_of":"after"
   }
]

复制json并在上更正格式，您可以提前使用它。谢谢。我尝试了您的更正，但输出仍然是相同的复制json并更正格式，您可以提前使用。谢谢。我试过你的修正，但输出还是一样的

[
   {
      "id":12,
      "mailing_list_url":"12",
      "type_of_recipient":"From",
      "email_address":"test@test.org",
      "message_body":"Here is one text to test sentiment and feel happy",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"From",
      "email_address":"test@gmail.com",
      "message_body":"Here is one text to test sentiment and feel happy and feel fine",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"From",
      "email_address":"test@gmail.com",
      "message_body":"Not feel so good for this code",
      "is_response_of":"after"
   }
]