Python 3.x:将CSV转换为嵌套JSON
标签:python-3.x
我的CSV是这样的:
csId,lut,seqId,lvlId,lvlTyp,accSt,enrlDt,ptnrName,ptnrIds
27768303,1561939200,1,G,GAR,10,06-06-2018,Chase,12345
27768303,1561939200,1,G,GAR,10,06-06-2018,Chase,98765
27768303,1561939200,1,G,GAR,10,06-06-2018,fliggy,67890
68537125,1562025600,2,S,SAR,20,11-12-2014,fliggy,98696
但我得到的是:
[{"accSt": "10",
"csId": 27768303,
"enrlDt": "06-06-2018",
"lut": 1561939200,
"lvlId": "G",
"lvlTyp": "GAR",
"ptnrlst": {"ptnrName":"Chase","ptnrIds":12345},
"seqId": 1,
"type": "mber"},
{"accSt": "10",
"csId": 27768303,
"enrlDt": "06-06-2018",
"lut": 1561939200,
"lvlId": "G",
"lvlTyp": "GAR",
"ptnrlst": {"ptnrName":"Chase","ptnrIds":98765},
"seqId": 1,
"type": "mber"},
{ "accSt": "10",
"csId": 27768303,
"enrlDt": "06-06-2018",
"lut": 1561939200,
"lvlId": "G",
"lvlTyp": "GAR",
"ptnrlst": {"ptnrName":"fliggy","ptnrIds":67890},
"seqId": 1,
"type": "mber"},
{ "accSt": "20",
"csId": 68537125,
"enrlDt": "11-12-2014",
"lut": 1562025600,
"lvlId": "S",
"lvlTyp": "SAR",
"ptnrlst": {"ptnrName":"fliggy","ptnrIds":98696},
"seqId": 2,
"type": "mber"}]
我尝试使用以下代码:
from csv import DictReader
from itertools import groupby
from pprint import pprint
import fileinput
import time
def check_for_null_new(allvars):
    """Drop empty ``key:value`` pairs and quote the non-numeric values.

    ``allvars`` is a comma-separated string of ``key:value`` pairs, e.g.
    ``"'ptnrName':Chase,'ptnrIds':12345"``.

    Returns the surviving pairs joined with commas.  A value made only of
    numeric characters is kept bare; any other value is wrapped in single
    quotes.  Pairs whose value is empty, or that contain no colon at all,
    are omitted.  An input with no surviving pair yields ``""``.

    Limitation: a value that itself contains a comma is split into
    separate pairs, because the input format has no escaping.
    """
    kept = []
    for pair in allvars.split(','):
        # partition() splits on the first colon only, so a value such as
        # "12:30" stays whole and a pair without a colon gets an empty value
        # instead of raising IndexError.
        key, _, value = pair.partition(':')
        if not value:
            continue
        if not value.isnumeric():
            value = "'" + value + "'"
        kept.append(key + ":" + value)
    # Joining the kept pairs avoids a leading comma when the first pair is
    # skipped.
    return ','.join(kept)
# Script: read member.csv, group its rows with itertools.groupby, write the
# groups to member.json, then rewrite the quotes in that file in place.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below cannot be read off the text.
start_time = time.time()
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
with open('member.csv',encoding='utf-8-sig') as csvfile:
r1 = DictReader(csvfile, skipinitialspace=True)
data = [dict(d) for d in r1]
groups = []
uniquekeys = []
# The grouping key lists every column, including ptnrName and ptnrIds, so rows
# that differ only in partner data land in separate groups.  That yields one
# output object per CSV row instead of one object with a nested partner list.
# NOTE(review): data is not sorted before groupby, which only merges rows that
# are adjacent.
for k, g in groupby(data, lambda r: ( r['csId'], r['lut'], r['seqId'],r['lvlId'], r['lvlTyp'], r['accSt'], r['enrlDt'],r['ptnrName'],r['ptnrIds'])):
# "ptnrlst" is a set literal holding the single string returned by
# check_for_null_new; it only looks like an object after the quote rewrite
# further down.
groups.append({ "type": "mber",
"csId": int(k[0]),
"lut": int(k[1]),
"seqId": int(k[2]),
"lvlId": k[3],
"lvlTyp": k[4],
"accSt": k[5],
"enrlDt": k[6],
"ptnrlst" : {check_for_null_new("'ptnrName':"+k[7]+",'ptnrIds':"+ k[8])}
})
# Stores the group iterator itself; its rows are never read in this script.
uniquekeys.append(g)
# pprint writes the Python repr of the list, not JSON.
with open('member.json', 'wt') as out:
pprint(groups, stream=out)
# In-place pass over member.json: remove every double quote, then turn each
# single quote into a double quote.  The previous content is kept with a
# '.bak' suffix.
with fileinput.FileInput('member.json', inplace=True, backup='.bak') as file:
for line in file:
print(line.replace("\"", "").replace("'", "\""), end='')
end_time = time.time()
print("CSV to JSON Completed for Member in %s seconds " % (end_time - start_time))
我的预期产出是:
[
{
"type": "mber",
"csId": 27768303,
"lut": 1561939200,
"seqId":1,
"lvlId": "G",
"lvlTyp": "GAR",
"accSt": "10",
"enrlDt": "06-06-2018",
"ptnrlst":[{"ptnrName":"Chase",
"ptnrIds":["12345","98765"]
},
{"ptnrName":"fliggy",
"ptnrIds":["67890"]
}]
},
{
"type": "mber",
"csId": 68537125,
"lut": 1562025600,
"seqId":2,
"lvlId": "S",
"lvlTyp": "SAR",
"accSt": "20",
"enrlDt": "11-12-2014",
"ptnrlst":[{"ptnrName":"fliggy","ptnrIds":["98696"]
}]
}
]
下面专门回答这个问题:读取一个 csv 文件,并输出一个包含嵌套对象的 json 文件。在您的例子中,嵌套对象似乎是 ptnrName 和 ptnrIds。
初始化和读取文件应该很简单,这里假设整个文件可以轻松地加载到内存中:
import csv
import itertools
from operator import itemgetter
# Read the whole CSV into memory as a list of per-row dicts.
# newline='' is what the csv module recommends for files handed to a reader,
# and 'utf-8-sig' keeps a leading byte-order mark from being glued onto the
# first header name ('csId').
with open('members.csv', newline='', encoding='utf-8-sig') as csvfile:
    # skipinitialspace=True ignores blanks that directly follow a delimiter.
    all_ = list(csv.DictReader(csvfile, skipinitialspace=True))
接下来定义使用 itertools.groupby 函数时的分组键,即除上述嵌套对象之外的所有字段。由于 groupby 只会合并相邻的行,分组之前需要先对数据排序:
# Columns that identify one member record, i.e. every field except the nested
# partner columns.
keys = 'csId lut seqId lvlId lvlTyp accSt enrlDt'.split()
# groupby only merges adjacent rows, so sort in place by the member key and
# then by ptnrName; the second key keeps equal partner names contiguous for a
# nested groupby on ptnrName.
all_.sort(key=itemgetter(*(keys + ['ptnrName'])))
下一部分使用两个 groupby 函数来创建嵌套结构:第一个 groupby 按上面的键分组,确定每条记录及其包含的嵌套对象;第二个 groupby 在每个分组内再按 ptnrName 分组。把它们放在一起,你会得到:
import csv
import itertools
import json
from pprint import pprint
from operator import itemgetter
# Read the whole CSV into memory.  newline='' is what the csv module
# recommends, and 'utf-8-sig' keeps a byte-order mark off the first header.
with open('members.csv', newline='', encoding='utf-8-sig') as csvfile:
    all_ = list(csv.DictReader(csvfile, skipinitialspace=True))

# Columns that identify one member record (everything except partner data).
keys = 'csId lut seqId lvlId lvlTyp accSt enrlDt'.split()

# groupby only merges adjacent rows: sort by the member key, then by ptnrName
# so that the nested groupby below also sees equal names side by side.
all_.sort(key=itemgetter(*(keys + ['ptnrName'])))

ds = []
# 1st groupby: one group per distinct member key.
for k, g in itertools.groupby(all_, key=itemgetter(*keys)):
    # Pair each key name with its value.  The original zipped an undefined
    # name `keep` here and raised NameError.
    d = dict(zip(keys, k))
    # Example of converting a field to int; csId and lut can be converted the
    # same way if integers are wanted in the output.
    d['seqId'] = int(d['seqId'])
    # 2nd groupby: within one member, collect the ptnrIds of each ptnrName.
    d['ptnrlst'] = [
        {'ptnrName': k1, 'ptnrIds': [row['ptnrIds'] for row in g1]}
        for k1, g1 in itertools.groupby(g, key=itemgetter('ptnrName'))
    ]
    ds.append(d)
结果与预期一样,还要注意,在使用字典时,键没有排序
[{'accSt': '10',
'csId': '27768303',
'enrlDt': '06-06-2018',
'lut': '1561939200',
'lvlId': 'G',
'lvlTyp': 'GAR',
'ptnrlst': [{'ptnrIds': ['12345', '98765'], 'ptnrName': 'Chase'},
{'ptnrIds': ['67890'], 'ptnrName': 'fliggy'}],
'seqId': 1},
{'accSt': '20',
'csId': '68537125',
'enrlDt': '11-12-2014',
'lut': '1562025600',
'lvlId': 'S',
'lvlTyp': 'SAR',
'ptnrlst': [{'ptnrIds': ['98696'], 'ptnrName': 'fliggy'}],
'seqId': 2}]
最后转储到json:
# Serialize the grouped records and write them to member.json.
serialized = json.dumps(ds)
with open('member.json', 'w') as jsonfile:
    jsonfile.write(serialized)

# Read the file back and compare it with the in-memory structure.
with open('member.json', 'r') as jsonfile:
    jload = json.loads(jsonfile.read())
jload == ds
>>True
谢谢你,伯纳德。但是,all_.sort(key=itemgetter(*(keep))) 会抛出“NameError: name 'keep' is not defined”,语句 d = {key: value for key, value in zip(keep, k)} 也有同样的问题。另外,键 csId、lut 和 seqId 必须是整数,但这里它们是字符串。
已更新:keys 最初名为 keep;我还添加了一行作为示例,在创建原始 dict 之后将字段转换为 int。
没问题,编码愉快!