Python 分离JSON中的唯一/重复数据
标签:python, json, dictionary
但我意识到,我无法确定我希望在期望结果中看到的事件出现在哪个域中。
解决此类问题的最佳方法是什么?您可以首先展平字典,得到每个值及其对应的键路径和域。然后,利用键路径和域建立频率表,并据此构建新的结构:
首先,是执行展平和重组的若干函数:
from collections import defaultdict
from itertools import product
# Sample records: three spec dictionaries, each carrying the 'domain' key that
# the frequency tables are grouped by. Two of them have a nested 'power' entry.
data = [
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'silver',
        'power': {'unit': 'kW', 'value': 176},
        'doors': 5,
        'domain': 'google.com',
    },
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'silver',
        'doors': 4,
        'domain': 'facebook',
    },
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'grey',
        'power': {'unit': 'kW', 'value': 200},
        'doors': 5,
        'domain': 'facebook',
    },
]
#get all the paths and domains
def get_paths(data, c = [], d = None):
    """Flatten ``data`` into ``(path, (value, domain))`` pairs, one per leaf.

    Args:
        data: A scalar, a list, or a dict; lists and dicts are descended into.
        c: Path prefix (dict keys / list indices) leading to ``data``. It is
            only ever extended with ``c + [...]``, so the shared default list
            is never modified.
        d: Domain inherited from an enclosing dict, if any.

    Yields:
        ``(tuple_of_keys_and_indices, (leaf_value, domain))``. While
        descending, each dict keeps the inherited domain when it is truthy and
        otherwise falls back to its own ``'domain'`` entry.
    """
    if isinstance(data, list):
        for index, item in enumerate(data):
            yield from get_paths(item, c=c + [index], d=d)
    elif isinstance(data, dict):
        for key, item in data.items():
            yield from get_paths(item, c=c + [key], d=d or data.get('domain'))
    else:
        # Anything that is neither a list nor a dict is treated as a leaf.
        yield (tuple(c), (data, d))
#compute the domain frequencies for the paths
# d1: path -> list of every value observed at that path (one entry per record).
# d2: (*path, value) -> {domain: number of records of that domain with the value}.
d1, d2 = defaultdict(list), defaultdict(dict)
for record in data:
    for path, (value, domain) in get_paths(record):
        if 'domain' in path:
            # Paths that contain the 'domain' key are bookkeeping, not specs.
            continue
        per_domain = d2[(*path, value)]
        per_domain[domain] = per_domain.get(domain, 0) + 1
        d1[path].append(value)
#merge all the unique results
def to_dict(d):
    """Rebuild a nested dict/list structure from ``(path, value)`` pairs.

    Args:
        d: Iterable of ``(path, value)`` pairs where ``path`` is a non-empty
            sequence of dict keys and list indices.

    Returns:
        A list when every first path element is an ``int`` (this includes the
        empty input, which gives ``[]``), otherwise a dict keyed by the first
        path elements.
    """
    grouped = defaultdict(list)
    for path, value in d:
        head, *rest = path
        grouped[head].append((rest, value))

    if all(isinstance(key, int) for key in grouped):
        items = []
        for entries in grouped.values():
            if all(rest for rest, _ in entries):
                # Every entry goes deeper: the list element is itself a container.
                items.append(to_dict(entries))
            else:
                items.extend(value for _, value in entries)
        return items

    rebuilt = {}
    for key, entries in grouped.items():
        # Only the first entry decides whether this key is a leaf or a branch.
        rest, value = entries[0]
        rebuilt[key] = to_dict(entries) if rest else value
    return rebuilt
#get the frequencies for the unique dict
def get_freq(d, c = []):
if isinstance(d, list):
if all(not isinstance(b, (dict, list)) for b in d):
return d2[(*c, 0, d[0])]
return [d2[(*c, i, a)] if not isinstance(a, dict) else get_freq(a, c+[i]) for i, a in enumerate(d)]
return {a:d2[(*c, a, b)] if not isinstance(b, (dict, list)) else get_freq(b, c+[a]) for a, b in d.items()}
#build repeating results
def get_rep(d, f = False):
    """Expand the non-unique paths into individual spec dictionaries.

    Args:
        d: Iterable of ``(path, values)`` pairs; ``path`` is a non-empty
            sequence of keys / indices and ``values`` holds every hashable
            value observed at that path.
        f: ``False`` for the outermost call, ``True`` when expanding the
            contents of a nested container.

    Yields:
        With ``f`` false: ``{key: value}`` for each distinct value of a leaf
        key, and ``{key: nested}`` for each expansion of a nested key.
        With ``f`` true: one dict per combination of the distinct leaf values
        at this level; when sub-branches exist, each expansion of each
        sub-branch is yielded together with that combination.

    Distinct values are taken from a ``set``, so their relative order follows
    set iteration order.
    """
    grouped = defaultdict(list)
    for (head, *rest), values in d:
        grouped[head].append((rest, values))

    if not f:
        for key, entries in grouped.items():
            if all(not rest for rest, _ in entries):
                for _, values in entries:
                    for value in set(values):
                        yield {key: value}
            else:
                for nested in get_rep(entries, True):
                    yield {key: nested}
        return

    # Only the first entry of a group decides whether the key is a leaf.
    leaves = {key: set(entries[0][-1]) for key, entries in grouped.items() if not entries[0][0]}
    branches = {key: entries for key, entries in grouped.items() if key not in leaves}
    for combo in product(*leaves.values()):
        base = dict(zip(leaves, combo))
        for key, entries in branches.items():
            for nested in get_rep(entries, True):
                # Keep the sub-branch under its own key. Merging its contents
                # into this level would drop the key and let a nested leaf
                # overwrite a sibling leaf that has the same name.
                yield {**base, key: nested}
        if not branches:
            yield base
#find all the values in a non unique block
def get_vals(d):
    """Yield every scalar leaf contained in ``d``, depth first.

    Dicts contribute their values (keys are ignored), lists contribute their
    elements, and anything else is yielded as-is.
    """
    if isinstance(d, dict):
        children = d.values()
    elif isinstance(d, list):
        children = d
    else:
        yield d
        return
    for child in children:
        yield from get_vals(child)
#get frequencies for repeated items
def get_freq_rep(d):
    """Look up a frequency table in ``d2`` for each top-level key of ``d``.

    For every ``key: spec`` pair, the result holds the first ``d2`` entry (in
    ``d2`` iteration order) whose key tuple contains ``key`` and at least one
    of the leaf values of ``spec``.

    NOTE(review): because only the first match is used, a nested spec with
    several leaves gets the table of whichever leaf matches first, not a table
    for the combination of its leaves.

    Raises:
        IndexError: if no ``d2`` entry matches one of the keys.
    """
    frequencies = {}
    for key, spec in d.items():
        matches = []
        for path, counts in d2.items():
            if key in path and any(leaf in path for leaf in get_vals(spec)):
                matches.append(counts)
        frequencies[key] = matches[0]
    return frequencies
然后,把它们放在一起:
import json

# Split the observed paths into two groups:
#   u  - paths with a single distinct value that share no key with any path
#        whose values vary; kept as (path, value).
#   u1 - paths whose values vary, or that share a key with such a path; kept
#        as (path, all observed values).
u, u1 = [], []
for path, values in d1.items():
    path_keys = set(path)
    distinct = len(set(values))
    if distinct == 1 and not any(
        path_keys & set(other) and len(set(seen)) != 1 for other, seen in d1.items()
    ):
        u.append((path, values[0]))
    if distinct > 1 or any(
        path_keys & set(other) and len(set(seen)) > 1 for other, seen in d1.items()
    ):
        u1.append((path, values))

rd = to_dict(u)
result = {
    'unique': {
        'specs': rd,
        'frequencies': get_freq(rd),
    },
    'repeating': [
        {'specs': spec, 'frequencies': get_freq_rep(spec)}
        for spec in get_rep(u1)
    ],
}
print(json.dumps(result, indent=4))
输出:
{
"unique": {
"specs": {
"name": "audi",
"date": 1230768000,
"type": "automatic",
"fuel": [
"Diesel"
]
},
"frequencies": {
"name": {
"google.com": 1,
"facebook": 2
},
"date": {
"google.com": 1,
"facebook": 2
},
"type": {
"google.com": 1,
"facebook": 2
},
"fuel": {
"google.com": 1,
"facebook": 2
}
}
},
"repeating": [
{
"specs": {
"color": "silver"
},
"frequencies": {
"color": {
"google.com": 1,
"facebook": 1
}
}
},
{
"specs": {
"color": "grey"
},
"frequencies": {
"color": {
"facebook": 1
}
}
},
{
"specs": {
"power": {
"unit": "kW",
"value": 176
}
},
"frequencies": {
"power": {
"google.com": 1,
"facebook": 1
}
}
},
{
"specs": {
"power": {
"unit": "kW",
"value": 200
}
},
"frequencies": {
"power": {
"google.com": 1,
"facebook": 1
}
}
},
{
"specs": {
"doors": 4
},
"frequencies": {
"doors": {
"facebook": 1
}
}
},
{
"specs": {
"doors": 5
},
"frequencies": {
"doors": {
"google.com": 1,
"facebook": 1
}
}
}
]
}
请注意,`autoplius_lt` 没有出现在上述输出的频率值中,因为第二个示例字典列表中不包含它。
编辑:要删除不需要的重复结果,您可以在把 `u1` 传给 `get_rep` 之前先过滤它的内容:
# Drop every repeating path whose top-level key is one of the excluded metrics.
u1 = [
    (path, values)
    for path, values in u1
    if path[0] not in ('spec_identification_manufacture_date', 'spec_powertrain_power')
]
详细的回答,非常感谢。例如,如果我想从 `repeating` 部分中排除某个指标,我原以为可以使用 `'miss_matched':[{'specs':i,'frequencies':get_freq_rep(i)} for i in get_rep(u1) if u1[0][0][0] not in ('spec_identification_manufacture_date','spec_powertrain_power')]`,但是在推导式中添加这个条件会返回一个空的 `repeating` 部分。
如何排除所选指标?@JonasPalačionis 请查看我最近的编辑。我建议过滤 `u1`(重复块)的内容,而不是在从 `get_rep`