Python 分离JSON中的唯一/重复数据_Python_Json_Dictionary

Python 分离JSON中的唯一/重复数据

python json dictionary

Python 分离JSON中的唯一/重复数据,python,json,dictionary,Python,Json,Dictionary,但我意识到，我无法确定我在哪个域中看到了我希望在期望结果中看到的事件解决此类问题的最佳方法是什么？您可以首先展平字典，以获得与其关键路径和域关联的每个值。然后，使用密钥路径和域，可以创建频率表，从中可以创建新结构：首先，将执行展平和重组的许多功能： from collections import defaultdict from itertools import product data = [{'name': 'audi', 'date': 1230768000, 'type': 'aut

但我意识到，我无法确定某个值是在哪个域（domain）中出现的，而这正是我希望在期望结果中看到的信息。

解决此类问题的最佳方法是什么？
您可以首先展平字典，以获得每个值及其关联的键路径和域。然后，利用键路径和域创建频率表，再据此构建新的结构：
首先，定义若干用于执行展平和重组的函数：
from collections import defaultdict
from itertools import product
# Sample input: three 'audi' records, each tagged with the domain it came from.
data = [
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'silver',
        'power': {'unit': 'kW', 'value': 176},
        'doors': 5,
        'domain': 'google.com',
    },
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'silver',
        'doors': 4,
        'domain': 'facebook',
    },
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'grey',
        'power': {'unit': 'kW', 'value': 200},
        'doors': 5,
        'domain': 'facebook',
    },
]
#get all the paths and domains
def get_paths(data, c=None, d=None):
    """Flatten nested dicts/lists into ``(path, (leaf, domain))`` pairs.

    Args:
        data: A dict, a list, or a leaf value (anything that is neither).
        c: Path prefix accumulated so far, as a list of dict keys and list
            indices. ``None`` (the default) means an empty prefix. The list
            passed in is never mutated.
        d: Domain inherited from an enclosing dict, or ``None`` if none has
            been found yet.

    Yields:
        ``(tuple(path), (leaf, domain))`` for every leaf, in iteration order
        of the containers. Empty dicts and lists yield nothing.
    """
    # ``None`` instead of a shared ``[]`` default avoids the mutable-default pitfall.
    path = [] if c is None else c
    if isinstance(data, list):
        for index, item in enumerate(data):
            yield from get_paths(item, c=path + [index], d=d)
    elif isinstance(data, dict):
        # An already-inherited (truthy) domain wins; otherwise this dict's
        # own 'domain' entry, if any, applies to everything beneath it.
        domain = d or data.get('domain')
        for key, item in data.items():
            yield from get_paths(item, c=path + [key], d=domain)
    else:
        yield (tuple(path), (data, d))

#compute the domain frequencies for the paths
# d1: path -> every value seen at that path, in encounter order
# d2: path + (value,) -> {domain: number of occurrences}
d1, d2 = defaultdict(list), defaultdict(dict)
for record in data:
    for path, (value, domain) in get_paths(record):
        if 'domain' in path:
            # paths containing a 'domain' element are not counted
            continue
        counts = d2[(*path, value)]
        counts[domain] = counts.get(domain, 0) + 1
        d1[path].append(value)

#merge all the unique results
def to_dict(d):
    """Rebuild a nested dict/list structure from ``(path, value)`` pairs.

    Args:
        d: Iterable of ``(path, value)`` pairs, where each path is a
            non-empty sequence of dict keys and/or integer list indices.

    Returns:
        A list when every first path element is an ``int``, otherwise a
        dict keyed by the first path elements. Empty input returns ``{}``.
    """
    grouped = defaultdict(list)
    for (head, *rest), value in d:
        grouped[head].append((rest, value))

    if not grouped:
        # all() over nothing is vacuously true, which used to turn empty
        # input into a list; an empty mapping is the consistent result.
        return {}

    if all(isinstance(key, int) for key in grouped):
        merged = []
        for entries in grouped.values():
            if all(rest for rest, _ in entries):
                # every entry continues deeper: rebuild the nested element
                merged.append(to_dict(entries))
            else:
                merged.extend(value for _, value in entries)
        return merged

    return {
        # an empty remaining path on the first entry marks a leaf value
        key: to_dict(entries) if entries[0][0] else entries[0][-1]
        for key, entries in grouped.items()
    }

#get the frequencies for the unique dict
def get_freq(d, c=None):
    """Mirror a nested structure, replacing each leaf by its ``d2`` entry.

    Args:
        d: A dict or list whose leaves are looked up in the module-level
            ``d2`` table under the key ``(*path, leaf)``.
        c: Path prefix accumulated so far. ``None`` (the default) means an
            empty prefix. The list passed in is never mutated.

    Returns:
        For a dict, a dict with the same keys. For a list made only of
        leaves, the single ``d2`` entry of its first element. For any other
        list, a list with one result per element. An empty list returns
        ``{}``.
    """
    # ``None`` instead of a shared ``[]`` default avoids the mutable-default pitfall.
    prefix = [] if c is None else c
    if isinstance(d, list):
        if not d:
            # nothing to look up; indexing d[0] below would raise
            return {}
        if all(not isinstance(item, (dict, list)) for item in d):
            # a flat list of leaves is represented by its first element's entry
            return d2[(*prefix, 0, d[0])]
        return [
            # recurse into nested lists as well as dicts: a list cannot be
            # part of a (hashable) d2 key
            get_freq(item, prefix + [index]) if isinstance(item, (dict, list))
            else d2[(*prefix, index, item)]
            for index, item in enumerate(d)
        ]
    return {
        key: get_freq(value, prefix + [key]) if isinstance(value, (dict, list))
        else d2[(*prefix, key, value)]
        for key, value in d.items()
    }

#build repeating results
def get_rep(d, f = False):
    """Yield spec dicts built from ``(path, values)`` pairs.

    Entries are grouped by the first element of their path.

    With ``f`` falsy (the outer call): a group whose entries all have no
    remaining path yields ``{key: value}`` for each distinct value of each
    entry; any other group yields ``{key: nested}`` for every dict produced
    by the nested mode on that group.

    With ``f`` truthy (nested mode): keys whose first entry has no remaining
    path are leaves. For each combination in the cartesian product of the
    leaves' distinct values, every non-leaf key yields the combination merged
    with each dict of its own nested results; if there is no non-leaf key the
    combination itself is yielded.

    Distinct values are taken via ``set``, so their order follows set
    iteration order.
    """
    groups = defaultdict(list)
    for (head, *tail), values in d:
        groups[head].append((tail, values))

    if f:
        leaves = {}
        for key, entries in groups.items():
            first_tail, first_values = entries[0][0], entries[0][-1]
            if not first_tail:
                leaves[key] = set(first_values)

        for combo in product(*leaves.values()):
            only_leaves = True
            for key, entries in groups.items():
                if key in leaves:
                    continue
                only_leaves = False
                for nested in get_rep(entries, True):
                    # NOTE(review): nested results are merged flat into the
                    # combination rather than stored under `key` - confirm
                    # this is intended for structures nested more than one level.
                    yield {**dict(zip(leaves.keys(), combo)), **nested}
            if only_leaves:
                yield dict(zip(leaves.keys(), combo))
        return

    for key, entries in groups.items():
        if any(tail for tail, _ in entries):
            for nested in get_rep(entries, True):
                yield {key: nested}
        else:
            for _, values in entries:
                for value in set(values):
                    yield {key: value}


#find all the values in a non unique block
def get_vals(d):
    """Yield every leaf value nested inside *d*, depth first.

    A value that is neither a list nor a dict is yielded as is. For a
    container, its children are walked in iteration order: the values of an
    object exposing ``values`` (a dict), otherwise the elements themselves.
    """
    if not isinstance(d, (list, dict)):
        yield d
        return
    children = d.values() if hasattr(d, 'values') else d
    for child in children:
        for leaf in get_vals(child):
            yield leaf
   
#get frequencies for repeated items
def get_freq_rep(d):
    """Map each key of *d* to the first matching entry of the ``d2`` table.

    A ``d2`` key tuple matches when it contains the spec key and also
    contains at least one leaf value of that key's block (as produced by
    ``get_vals``). Raises ``IndexError`` if nothing matches.
    """
    frequencies = {}
    for key, block in d.items():
        matches = []
        for path, counts in d2.items():
            if key not in path:
                continue
            if any(leaf in path for leaf in get_vals(block)):
                matches.append(counts)
        # only the first match, in d2 iteration order, is kept
        frequencies[key] = matches[0]
    return frequencies


然后，把它们放在一起：
import json


def _overlapping_value_sets(path):
    """Return the distinct-value set of every ``d1`` path sharing an element with *path*."""
    return [set(values) for other, values in d1.items() if set(other) & set(path)]


# unique: exactly one distinct value here and on every overlapping path
u = [
    (path, values[0])
    for path, values in d1.items()
    if len(set(values)) == 1
    and all(len(distinct) == 1 for distinct in _overlapping_value_sets(path))
]
# repeating: more than one distinct value here or on some overlapping path
u1 = [
    (path, values)
    for path, values in d1.items()
    if len(set(values)) > 1
    or any(len(distinct) > 1 for distinct in _overlapping_value_sets(path))
]

rd = to_dict(u)
unique_part = {'specs': rd, 'frequencies': get_freq(rd)}
repeating_part = [
    {'specs': spec, 'frequencies': get_freq_rep(spec)}
    for spec in get_rep(u1)
]
result = {'unique': unique_part, 'repeating': repeating_part}
print(json.dumps(result, indent=4))

输出：
{
   "unique": {
      "specs": {
         "name": "audi",
        "date": 1230768000,
        "type": "automatic",
        "fuel": [
            "Diesel"
        ]
    },
    "frequencies": {
        "name": {
            "google.com": 1,
            "facebook": 2
        },
        "date": {
            "google.com": 1,
            "facebook": 2
        },
        "type": {
            "google.com": 1,
            "facebook": 2
        },
        "fuel": {
            "google.com": 1,
            "facebook": 2
        }
    }
},
"repeating": [
    {
        "specs": {
            "color": "silver"
        },
        "frequencies": {
            "color": {
                "google.com": 1,
                "facebook": 1
            }
        }
    },
    {
        "specs": {
            "color": "grey"
        },
        "frequencies": {
            "color": {
                "facebook": 1
            }
        }
    },
    {
        "specs": {
            "power": {
                "unit": "kW",
                "value": 176
            }
        },
        "frequencies": {
            "power": {
                "google.com": 1,
                "facebook": 1
            }
        }
    },
    {
        "specs": {
            "power": {
                "unit": "kW",
                "value": 200
            }
        },
        "frequencies": {
            "power": {
                "google.com": 1,
                "facebook": 1
            }
        }
    },
    {
        "specs": {
            "doors": 4
        },
        "frequencies": {
            "doors": {
                "facebook": 1
            }
        }
    },
    {
        "specs": {
            "doors": 5
        },
        "frequencies": {
            "doors": {
                "google.com": 1,
                "facebook": 1
              }
          }
      }
   ]
}

请注意，autoplius_lt
在上述输出中不作为频率值存在，因为它不包括在第二个示例字典列表中
编辑：要删除不需要的重复结果，您可以在将 u1
传递给 get_rep
之前过滤其内容：
# keep only the repeating entries whose top-level key is not an excluded metric
u1 = [
    (path, values)
    for path, values in u1
    if path[0] not in ('spec_identification_manufacture_date', 'spec_powertrain_power')
]

详细的回答，非常感谢。例如，如果我想从 repeating
部分中排除某个指标，我原以为可以使用 'miss_matched': [{'specs': i, 'frequencies': get_freq_rep(i)} for i in get_rep(u1) if u1[0][0][0] not in ('spec_identification_manufacture_date', 'spec_powertrain_power')]
但是在推导式中添加这样的条件
会返回一个空的 repeating
部分。如何排除所选指标？@JonasPalačionis 请查看我最近的编辑。我建议过滤 u1
（重复块）的内容，而不是在从 get_rep