Python:成员匹配时合并列表中的字典

Python 成员匹配时在列表中合并词典,python,Python,我有一个包含名称、长度和IP地址的对象列表。只要名称和长度相同,我就想将它们组合在一起,将IP地址列表连接在一起 也就是说,给定以下JSON输入: { "Localfiles": [{ "IPAddress": ["217.120.103.158"], "FileLength": 7911088, "FileName": "desktop.jpeg" }, { "IPAddress": ["217.120.103.1

我有一个包含名称、长度和IP地址的对象列表。只要名称和长度相同,我就想将它们组合在一起,将IP地址列表连接在一起

也就是说,给定以下JSON输入:

{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }, {
        "IPAddress": ["133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }]
}
…长度为 7911088 的 desktop.jpeg 文件出现了两次,分别对应两个不同的IP地址。在输出中,应按如下方式合并这些内容:

{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158","133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }]
}
我目前的尝试如下:

import json

jsonstring = '''{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }, {
        "IPAddress": ["133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }]
}'''

def test(data):
    """Parse ``data`` as JSON and print a rebuilt copy of its "Localfiles" list.

    Every rebuilt entry carries the "FileLength" and "FileName" of the source
    entry, plus a one-element "IPAddress" list holding only the first address
    of the source entry.  Entries are copied one-to-one; nothing is merged.

    Args:
        data: JSON text whose top-level object has a "Localfiles" list.

    Returns:
        None. The rebuilt dictionary is printed.
    """
    entries = json.loads(data)["Localfiles"]
    rebuilt = [
        {
            "IPAddress": [entry["IPAddress"][0]],
            "FileLength": entry["FileLength"],
            "FileName": entry["FileName"],
        }
        for entry in entries
    ]
    print({"Localfiles": rebuilt})
test(jsonstring)
>>> import json
>>> collate(json.loads(jsonstring)['Localfiles'])
defaultdict(<type 'set'>, {(u'Snelleplanga.mp4', 7924192): set([u'217.120.103.158']), (u'desktop.ini', 282): set([u'217.120.103.158']), (u'desktop.jpeg', 7911088): set([u'217.120.103.158', u'133.234.44.122'])})

然而,这实际上并没有完成预期的操作。如何实现我的目的?

为此目的,合理的数据结构是(文件名、长度)元组到IP地址集的映射:

import collections

def collate(data):
    """Group IP addresses by file identity.

    Args:
        data: iterable of mappings, each with "FileName", "FileLength" and an
            iterable "IPAddress".

    Returns:
        A ``defaultdict(set)`` mapping ``(FileName, FileLength)`` tuples to
        the set of all IP addresses seen for that pair.
    """
    ips_by_file = collections.defaultdict(set)
    for record in data:
        file_key = (record['FileName'], record['FileLength'])
        # Looking the key up creates an empty set on first sight of a file.
        ips_by_file[file_key].update(record['IPAddress'])
    return ips_by_file
输出类似于以下内容:

import json

jsonstring = '''{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }, {
        "IPAddress": ["133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }]
}'''

def test(data):
    """Parse ``data`` as JSON and print a rebuilt copy of its "Localfiles" list.

    Every rebuilt entry carries the "FileLength" and "FileName" of the source
    entry, plus a one-element "IPAddress" list holding only the first address
    of the source entry.  Entries are copied one-to-one; nothing is merged.

    Args:
        data: JSON text whose top-level object has a "Localfiles" list.

    Returns:
        None. The rebuilt dictionary is printed.
    """
    entries = json.loads(data)["Localfiles"]
    rebuilt = [
        {
            "IPAddress": [entry["IPAddress"][0]],
            "FileLength": entry["FileLength"],
            "FileName": entry["FileName"],
        }
        for entry in entries
    ]
    print({"Localfiles": rebuilt})
test(jsonstring)
>>> import json
>>> collate(json.loads(jsonstring)['Localfiles'])
defaultdict(<type 'set'>, {(u'Snelleplanga.mp4', 7924192): set([u'217.120.103.158']), (u'desktop.ini', 282): set([u'217.120.103.158']), (u'desktop.jpeg', 7911088): set([u'217.120.103.158', u'133.234.44.122'])})
…再用 `decollate` 函数把该映射转换回字典列表(该函数的定义未包含在本页中),示例输出:

>>> from pprint import pprint
>>> pprint(decollate(collate(json.loads(jsonstring)['Localfiles'])))
[{'FileLength': 7924192,
  'FileName': u'Snelleplanga.mp4',
  'IPAddress': [u'217.120.103.158']},
 {'FileLength': 282,
  'FileName': u'desktop.ini',
  'IPAddress': [u'217.120.103.158']},
 {'FileLength': 7911088,
  'FileName': u'desktop.jpeg',
  'IPAddress': [u'217.120.103.158', u'133.234.44.122']}]

为此,合理的数据结构是(文件名、长度)元组到IP地址集的映射:

import collections

def collate(data):
    """Group IP addresses by file identity.

    Args:
        data: iterable of mappings, each with "FileName", "FileLength" and an
            iterable "IPAddress".

    Returns:
        A ``defaultdict(set)`` mapping ``(FileName, FileLength)`` tuples to
        the set of all IP addresses seen for that pair.
    """
    ips_by_file = collections.defaultdict(set)
    for record in data:
        file_key = (record['FileName'], record['FileLength'])
        # Looking the key up creates an empty set on first sight of a file.
        ips_by_file[file_key].update(record['IPAddress'])
    return ips_by_file
输出类似于以下内容:

import json

jsonstring = '''{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }, {
        "IPAddress": ["133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }]
}'''

def test(data):
    """Parse ``data`` as JSON and print a rebuilt copy of its "Localfiles" list.

    Every rebuilt entry carries the "FileLength" and "FileName" of the source
    entry, plus a one-element "IPAddress" list holding only the first address
    of the source entry.  Entries are copied one-to-one; nothing is merged.

    Args:
        data: JSON text whose top-level object has a "Localfiles" list.

    Returns:
        None. The rebuilt dictionary is printed.
    """
    entries = json.loads(data)["Localfiles"]
    rebuilt = [
        {
            "IPAddress": [entry["IPAddress"][0]],
            "FileLength": entry["FileLength"],
            "FileName": entry["FileName"],
        }
        for entry in entries
    ]
    print({"Localfiles": rebuilt})
test(jsonstring)
>>> import json
>>> collate(json.loads(jsonstring)['Localfiles'])
defaultdict(<type 'set'>, {(u'Snelleplanga.mp4', 7924192): set([u'217.120.103.158']), (u'desktop.ini', 282): set([u'217.120.103.158']), (u'desktop.jpeg', 7911088): set([u'217.120.103.158', u'133.234.44.122'])})
…再用 `decollate` 函数把该映射转换回字典列表(该函数的定义未包含在本页中),示例输出:

>>> from pprint import pprint
>>> pprint(decollate(collate(json.loads(jsonstring)['Localfiles'])))
[{'FileLength': 7924192,
  'FileName': u'Snelleplanga.mp4',
  'IPAddress': [u'217.120.103.158']},
 {'FileLength': 282,
  'FileName': u'desktop.ini',
  'IPAddress': [u'217.120.103.158']},
 {'FileLength': 7911088,
  'FileName': u'desktop.jpeg',
  'IPAddress': [u'217.120.103.158', u'133.234.44.122']}]

使用 pandas 的解决方案:

import json
import pandas as pd

# Parse the JSON text and load the "Localfiles" records into a DataFrame,
# one row per record.
j = json.loads(jsonstring)
df = pd.DataFrame(j['Localfiles'])

# Merge every (FileLength, FileName) group in a single pass; summing the
# per-row IPAddress lists concatenates them.  Grouping all rows at once
# (rather than handling duplicated and unique rows separately and
# concatenating the two frames) keeps the resulting index unique, so no
# record can be dropped when exporting.  sort=False keeps the groups in
# order of first appearance.
df = (
    df.groupby(['FileLength', 'FileName'], sort=False)['IPAddress']
    .apply(lambda x: x.sum())
    .reset_index()
)

# One dict per row, serialized as a JSON array.
output_json = json.dumps(df.to_dict(orient='records'))
输出JSON:

'[{'FileLength': 7911088,
  'FileName': 'desktop.jpeg',
  'IPAddress': ['217.120.103.158', '133.234.44.122']},
 {'FileLength': 7924192,
  'FileName': 'Snelleplanga.mp4',
  'IPAddress': ['217.120.103.158']},
 {'FileLength': 282,
  'FileName': 'desktop.ini',
  'IPAddress': ['217.120.103.158']}]'

使用 pandas 的解决方案:

import json
import pandas as pd

# Parse the JSON text and load the "Localfiles" records into a DataFrame,
# one row per record.
j = json.loads(jsonstring)
df = pd.DataFrame(j['Localfiles'])

# Merge every (FileLength, FileName) group in a single pass; summing the
# per-row IPAddress lists concatenates them.  Grouping all rows at once
# (rather than handling duplicated and unique rows separately and
# concatenating the two frames) keeps the resulting index unique, so no
# record can be dropped when exporting.  sort=False keeps the groups in
# order of first appearance.
df = (
    df.groupby(['FileLength', 'FileName'], sort=False)['IPAddress']
    .apply(lambda x: x.sum())
    .reset_index()
)

# One dict per row, serialized as a JSON array.
output_json = json.dumps(df.to_dict(orient='records'))
输出JSON:

'[{'FileLength': 7911088,
  'FileName': 'desktop.jpeg',
  'IPAddress': ['217.120.103.158', '133.234.44.122']},
 {'FileLength': 7924192,
  'FileName': 'Snelleplanga.mp4',
  'IPAddress': ['217.120.103.158']},
 {'FileLength': 282,
  'FileName': 'desktop.ini',
  'IPAddress': ['217.120.103.158']}]'
一个简单的解决方案:

import json

# Collect the IP addresses of every record under its (FileName, FileLength)
# key.  jsonstring is JSON text, so it must be parsed before it can be
# indexed by "Localfiles".
dtmp = {}
for d in json.loads(jsonstring)["Localfiles"]:
    key = (d["FileName"], d["FileLength"])
    # extend() keeps every address of the record, not just the first one,
    # and tolerates a record whose address list is empty.
    dtmp.setdefault(key, []).extend(d["IPAddress"])

# Rebuild the original layout: one entry per distinct file, with the merged
# address list.
lrslt = [
    {"IPAddress": ips, "FileLength": lth, "FileName": fname}
    for (fname, lth), ips in dtmp.items()
]
drslt = {"Localfiles": lrslt}
print(drslt)
一个简单的解决方案:

import json

# Collect the IP addresses of every record under its (FileName, FileLength)
# key.  jsonstring is JSON text, so it must be parsed before it can be
# indexed by "Localfiles".
dtmp = {}
for d in json.loads(jsonstring)["Localfiles"]:
    key = (d["FileName"], d["FileLength"])
    # extend() keeps every address of the record, not just the first one,
    # and tolerates a record whose address list is empty.
    dtmp.setdefault(key, []).extend(d["IPAddress"])

# Rebuild the original layout: one entry per distinct file, with the merged
# address list.
lrslt = [
    {"IPAddress": ips, "FileLength": lth, "FileName": fname}
    for (fname, lth), ips in dtmp.items()
]
drslt = {"Localfiles": lrslt}
print(drslt)

为什么将最后一个添加到第一个?是因为文件大小和文件名相同吗?例如,如果我有两个具有不同IP地址的外部源,并且两个源都具有相同文件名和文件长度,我希望两个IP地址都附加到一个数组中,如问题中所述。您的问题应指定该逻辑;现在它只涉及什么是“第一个”和“最后一个”。顺便说一句,如果再问一遍,我会避免把这当作一个JSON问题——您的数据可能只是Python文本,除了是否调用了
json.loads()
,什么都不会改变。为什么最后一个添加到第一个?是因为文件大小和文件名相同吗?例如,如果我有两个具有不同IP地址的外部源,并且两个源都具有相同文件名和文件长度,我希望两个IP地址都附加到一个数组中,如问题中所述。您的问题应指定该逻辑;现在它只涉及“第一个”和“最后一个”。顺便说一句,如果再问一遍,我会避免把这当作一个JSON问题——您的数据可能只是Python文本,除了是否调用
json.loads()
,什么都不会改变。旁注:对于Python 3,将
.iteritems()
替换为
.items()
)。旁注:对于Python 3,将
.iteritems()
替换为
.items()