Python 成员匹配时在列表中合并词典_Python

Python 成员匹配时在列表中合并词典

python

Python 成员匹配时在列表中合并词典,python,Python,我有一个包含名称、长度和IP地址的对象列表。只要名称和长度相同，我就想将它们组合在一起，将IP地址列表连接在一起也就是说，给定以下JSON输入： { "Localfiles": [{ "IPAddress": ["217.120.103.158"], "FileLength": 7911088, "FileName": "desktop.jpeg" }, { "IPAddress": ["217.120.103.1

我有一个对象列表，其中每个对象都包含名称、长度和IP地址。只要名称和长度都相同，我就想把这些对象合并为一个，并把它们的IP地址列表连接在一起。

也就是说，给定以下JSON输入：

{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }, {
        "IPAddress": ["133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }]
}

…长度为 7911088 的 desktop.jpeg 文件出现了两次，分别带有两个不同的IP地址。在输出中，应按如下方式合并这些内容：

{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158","133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }]
}

我目前的尝试如下：

import json

jsonstring = '''{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }, {
        "IPAddress": ["133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }]
}'''

def test(data):
    """Parse a JSON string and print a rebuilt copy of its "Localfiles" list.

    Every input entry is copied into a new dict that keeps only the first
    element of its "IPAddress" list. Entries sharing the same "FileName"
    and "FileLength" are not merged: the printed structure holds exactly
    one entry per input entry.

    Args:
        data: JSON text whose top-level object has a "Localfiles" list.
    """
    dictionary = {}  
    dictionary['Localfiles'] = []
    s = json.loads(data)
    content = s["Localfiles"]
    for item in content:
        ipaddrarr = item["IPAddress"]
        # Only the first address is read; any further ones are dropped.
        ipaddr = ipaddrarr[0]
        filelen = item["FileLength"]
        filename = item["FileName"]
        # A fresh entry is built for every item.
        dictionarychild = {}
        dictionarychild["IPAddress"] = []
        dictionarychild["IPAddress"].append(ipaddr)
        dictionarychild["FileLength"] = filelen
        dictionarychild["FileName"] = filename
        # Appended unconditionally: nothing checks for an existing entry
        # with the same name and length, so duplicates stay separate.
        dictionary["Localfiles"].append(dictionarychild)
    print(dictionary)
test(jsonstring)

>>> import json
>>> collate(json.loads(jsonstring)['Localfiles'])
defaultdict(<type 'set'>, {(u'Snelleplanga.mp4', 7924192): set([u'217.120.103.158']), (u'desktop.ini', 282): set([u'217.120.103.158']), (u'desktop.jpeg', 7911088): set([u'217.120.103.158', u'133.234.44.122'])})

然而，这实际上并没有完成预期的操作。如何实现我的目的？

为此目的，合理的数据结构是（文件名、长度）元组到IP地址集的映射：

import collections

def collate(data):
    """Group IP addresses by (file name, file length).

    Args:
        data: iterable of mappings with 'FileName', 'FileLength' and
            'IPAddress' (an iterable of addresses) keys.

    Returns:
        A ``defaultdict(set)`` mapping each ``(FileName, FileLength)`` tuple
        to the set of every address seen for that pair.
    """
    grouped = collections.defaultdict(set)
    for entry in data:
        file_key = (entry['FileName'], entry['FileLength'])
        # Looking the key up creates the empty set on first sight.
        grouped[file_key].update(entry['IPAddress'])
    return grouped

输出类似于以下内容：

import json

jsonstring = '''{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }, {
        "IPAddress": ["133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }]
}'''

def test(data):
    """Parse a JSON string and print a rebuilt copy of its "Localfiles" list.

    Every input entry is copied into a new dict that keeps only the first
    element of its "IPAddress" list. Entries sharing the same "FileName"
    and "FileLength" are not merged: the printed structure holds exactly
    one entry per input entry.

    Args:
        data: JSON text whose top-level object has a "Localfiles" list.
    """
    dictionary = {}  
    dictionary['Localfiles'] = []
    s = json.loads(data)
    content = s["Localfiles"]
    for item in content:
        ipaddrarr = item["IPAddress"]
        # Only the first address is read; any further ones are dropped.
        ipaddr = ipaddrarr[0]
        filelen = item["FileLength"]
        filename = item["FileName"]
        # A fresh entry is built for every item.
        dictionarychild = {}
        dictionarychild["IPAddress"] = []
        dictionarychild["IPAddress"].append(ipaddr)
        dictionarychild["FileLength"] = filelen
        dictionarychild["FileName"] = filename
        # Appended unconditionally: nothing checks for an existing entry
        # with the same name and length, so duplicates stay separate.
        dictionary["Localfiles"].append(dictionarychild)
    print(dictionary)
test(jsonstring)

>>> import json
>>> collate(json.loads(jsonstring)['Localfiles'])
defaultdict(<type 'set'>, {(u'Snelleplanga.mp4', 7924192): set([u'217.120.103.158']), (u'desktop.ini', 282): set([u'217.120.103.158']), (u'desktop.jpeg', 7911088): set([u'217.120.103.158', u'133.234.44.122'])})

…示例输出：

>>> from pprint import pprint
>>> pprint(decollate(collate(json.loads(jsonstring)['Localfiles'])))
[{'FileLength': 7924192,
  'FileName': u'Snelleplanga.mp4',
  'IPAddress': [u'217.120.103.158']},
 {'FileLength': 282,
  'FileName': u'desktop.ini',
  'IPAddress': [u'217.120.103.158']},
 {'FileLength': 7911088,
  'FileName': u'desktop.jpeg',
  'IPAddress': [u'217.120.103.158', u'133.234.44.122']}]

为此，合理的数据结构是（文件名、长度）元组到IP地址集的映射：

import collections

def collate(data):
    """Group IP addresses by (file name, file length).

    Args:
        data: iterable of mappings with 'FileName', 'FileLength' and
            'IPAddress' (an iterable of addresses) keys.

    Returns:
        A ``defaultdict(set)`` mapping each ``(FileName, FileLength)`` tuple
        to the set of every address seen for that pair.
    """
    grouped = collections.defaultdict(set)
    for entry in data:
        file_key = (entry['FileName'], entry['FileLength'])
        # Looking the key up creates the empty set on first sight.
        grouped[file_key].update(entry['IPAddress'])
    return grouped

输出类似于以下内容：

import json

jsonstring = '''{
    "Localfiles": [{
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 7924192,
        "FileName": "Snelleplanga.mp4"
    }, {
        "IPAddress": ["217.120.103.158"],
        "FileLength": 282,
        "FileName": "desktop.ini"
    }, {
        "IPAddress": ["133.234.44.122"],
        "FileLength": 7911088,
        "FileName": "desktop.jpeg"
    }]
}'''

def test(data):
    """Parse a JSON string and print a rebuilt copy of its "Localfiles" list.

    Every input entry is copied into a new dict that keeps only the first
    element of its "IPAddress" list. Entries sharing the same "FileName"
    and "FileLength" are not merged: the printed structure holds exactly
    one entry per input entry.

    Args:
        data: JSON text whose top-level object has a "Localfiles" list.
    """
    dictionary = {}  
    dictionary['Localfiles'] = []
    s = json.loads(data)
    content = s["Localfiles"]
    for item in content:
        ipaddrarr = item["IPAddress"]
        # Only the first address is read; any further ones are dropped.
        ipaddr = ipaddrarr[0]
        filelen = item["FileLength"]
        filename = item["FileName"]
        # A fresh entry is built for every item.
        dictionarychild = {}
        dictionarychild["IPAddress"] = []
        dictionarychild["IPAddress"].append(ipaddr)
        dictionarychild["FileLength"] = filelen
        dictionarychild["FileName"] = filename
        # Appended unconditionally: nothing checks for an existing entry
        # with the same name and length, so duplicates stay separate.
        dictionary["Localfiles"].append(dictionarychild)
    print(dictionary)
test(jsonstring)

>>> import json
>>> collate(json.loads(jsonstring)['Localfiles'])
defaultdict(<type 'set'>, {(u'Snelleplanga.mp4', 7924192): set([u'217.120.103.158']), (u'desktop.ini', 282): set([u'217.120.103.158']), (u'desktop.jpeg', 7911088): set([u'217.120.103.158', u'133.234.44.122'])})

…示例输出：

>>> from pprint import pprint
>>> pprint(decollate(collate(json.loads(jsonstring)['Localfiles'])))
[{'FileLength': 7924192,
  'FileName': u'Snelleplanga.mp4',
  'IPAddress': [u'217.120.103.158']},
 {'FileLength': 282,
  'FileName': u'desktop.ini',
  'IPAddress': [u'217.120.103.158']},
 {'FileLength': 7911088,
  'FileName': u'desktop.jpeg',
  'IPAddress': [u'217.120.103.158', u'133.234.44.122']}]

使用 pandas 的解决方案：

import json
import pandas as pd

# Parse the JSON text and load the "Localfiles" records into a DataFrame.
j = json.loads(jsonstring)
df = pd.DataFrame(j['Localfiles'])

# Rows whose (FileLength, FileName) pair occurs more than once: summing the
# IPAddress lists of each group concatenates them into one row per pair.
df1 = df[df.duplicated(['FileLength', 'FileName'], keep=False)].groupby(['FileLength', 'FileName'])['IPAddress'].apply(lambda x: x.sum()).reset_index()
# Rows whose pair occurs exactly once are kept unchanged.
df2 = df.drop_duplicates(['FileLength', 'FileName'], keep=False)
# ignore_index=True renumbers the rows. Without it df1 restarts at 0 while
# df2 keeps its original labels; colliding labels become duplicate columns
# after the transpose below, and to_dict() would then drop records.
df = pd.concat([df1, df2], ignore_index=True)

# One dict per row, serialised as a JSON array.
output_json = json.dumps(list(df.T.to_dict().values()))

输出JSON:

'[{'FileLength': 7911088,
  'FileName': 'desktop.jpeg',
  'IPAddress': ['217.120.103.158', '133.234.44.122']},
 {'FileLength': 7924192,
  'FileName': 'Snelleplanga.mp4',
  'IPAddress': ['217.120.103.158']},
 {'FileLength': 282,
  'FileName': 'desktop.ini',
  'IPAddress': ['217.120.103.158']}]'

使用 pandas 的解决方案：

import json
import pandas as pd

# Parse the JSON text and load the "Localfiles" records into a DataFrame.
j = json.loads(jsonstring)
df = pd.DataFrame(j['Localfiles'])

# Rows whose (FileLength, FileName) pair occurs more than once: summing the
# IPAddress lists of each group concatenates them into one row per pair.
df1 = df[df.duplicated(['FileLength', 'FileName'], keep=False)].groupby(['FileLength', 'FileName'])['IPAddress'].apply(lambda x: x.sum()).reset_index()
# Rows whose pair occurs exactly once are kept unchanged.
df2 = df.drop_duplicates(['FileLength', 'FileName'], keep=False)
# ignore_index=True renumbers the rows. Without it df1 restarts at 0 while
# df2 keeps its original labels; colliding labels become duplicate columns
# after the transpose below, and to_dict() would then drop records.
df = pd.concat([df1, df2], ignore_index=True)

# One dict per row, serialised as a JSON array.
output_json = json.dumps(list(df.T.to_dict().values()))

输出JSON:

'[{'FileLength': 7911088,
  'FileName': 'desktop.jpeg',
  'IPAddress': ['217.120.103.158', '133.234.44.122']},
 {'FileLength': 7924192,
  'FileName': 'Snelleplanga.mp4',
  'IPAddress': ['217.120.103.158']},
 {'FileLength': 282,
  'FileName': 'desktop.ini',
  'IPAddress': ['217.120.103.158']}]'

一个简单的解决方案：

import json

# jsonstring holds JSON text, so it has to be parsed before it can be indexed
# by key; an already-parsed mapping is used as-is.
parsed = json.loads(jsonstring) if isinstance(jsonstring, str) else jsonstring

# Map each (FileName, FileLength) pair to the addresses collected for it.
dtmp = {}
for d in parsed["Localfiles"]:
    key = (d["FileName"], d["FileLength"])
    # extend() keeps every address of the entry, not only the first one.
    dtmp.setdefault(key, []).extend(d["IPAddress"])

# Rebuild the input layout: one record per distinct (name, length) pair.
lrslt = [
    {"IPAddress": ip, "FileLength": lth, "FileName": fname}
    for (fname, lth), ip in dtmp.items()
]
drslt = {"Localfiles": lrslt}
print(drslt)

一个简单的解决方案：

import json

# jsonstring holds JSON text, so it has to be parsed before it can be indexed
# by key; an already-parsed mapping is used as-is.
parsed = json.loads(jsonstring) if isinstance(jsonstring, str) else jsonstring

# Map each (FileName, FileLength) pair to the addresses collected for it.
dtmp = {}
for d in parsed["Localfiles"]:
    key = (d["FileName"], d["FileLength"])
    # extend() keeps every address of the entry, not only the first one.
    dtmp.setdefault(key, []).extend(d["IPAddress"])

# Rebuild the input layout: one record per distinct (name, length) pair.
lrslt = [
    {"IPAddress": ip, "FileLength": lth, "FileName": fname}
    for (fname, lth), ip in dtmp.items()
]
drslt = {"Localfiles": lrslt}
print(drslt)

为什么将最后一个添加到第一个？是因为文件大小和文件名相同吗？例如，如果我有两个具有不同IP地址的外部源，并且两个源都具有相同文件名和文件长度，我希望两个IP地址都附加到一个数组中，如问题中所述。您的问题应指定该逻辑；现在它只涉及什么是“第一个”和“最后一个”。顺便说一句，如果再问一遍，我会避免把这当作一个JSON问题——您的数据可能只是Python文本，除了是否调用了

json.loads()

，什么都不会改变。为什么最后一个添加到第一个？是因为文件大小和文件名相同吗？例如，如果我有两个具有不同IP地址的外部源，并且两个源都具有相同文件名和文件长度，我希望两个IP地址都附加到一个数组中，如问题中所述。您的问题应指定该逻辑；现在它只涉及“第一个”和“最后一个”。顺便说一句，如果再问一遍，我会避免把这当作一个JSON问题——您的数据可能只是Python文本，除了是否调用

json.loads()

，什么都不会改变。

（旁注：对于 Python 3，请将 .iteritems() 替换为 .items()。）

（旁注：对于 Python 3，请将 .iteritems() 替换为 .items()。）