Python 如何通过多个规则有效地过滤巨大的列表?

Python 如何通过多个规则有效地过滤巨大的列表?,python,python-3.x,design-patterns,list-comprehension,Python,Python 3.x,Design Patterns,List Comprehension,我正在编写一个开源的PyPi包,它应该能够过滤AWS EC2实例 在我的函数ec_compare__from_dict中,我过滤了磁盘上占用364Kb的350多个元素的列表 以下执行示例返回1个筛选元素: >>> ec_compare__from_dict(_partial=_partial,InstanceType='z1d',FreeTierEligible=False,SupportedUsageClasses='spot',BareMetal=True) [{'Inst

我正在编写一个开源的PyPi包,它应该能够过滤AWS EC2实例

在我的函数ec_compare__from_dict中,我过滤了磁盘上占用364Kb的350多个元素的列表

以下执行示例返回1个筛选元素:

>>> ec_compare__from_dict(_partial=_partial,InstanceType='z1d',FreeTierEligible=False,SupportedUsageClasses='spot',BareMetal=True)
[{'InstanceType': 'z1d.metal', 'CurrentGeneration': True, 'FreeTierEligible': False, 'SupportedUsageClasses': ['on-demand', 'spot'], 'SupportedRootDeviceTypes': ['ebs'], 'BareMetal': True, 'ProcessorInfo': {'SupportedArchitectures': ['x86_64'], 'SustainedClockSpeedInGhz': 4.0}, 'VCpuInfo': {'DefaultVCpus': 48}, 'MemoryInfo': {'SizeInMiB': 393216}, 'InstanceStorageSupported': True, 'InstanceStorageInfo': {'TotalSizeInGB': 1800, 'Disks': [{'SizeInGB': 900, 'Count': 2, 'Type': 'ssd'}]}, 'EbsInfo': {'EbsOptimizedSupport': 'default', 'EncryptionSupport': 'supported'}, 'NetworkInfo': {'NetworkPerformance': '25 Gigabit', 'MaximumNetworkInterfaces': 15, 'Ipv4AddressesPerInterface': 50, 'Ipv6AddressesPerInterface': 50, 'Ipv6Supported': True, 'EnaSupport': 'required'}, 'PlacementGroupInfo': {'SupportedStrategies': ['cluster', 'partition', 'spread']}, 'HibernationSupported': False, 'BurstablePerformanceSupported': False, 'DedicatedHostsSupported': True, 'AutoRecoverySupported': False}]

我的问题如下:我想一次性用所有规则各不相同的筛选器来过滤这个列表。

但这样做可读性越来越差,代码正在变成“意大利面条式代码”(spaghetti code)。请给我建议一个更好的设计方案。

from typing import List


def ec2keys(*arg) -> List:
    """Return EC2 instance-type attribute names grouped by value category.

    Args:
        *arg: Category names to include (``'str'``, ``'bool'``, ``'list'``,
            ``'dict'``, ``'other'``). When no category is given, the names
            of every category are returned.

    Returns:
        A flat list of attribute names, ordered by category as declared in
        the table below (not by the order of ``arg``). Unknown category
        names contribute nothing.
    """
    values = {
        'str': ['InstanceType', 'Hypervisor'],
        'bool': [
            'FreeTierEligible',
            'HibernationSupported',
            'CurrentGeneration',
            'BurstablePerformanceSupported',
            'AutoRecoverySupported',
            'DedicatedHostsSupported',
            'InstanceStorageSupported',
            'BareMetal',
        ],
        'list': ['SupportedUsageClasses', 'SupportedRootDeviceTypes'],
        'dict': [
            'InstanceStorageInfo',
            'VCpuInfo',
            'EbsInfo',
            'FpgaInfo',
            'PlacementGroupInfo',
            'GpuInfo',
            'InferenceAcceleratorInfo',
            'MemoryInfo',
            'NetworkInfo',
            'ProcessorInfo',
        ],
        'other': [],
    }
    selected = []
    for category, names in values.items():
        # An empty ``arg`` means "no restriction"; otherwise skip categories
        # that were not asked for.
        if arg and category not in arg:
            continue
        selected.extend(names)
    return selected

def ec_compare__from_dict(_partial: List, **kwargs):
    """Filter EC2 instance-type descriptions by keyword criteria.

    An item is kept only when every criterion holds (logical AND):

    * Every keyword that ``ec2keys()`` knows must be present as a key of
      the item.
    * ``str``/``bool`` keywords (except ``InstanceType``) must equal the
      item's value exactly.
    * ``InstanceType`` is a prefix match: either a non-empty string, or a
      non-empty list/set of strings of which at least one must be a prefix
      of the item's instance type.
    * ``list`` keywords accept a single value or a list of values; the
      item's list must share at least one value with them. When several
      list keywords are given, each of them must match.
    * ``dict`` keywords are only checked for presence; their values are
      not compared.

    Keywords that ``ec2keys()`` does not return are ignored.

    Args:
        _partial: Instance descriptions (dicts) to filter.
        **kwargs: Filter criteria keyed by attribute name.

    Returns:
        A new list with the items of ``_partial`` that satisfy all
        criteria, in their original order.
    """
    requested = set(kwargs)
    required_keys = requested.intersection(ec2keys())
    # InstanceType is matched by prefix below, not by equality.
    exact_keys = requested.intersection(ec2keys('str', 'bool')) - {'InstanceType'}
    # Normalise every list criterion to a list so a bare value such as
    # 'spot' behaves like ['spot'].
    list_filters = {
        key: kwargs[key] if isinstance(kwargs[key], list) else [kwargs[key]]
        for key in requested.intersection(ec2keys('list'))
    }

    instance_type = kwargs.get('InstanceType')
    if isinstance(instance_type, str) and instance_type:
        prefixes = [instance_type]
    elif isinstance(instance_type, (list, set)) and instance_type:
        prefixes = list(instance_type)
    else:
        prefixes = []

    def _matches(item) -> bool:
        """Return True when ``item`` satisfies every requested criterion."""
        if not all(key in item for key in required_keys):
            return False
        if not all(item[key] == kwargs[key] for key in exact_keys):
            return False
        if prefixes:
            name = str(item['InstanceType'])
            if not any(name.startswith(prefix) for prefix in prefixes):
                return False
        # Each list criterion must match (AND across keys); within one key
        # a single shared value is enough.
        return all(
            set(item[key]).intersection(wanted)
            for key, wanted in list_filters.items()
        )

    return [item for item in _partial if _matches(item)]

示例数据

# Sample input: a single EC2 instance-type description (z1d.metal).
_partial = [
    {
        'InstanceType': 'z1d.metal',
        'CurrentGeneration': True,
        'FreeTierEligible': False,
        'SupportedUsageClasses': ['on-demand', 'spot'],
        'SupportedRootDeviceTypes': ['ebs'],
        'BareMetal': True,
        'ProcessorInfo': {
            'SupportedArchitectures': ['x86_64'],
            'SustainedClockSpeedInGhz': 4.0,
        },
        'VCpuInfo': {'DefaultVCpus': 48},
        'MemoryInfo': {'SizeInMiB': 393216},
        'InstanceStorageSupported': True,
        'InstanceStorageInfo': {
            'TotalSizeInGB': 1800,
            'Disks': [{'SizeInGB': 900, 'Count': 2, 'Type': 'ssd'}],
        },
        'EbsInfo': {
            'EbsOptimizedSupport': 'default',
            'EncryptionSupport': 'supported',
        },
        'NetworkInfo': {
            'NetworkPerformance': '25 Gigabit',
            'MaximumNetworkInterfaces': 15,
            'Ipv4AddressesPerInterface': 50,
            'Ipv6AddressesPerInterface': 50,
            'Ipv6Supported': True,
            'EnaSupport': 'required',
        },
        'PlacementGroupInfo': {
            'SupportedStrategies': ['cluster', 'partition', 'spread'],
        },
        'HibernationSupported': False,
        'BurstablePerformanceSupported': False,
        'DedicatedHostsSupported': True,
        'AutoRecoverySupported': False,
    },
]


由于嵌套列表和dict结构,我认为类比较不是最简单的。但是在类比较中,您可以分别为每个项生成一个比较方法,这将把大函数分解为许多小函数。如果接口发生变化,这将导致一些维护问题

在这种情况下,您的字典比较方法更好,但我将使用递归对嵌套字典进行重写。通过使用递归,可以稍微简化嵌套

通过使用您提供的输入:

# Reference record that the filter dictionaries are verified against.
data = {
    'InstanceType': 'z1d.metal',
    'CurrentGeneration': True,
    'FreeTierEligible': False,
    'SupportedUsageClasses': ['on-demand', 'spot'],
    'SupportedRootDeviceTypes': ['ebs'],
    'BareMetal': True,
    'ProcessorInfo': {
        'SupportedArchitectures': ['x86_64'],
        'SustainedClockSpeedInGhz': 4.0,
    },
    'VCpuInfo': {'DefaultVCpus': 48},
    'MemoryInfo': {'SizeInMiB': 393216},
    'InstanceStorageSupported': True,
    'InstanceStorageInfo': {
        'TotalSizeInGB': 1800,
        'Disks': [
            {'SizeInGB': 900, 'Count': 2, 'Type': 'ssd'},
        ],
    },
    'EbsInfo': {
        'EbsOptimizedSupport': 'default',
        'EncryptionSupport': 'supported',
    },
    'NetworkInfo': {
        'NetworkPerformance': '25 Gigabit',
        'MaximumNetworkInterfaces': 15,
        'Ipv4AddressesPerInterface': 50,
        'Ipv6AddressesPerInterface': 50,
        'Ipv6Supported': True,
        'EnaSupport': 'required',
    },
    'PlacementGroupInfo': {
        'SupportedStrategies': ['cluster', 'partition', 'spread'],
    },
    'HibernationSupported': False,
    'BurstablePerformanceSupported': False,
    'DedicatedHostsSupported': True,
    'AutoRecoverySupported': False,
}
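我生成了几个可能的过滤器(有效的应返回 True,无效的应返回 False),即下文用到的 data_check_valid、data_check_invalid_strategy 和 data_check_invalid_count(它们的定义未包含在本摘录中):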

然后我们将逐个元素比较这两个字典(包括嵌套元素)。为此,使用以下函数:

def verify_element(original, check) -> bool:
    """Return whether ``check`` is structurally contained in ``original``.

    The comparison is recursive:

    * Values of different exact types never match (so ``True`` does not
      match ``1`` and a list does not match a tuple).
    * dict: every key of ``check`` must exist in ``original`` and its value
      must match the corresponding value of ``original``; extra keys in
      ``original`` are ignored.
    * list/tuple: every element of ``check`` must match at least one
      element of ``original``, regardless of position.
    * str/bool/int/float: plain equality.

    Args:
        original: The reference data.
        check: The filter describing what ``original`` must contain.

    Returns:
        True when ``original`` satisfies ``check``, otherwise False.

    Raises:
        TypeError: If both values share a type that is none of the
            supported ones listed above.
    """
    # Exact type comparison on purpose: isinstance() would let bool pass
    # as int.
    if type(original) is not type(check):
        return False

    if isinstance(check, dict):
        # Recurse as (original value, check value); swapping the two would
        # turn the subset test around for nested dicts and lists.
        return all(
            key in original and verify_element(original[key], value)
            for key, value in check.items()
        )

    if isinstance(check, (tuple, list)):
        # The position of a wanted element inside ``original`` is unknown,
        # so each one is searched among all elements of ``original``.
        return all(
            any(verify_element(candidate, wanted) for candidate in original)
            for wanted in check
        )

    if isinstance(check, (str, bool, int, float)):
        return original == check

    # Handle any unknown data types.
    raise TypeError(f"Type {type(check)}, with value {check} cannot be compared.")
要比较这两个字典,最终的检查如下所示:

# Demo: print the result of verifying ``data`` against three filter dicts;
# runs only when the file is executed as a script.
# NOTE(review): data_check_valid, data_check_invalid_strategy and
# data_check_invalid_count are not defined in the visible part of this file;
# confirm they exist before this block runs. The trailing comments give the
# output the author expects for each call.
if __name__ == '__main__':
    print(verify_element(data, data_check_valid))             # True
    print(verify_element(data, data_check_invalid_strategy))  # False

    print(verify_element(data, data_check_invalid_count))     # False
    # When you change 'Count' to 2, the answer will become    # True

如果你想干净地使用它,你可以把它放在一个类中,并使用上面的函数分别比较每个元素。这使得还可以包含自定义验证器,例如应该大于或小于原始值(这在上述代码中目前是不可能的)。

是否可以创建一个类实例?然后你可以在那个类中建立一个比较函数。@Thymen 你能举个例子吗?你的意思是实现一个类并重载 `__eq__`/`__ne__` 方法,然后 [elem for elem in elements if class(elem) == class(another)]?是的,这正是我的意思,但只需要 `__eq__`。如果所有元素实例都相等,这将起作用。
# Demo: print the result of verifying ``data`` against three filter dicts;
# runs only when the file is executed as a script.
# NOTE(review): data_check_valid, data_check_invalid_strategy and
# data_check_invalid_count are not defined in the visible part of this file;
# confirm they exist before this block runs. The trailing comments give the
# output the author expects for each call.
if __name__ == '__main__':
    print(verify_element(data, data_check_valid))             # True
    print(verify_element(data, data_check_invalid_strategy))  # False

    print(verify_element(data, data_check_invalid_count))     # False
    # When you change 'Count' to 2, the answer will become    # True