Python 如何通过多个规则有效地过滤巨大的列表?
标签:python, python-3.x, design-patterns, list-comprehension
我正在编写一个开源的 PyPI 包,它应该能够过滤 AWS EC2 实例。在我的函数 ec_compare__from_dict 中,我要过滤一个包含 350 多个元素、在磁盘上占用 364 KB 的列表。以下执行示例返回 1 个筛选后的元素:
>>> ec_compare__from_dict(_partial=_partial,InstanceType='z1d',FreeTierEligible=False,SupportedUsageClasses='spot',BareMetal=True)
[{'InstanceType': 'z1d.metal', 'CurrentGeneration': True, 'FreeTierEligible': False, 'SupportedUsageClasses': ['on-demand', 'spot'], 'SupportedRootDeviceTypes': ['ebs'], 'BareMetal': True, 'ProcessorInfo': {'SupportedArchitectures': ['x86_64'], 'SustainedClockSpeedInGhz': 4.0}, 'VCpuInfo': {'DefaultVCpus': 48}, 'MemoryInfo': {'SizeInMiB': 393216}, 'InstanceStorageSupported': True, 'InstanceStorageInfo': {'TotalSizeInGB': 1800, 'Disks': [{'SizeInGB': 900, 'Count': 2, 'Type': 'ssd'}]}, 'EbsInfo': {'EbsOptimizedSupport': 'default', 'EncryptionSupport': 'supported'}, 'NetworkInfo': {'NetworkPerformance': '25 Gigabit', 'MaximumNetworkInterfaces': 15, 'Ipv4AddressesPerInterface': 50, 'Ipv6AddressesPerInterface': 50, 'Ipv6Supported': True, 'EnaSupport': 'required'}, 'PlacementGroupInfo': {'SupportedStrategies': ['cluster', 'partition', 'spread']}, 'HibernationSupported': False, 'BurstablePerformanceSupported': False, 'DedicatedHostsSupported': True, 'AutoRecoverySupported': False}]
我的问题如下:
我想用所有规则各不相同的筛选器来筛选同一个列表。
但这样做让代码失去了可读性,写成了面条式代码(spaghetti code)。请告诉我更好的设计方案。
from typing import List
def ec2keys(*arg) -> List:
    """Return the known EC2 instance attribute names, grouped by value kind.

    Args:
        *arg: Zero or more kind names ('str', 'bool', 'list', 'dict',
            'other'). With no arguments every kind is selected. Names that
            are not a known kind select nothing.

    Returns:
        A flat list of attribute names for the selected kinds, in the order
        the kinds are declared below (not the order of ``arg``).
    """
    values = {
        'str': ['InstanceType', 'Hypervisor'],
        'bool': ['FreeTierEligible', 'HibernationSupported', 'CurrentGeneration',
                 'BurstablePerformanceSupported', 'AutoRecoverySupported',
                 'DedicatedHostsSupported', 'InstanceStorageSupported', 'BareMetal'],
        'list': ['SupportedUsageClasses', 'SupportedRootDeviceTypes'],
        'dict': ['InstanceStorageInfo', 'VCpuInfo', 'EbsInfo', 'FpgaInfo',
                 'PlacementGroupInfo', 'GpuInfo', 'InferenceAcceleratorInfo',
                 'MemoryInfo', 'NetworkInfo', 'ProcessorInfo'],
        'other': [],
    }
    # No arguments means "no restriction"; decide that once, not per kind.
    select_all = not arg
    selected = []
    for kind, names in values.items():
        if select_all or kind in arg:
            selected.extend(names)
    return selected
def ec_compare__from_dict(_partial: List, **kwargs):
    """Filter instance-description dicts by the rules given as keyword arguments.

    All rules are combined with AND. The rule applied depends on the kind of
    the key, as reported by ``ec2keys``:

    * 'str' / 'bool' keys (except ``InstanceType``): the item's value must
      equal the requested value.
    * ``InstanceType``: a non-empty string is a prefix the item's type must
      start with; a non-empty list/tuple/set of strings matches when the
      type starts with any of them. Empty or other values add no prefix rule.
    * 'list' keys: the requested value (a single value or a
      list/tuple/set of values) must share at least one element with the
      item's list.
    * 'dict' keys: only the presence of the key in the item is required;
      the nested content is not compared.

    Keyword arguments that are not known keys are ignored.

    Args:
        _partial: The list of instance-description dicts to filter.
        **kwargs: The filter rules, keyed by attribute name.

    Returns:
        A new list with the items of ``_partial`` that satisfy every rule,
        in their original order.
    """
    collection_types = (list, tuple, set, frozenset)
    requested = set(kwargs)

    # Every known key that was requested must exist in a matching item.
    required_keys = set(ec2keys()) & requested
    # Scalar keys are compared by equality; InstanceType is prefix-matched.
    flat_keys = (set(ec2keys('str', 'bool')) & requested) - {'InstanceType'}

    # Normalise each list filter to a set of wanted values, built once.
    list_filters = {}
    for key in set(ec2keys('list')) & requested:
        wanted = kwargs[key]
        list_filters[key] = (
            set(wanted) if isinstance(wanted, collection_types) else {wanted}
        )

    # str.startswith accepts a tuple of alternatives, so one form covers both
    # the single-prefix and the multi-prefix case.
    instance_type = kwargs.get('InstanceType')
    if isinstance(instance_type, str):
        prefixes = (instance_type,) if instance_type else ()
    elif isinstance(instance_type, collection_types):
        prefixes = tuple(instance_type)
    else:
        prefixes = ()

    def _matches(item) -> bool:
        """Return True when ``item`` satisfies every requested rule."""
        if not all(key in item for key in required_keys):
            return False
        if not all(item[key] == kwargs[key] for key in flat_keys):
            return False
        # 'InstanceType' is in required_keys whenever prefixes is non-empty,
        # so the lookup below cannot fail.
        if prefixes and not str(item['InstanceType']).startswith(prefixes):
            return False
        # Each list filter must be satisfied (AND), like all other rules.
        return all(
            wanted.intersection(item[key]) for key, wanted in list_filters.items()
        )

    # Single pass over the input instead of one pass per rule family.
    return [item for item in _partial if _matches(item)]
示例数据
# Sample input: a single instance description (z1d.metal).
_partial = [
    {
        'InstanceType': 'z1d.metal',
        'CurrentGeneration': True,
        'FreeTierEligible': False,
        'SupportedUsageClasses': ['on-demand', 'spot'],
        'SupportedRootDeviceTypes': ['ebs'],
        'BareMetal': True,
        'ProcessorInfo': {
            'SupportedArchitectures': ['x86_64'],
            'SustainedClockSpeedInGhz': 4.0,
        },
        'VCpuInfo': {'DefaultVCpus': 48},
        'MemoryInfo': {'SizeInMiB': 393216},
        'InstanceStorageSupported': True,
        'InstanceStorageInfo': {
            'TotalSizeInGB': 1800,
            'Disks': [{'SizeInGB': 900, 'Count': 2, 'Type': 'ssd'}],
        },
        'EbsInfo': {
            'EbsOptimizedSupport': 'default',
            'EncryptionSupport': 'supported',
        },
        'NetworkInfo': {
            'NetworkPerformance': '25 Gigabit',
            'MaximumNetworkInterfaces': 15,
            'Ipv4AddressesPerInterface': 50,
            'Ipv6AddressesPerInterface': 50,
            'Ipv6Supported': True,
            'EnaSupport': 'required',
        },
        'PlacementGroupInfo': {
            'SupportedStrategies': ['cluster', 'partition', 'spread'],
        },
        'HibernationSupported': False,
        'BurstablePerformanceSupported': False,
        'DedicatedHostsSupported': True,
        'AutoRecoverySupported': False,
    },
]
由于嵌套列表和dict结构,我认为类比较不是最简单的。但是在类比较中,您可以分别为每个项生成一个比较方法,这将把大函数分解为许多小函数。如果接口发生变化,这将导致一些维护问题 在这种情况下,您的字典比较方法更好,但我将使用递归对嵌套字典进行重写。通过使用递归,可以稍微简化嵌套 通过使用您提供的输入:
# The instance description that the example filters are checked against.
data = {
    'InstanceType': 'z1d.metal',
    'CurrentGeneration': True,
    'FreeTierEligible': False,
    'SupportedUsageClasses': ['on-demand', 'spot'],
    'SupportedRootDeviceTypes': ['ebs'],
    'BareMetal': True,
    'ProcessorInfo': {
        'SupportedArchitectures': ['x86_64'],
        'SustainedClockSpeedInGhz': 4.0,
    },
    'VCpuInfo': {'DefaultVCpus': 48},
    'MemoryInfo': {'SizeInMiB': 393216},
    'InstanceStorageSupported': True,
    'InstanceStorageInfo': {
        'TotalSizeInGB': 1800,
        'Disks': [
            {'SizeInGB': 900, 'Count': 2, 'Type': 'ssd'},
        ],
    },
    'EbsInfo': {
        'EbsOptimizedSupport': 'default',
        'EncryptionSupport': 'supported',
    },
    'NetworkInfo': {
        'NetworkPerformance': '25 Gigabit',
        'MaximumNetworkInterfaces': 15,
        'Ipv4AddressesPerInterface': 50,
        'Ipv6AddressesPerInterface': 50,
        'Ipv6Supported': True,
        'EnaSupport': 'required',
    },
    'PlacementGroupInfo': {
        'SupportedStrategies': ['cluster', 'partition', 'spread'],
    },
    'HibernationSupported': False,
    'BurstablePerformanceSupported': False,
    'DedicatedHostsSupported': True,
    'AutoRecoverySupported': False,
}
我生成了几个可能的过滤器(有效的应返回 True,无效的应返回 False),即下文用到的 data_check_valid、data_check_invalid_strategy 和 data_check_invalid_count:
然后我们将逐个元素比较这两个字典,包括嵌套元素。
为此,使用以下函数:
def verify_element(original, check) -> bool:
    """Return True when ``check`` is matched by ``original``.

    ``check`` acts as a (possibly partial) filter over ``original``:

    * dict: every key of ``check`` must exist in ``original`` and its value
      must match recursively; extra keys in ``original`` are ignored.
    * list/tuple: every element of ``check`` must match at least one element
      of ``original``, at any position. Lists and tuples are interchangeable.
    * str/bool/int/float: the values must be equal and of the same type
      (so ``True`` does not match ``1``).

    Args:
        original: The value being tested (e.g. a full instance description).
        check: The filter value to look for inside ``original``.

    Returns:
        True if ``original`` satisfies ``check``, otherwise False. A type
        mismatch between the two yields False rather than an error.

    Raises:
        TypeError: If both values share a type that is not handled above.
    """
    sequence_types = (tuple, list)
    both_sequences = (
        isinstance(original, sequence_types) and isinstance(check, sequence_types)
    )
    # Exact type comparison is deliberate: isinstance would let bool match int.
    if not both_sequences and type(original) is not type(check):
        return False

    if isinstance(check, dict):
        # Argument order matters in the recursion: the original's value stays
        # on the left, the filter's value on the right.
        return all(
            key in original and verify_element(original[key], value)
            for key, value in check.items()
        )

    if isinstance(check, sequence_types):
        # The position of a wanted element inside the original is unknown,
        # so each one is searched for among all original elements.
        return all(
            any(verify_element(candidate, wanted) for candidate in original)
            for wanted in check
        )

    if isinstance(check, (str, bool, int, float)):
        return original == check

    # Handle any unknown data types.
    raise TypeError(f"Type {type(check)}, with value {check} cannot be compared.")
要相互比较这两个字典,最终检查将如下所示:
if __name__ == '__main__':
    # NOTE(review): data_check_valid, data_check_invalid_strategy and
    # data_check_invalid_count are not defined in the visible part of this
    # file; they must be defined before this block can run.
    print(verify_element(data, data_check_valid))  # True
    print(verify_element(data, data_check_invalid_strategy))  # False
    print(verify_element(data, data_check_invalid_count))  # False
    # When you change 'Count' to 2, the answer will become True.
如果你想干净地使用它,你可以把它放在一个类中,并使用上面的函数分别比较每个元素。这样还可以加入自定义验证器,例如要求某个值大于或小于原始值(上述代码目前还做不到这一点)。
评论:
- 是否可以创建一个类实例?然后你可以在那个类中建立一个比较函数。
- @Thymen 你能举个例子吗?你的意思是实现一个类并重载 `__le__`/`__ne__` 方法,然后写 [elem for elem in elements if Class(elem) == Class(another)] 吗?
- 是的,这正是我的意思,但只需要 `__eq__` 和 `__ne__`。如果所有元素实例都相等,这将起作用。
if __name__ == '__main__':
    # NOTE(review): data_check_valid, data_check_invalid_strategy and
    # data_check_invalid_count are not defined in the visible part of this
    # file; they must be defined before this block can run.
    print(verify_element(data, data_check_valid))  # True
    print(verify_element(data, data_check_invalid_strategy))  # False
    print(verify_element(data, data_check_invalid_count))  # False
    # When you change 'Count' to 2, the answer will become True.