Python 如何加速组合算法?
Python 如何加速组合算法?(标签:python, string, set, combinations, itertools)下面的代码查找列表B中能够组成字符串A的最少项数。假设A='hello world how are you doing',B=['hello world how', 'hello are', 'hello', 'hello are you doing']。由于索引为0和3的项目包含了字符串A的所有单词,因此答案是2。我将所有字符串转换为整数以加快算法的速度,但是由于有更大更复杂的测试用例,我需要更优化的算法。我想知道如何加速这个算法。import itertools A='hello world how are you doing' B=
B
中构成字符串A
的最小项。让我们假设A='hello world how are you doing'
和B=['hello world how', 'hello are', 'hello', 'hello are you doing']
。然后,由于索引为0和3的项目包含字符串A
的所有单词,因此答案将是2
我将所有字符串转换为整数以加快算法的速度,但是由于有更大更复杂的测试用例,我需要更优化的算法。我想知道如何加速这个算法
import itertools
# Goal: print the minimum number of items of B whose words together contain
# every word of A (a set-cover problem solved by brute force).
A = 'hello world how are you doing'
B = ['hello world how', 'hello are', 'hello', 'hello are you doing']

# Give every distinct (lower-cased) word of A an integer id: 1, 2, 3, ...
d = {}
res_A = [d.setdefault(word, len(d) + 1) for word in A.lower().split()]
# Reuse the very same table for B. Building a second, positional table from
# A.split() disagrees with res_A as soon as A repeats a word or contains
# upper-case letters.
mapping = d
# find mappings of words in B (a word that is not in A raises KeyError)
res_B = [[mapping[word] for word in s.lower().split()] for s in B]
set_a = set(res_A)

solved = False
# Try subset sizes in increasing order, so the first cover found is minimal.
for L in range(0, len(res_B) + 1):
    for subset in itertools.combinations(res_B, L):
        # Union of all word ids contained in the chosen items.
        s = set().union(*subset)
        if set_a.issubset(s):
            print(f'{L}')
            solved = True
            break
    if solved:
        break
我在remove_sub上犯了一个逻辑错误,不知道为什么它仍然有效。下面尝试清理数据,并从b中去掉尽可能多的项。
import itertools as it
import time
import numpy as np
from collections import Counter, defaultdict as dd
import copy
A = 'hello world how are you doing'
B = ['hello world how', 'hello are', 'hello', 'hello are you doing']

# Give every distinct (lower-cased) word of A an integer id: 1, 2, 3, ...
d = {}
res_A = [d.setdefault(word, len(d) + 1) for word in A.lower().split()]
# Reuse the very same table for B. Building a second, positional table from
# A.split() disagrees with res_A as soon as A repeats a word or contains
# upper-case letters.
mapping = d
# find mappings of words in B (a word that is not in A raises KeyError);
# each item is stored as a set because the reductions work on a list of sets
res_B = [{mapping[word] for word in s.lower().split()} for s in B]
set_a = set(res_A)

# a is a list of numbers, b is a list of sets of numbers, we are trying to
# cover a using min items from b
a = np.random.randint(0, 50, size=30)
np_set_a = set(a)
b = []
for i in range(200):
    # Each candidate set holds between 0 and 19 draws (with replacement) from a.
    size = np.random.randint(0, 20)
    b.append(set(np.random.choice(a, size)))
# till here, created a,b for larger data test
def f1(set_a, b):
    """Brute-force the minimum number of items of ``b`` that cover ``set_a``.

    Subset sizes are tried in increasing order, so the first cover found is a
    minimum one. Its size is printed (tagged with ``f1``) and returned.

    Args:
        set_a: set of values that must all be covered.
        b: sequence of iterables (sets or lists) of values.

    Returns:
        The size of the smallest cover, or ``None`` when even all items of
        ``b`` together do not cover ``set_a`` (nothing is printed then).
    """
    for L in range(0, len(b) + 1):
        for subset in it.combinations(b, L):
            # Union of everything contained in the chosen items.
            if set_a.issubset(set().union(*subset)):
                print(f'{L}', '**************f1')
                return L
    return None
def rare(b):
    """Pull out of ``b`` every item that holds a number found in no other item.

    Such an item is the only one able to cover that number, so it has to be
    part of every cover. This reasoning assumes every number occurring in
    ``b`` actually needs to be covered.

    ``b`` is modified in place: the mandatory items are removed from it.

    Args:
        b: list of sets of numbers.

    Returns:
        A tuple ``(values_from_rare_items, b, count)``: the union of the
        numbers in the removed items, the same (now shortened) list ``b``,
        and how many items were removed.
    """
    # number -> indexes of the items of b that contain it
    items = dd(list)
    for i, members in enumerate(b):
        for num in members:
            items[num].append(i)

    # Indexes of the items owning a number that appears exactly once in b.
    rare_items = {indexes[0] for indexes in items.values() if len(indexes) == 1}

    # Every number of a mandatory item is covered for free.
    values_from_rare_items = set()
    for i in rare_items:
        values_from_rare_items.update(b[i])

    # Pop from the highest index down so the remaining indexes stay valid.
    for i in sorted(rare_items, reverse=True):
        b.pop(i)
    return values_from_rare_items, b, len(rare_items)
# check sets on b, if 2 are equal remove 1, if 1 is a subset of the other, remove it
def remove_sub(b):
    """Drop from ``b`` every set that another set of ``b`` makes redundant.

    A set is redundant when it is a proper subset of another set, or when it
    equals a set that appears earlier in the list. Exactly one copy of
    duplicated sets is kept (the first one), so no coverage is lost.

    ``b`` is modified in place.

    Args:
        b: list of sets.

    Returns:
        The same list ``b`` after the redundant sets were removed.
    """
    to_pop = set()
    for i, candidate in enumerate(b):
        for j, other in enumerate(b):
            if i == j:
                continue
            # `<` is the proper-subset test; on a tie only the later copy goes,
            # otherwise both duplicates would be removed.
            if candidate < other or (candidate == other and j < i):
                to_pop.add(i)
                break
    # Pop from the highest index down so the remaining indexes stay valid.
    for i in sorted(to_pop, reverse=True):
        b.pop(i)
    return b
def f2(set_a, b):
    """Find the minimum number of items of ``b`` that cover ``set_a``.

    Same answer as a plain brute force, but the candidates are reduced first:
    numbers outside ``set_a`` are ignored, duplicated and redundant items are
    dropped (``remove_sub``), and items that must be in every cover are taken
    out up front (``rare``). The size is printed (tagged with ``f2``) and
    returned. The caller's ``b`` is left untouched.

    Args:
        set_a: set of values that must all be covered.
        b: sequence of iterables (sets or lists) of values.

    Returns:
        The size of the smallest cover, or ``None`` when ``b`` cannot cover
        ``set_a`` (nothing is printed then).
    """
    # Only numbers of set_a matter for the cover. Working on fresh, de-duplicated
    # frozensets keeps the caller's list intact and guarantees the reductions
    # never see two equal candidates.
    candidates = list(dict.fromkeys(frozenset(set_a.intersection(s)) for s in b))
    candidates = remove_sub(candidates)
    values_from_rare_items, b2, num_rare_items = rare(candidates)
    # remove from a all the numbers already covered by the mandatory items
    a_without_rare = set_a - values_from_rare_items
    # Try subset sizes in increasing order, so the first cover found is minimal.
    for L in range(0, len(b2) + 1):
        for subset in it.combinations(b2, L):
            if a_without_rare.issubset(set().union(*subset)):
                length = L + num_rare_items
                print(f'{length}', "*********f2")
                return length
    return None
# Time both solvers, first on the random numeric data (b), then on the word
# example (res_B). Each solver prints its answer; the elapsed seconds follow.
# NOTE(review): the run on b passes set_a (the word ids of A); np_set_a is
# defined but never used - confirm which target was intended.
for data in (b, res_B):
    for solver, tag in ((f1, '********************f1'), (f2, '******************f2')):
        s = time.time()
        solver(set_a, data)
        print(time.time() - s, tag)
您可以通过选取所有只出现几次的项目,并将其视为只出现一次来进一步改进它;在极少数情况下,结果不会是真正的最小值,但时间上的改进是显著的。您的问题并不完全清楚。顺序重要吗?不相关的单词是否可以出现在句子的某些部分(例如,你们是否仍然接受“你好,你们两个在做什么?”)?顺序并不重要。列表B中没有不相关的单词。这是一个集合覆盖问题。这是NP难的,所以暴力不是个好主意。动态编程方法可能会更好。我尝试过DP,暴力是最好的方法。如果“目标”字符串或单个列表项中有重复的单词,该怎么办?您的代码让我有点困惑。请你用我的字符串来代替生成随机数,将它们转换成整数(与我的代码相同),然后将其输入到算法。B上是否有不出现在a上的单词?添加了一个验证。任何方法的思路都是尽量缩减B,因为所有组合的数量约为O(2^n),n的任何减少都很有意义;同时尽量缩减a,以便在较早的迭代中就找到答案。
This is the output:
2 **************f1
0.16755199432373047 ********************f1 num_array
2 *********f2
0.09078240394592285 ******************f2 num_array
2 **************f1
0.0009989738464355469 ********************f1 your_data
2 *********f2
0.0009975433349609375 ******************f2 your_data