Python 如何加速组合算法？_Python_String_Set_Combinations_Itertools

Python 如何加速组合算法？

python string

Python 如何加速组合算法？,python,string,set,combinations,itertools,Python,String,Set,Combinations,Itertools,下面的代码查找列表B中构成字符串A的最小项。让我们假设A='hello world你好'和B='hello world你好'，'hello are'，'hello'，'hello you doing']。然后，由于索引为0和3的项目包含字符串A的所有单词，因此答案将是2 我将所有字符串转换为整数以加快算法的速度，但是由于有更大更复杂的测试用例，我需要更优化的算法。我想知道如何加速这个算法 import itertools A='hello world how are you doing' B=

下面的代码查找列表B中构成字符串A的最小项。让我们假设

A='hello world你好'

和

B='hello world你好'，'hello are'，'hello'，'hello you doing']

。然后，由于索引为0和3的项目包含字符串

的所有单词，因此答案将是2

我将所有字符串转换为整数以加快算法的速度，但是由于有更大更复杂的测试用例，我需要更优化的算法。我想知道如何加速这个算法

import itertools

A='hello world how are you doing'
B=['hello world how', 'hello are' ,'hello', 'hello are you doing']

d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()]
mapping = dict(zip(A.split(), range(1, len(A) + 1)))

# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]

set_a = set(res_A)
solved = False

for L in range(0, len(res_B)+1):
    for subset in itertools.combinations(res_B, L):
        s = set(item for sublist in subset for item in sublist)
        if set_a.issubset(s):
            print(f'{L}')
            solved = True
            break
    if solved: break

我在remove_sub上犯了一个逻辑错误，不知道为什么它仍然有效

尝试清理数据并从b中减少尽可能多的项

import itertools as it
import time
import numpy as np
from collections import Counter, defaultdict as dd
import copy

A='hello world how are you doing'
B=['hello world how', 'hello are' ,'hello', 'hello are you doing']

d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()
mapping = dict(zip(A.split(), range(1, len(A) + 1)))

# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]
set_a = set(res_A)

# my adding works on list of sets
for i in range(len(res_B)):
    res_B[i] = set(res_B[i])

# a is a list of numbers, b is a list of sets of numbers, we are trying to cover a using min items from b
a = np.random.randint(0,50,size = 30)
np_set_a  = set(a)
b = []
for i in range(200):
    size = np.random.randint(0,20)
    b.append(set(np.random.choice(a,size)))

# till here, created a,b for larger data test

def f1(set_a, b):
    solved = False
    for L in range(0, len(b)+1):
        for subset in it.combinations(b, L):
            s = set(item for sublist in subset for item in sublist)
            if set_a.issubset(s):
                print(f'{L}','**************f1')
                solved = True
                break
        if solved: break

def rare(b):
    c = Counter() #a dict where the key is a num and the value is how many times this num appears on all b sets
    items = dd(list) # dict where the key is num and value is list of index where this num exist in b
    for i in range(len(b)):
        c.update(b[i]) 
        for num in b[i]:
            items[num].append(i)

    rare = set()
    common = c.most_common() #return sorted list of tuples with a number and how many times it appear
    for i in range(1,len(common)-1): #take all the numbers that appear only once on b, these items will have to be on the final combination so you can remove them from b and their numbers from a because those numbers are covered
        if common[-i][1] ==1:
            rare.add(common[0])
            continue
        break

    rare_items = {} # a set of all index that have rare number in them
    for k in rare: 
        rare_items.update(items[k])

    values_from_rare_items = set() # a set of all the numbers in the items with the rare numbers
    for i in rare_items:
        values_from_rare_items.update(b[i])

    for i in reversed(sorted(rare_items)): #remove from b all the items with rare numbers, because they have to be on the final combination, you dont need to check them
        b.pop(i)
    return values_from_rare_items,b, len(rare_items)

#check sets on b, if 2 are equal remove 1, if 1 is a subset of the other, remove it 
def remove_sub(b):
    to_pop = set()
    t = copy.deepcopy(b)
    for i in range(len(b)):
        for j in range(len(t)):
            if i ==j:
                continue
            if b[i] == t[j]:
                to_pop.add(i)
                continue
            if b[i].issubset(t[j]):
                to_pop.add(i)
            if t[j].issubset(b[i]):
                to_pop.add(j)
    for i in reversed(sorted(to_pop)):
        b.pop(i)
    return b

def f2(set_a, b):
    b1 = remove_sub(b)
    values_from_rare_items,b2, num_rare_items = rare(b)
    a_without_rare = set_a-values_from_rare_items #remove from a all the number you added with the rare unique numbers, they are already covered
    

    solved = False
    for L in range(0, len(b2)+1):
        for subset in it.combinations(b2, L):
            s = set(item for sublist in subset for item in sublist)
            if a_without_rare.issubset(s):
                length = L+num_rare_items 
                print(f'{length}', "*********f2")
                solved = True
                break
        if solved: break


s = time.time()
f1(set_a,b)
print(time.time()-s,'********************f1')

s = time.time()
f2(set_a,b)
print(time.time()-s,'******************f2')


s = time.time()
f1(set_a,res_B)
print(time.time()-s,'********************f1')

s = time.time()
f2(set_a,res_B)
print(time.time()-s,'******************f2')

您可以通过选取所有只出现几次的项目，并将其视为只出现一次来进一步改进它，在极少数情况下，它不会是真正的最小值，但时间改进是显著的

您的问题并不完全清楚。秩序重要吗？不相关的单词是否可以出现在句子的某些部分（例如，你们是否仍然接受“你好，你们两个在做什么？”）顺序并不重要。列表B中没有不相关的单词。这是一个集合覆盖问题。这是NP难的，所以暴力不是个好主意。动态编程方法可能会更好。我尝试过DP，暴力是最好的方法。如果“目标”字符串或单个列表项中有重复的单词，该怎么办？您的代码让我有点困惑。请你用我的字符串来生成随机数，并将它们转换成整数（与我的代码相同），然后将其输入到算法。B上是否有不出现在a上的单词？添加了一个验证，任何方法的想法都是尽量减少B，使所有组合~O（2^n）和n中的任何减少都是有意义的，并尽量减少a，以便在早期迭代中捕获它

this is the out put

2 **************f1
0.16755199432373047 ********************f1 num_array

2 *********f2
0.09078240394592285 ******************f2 num_array

2 **************f1
0.0009989738464355469 ********************f1 your_data

2 *********f2
0.0009975433349609375 ******************f2 your_data