如何改进用python编写的算法?
我已经做了一个算法,将文本分割成 n-gram(搭配),并计算出这种搭配的概率和其他统计数据。当文件超过 50 兆字节时,计数需要很长时间,也许有人能帮助改进它。(标签:python、algorithm、nlp)
import csv
import math
import re
from collections import Counter
from datetime import datetime
from itertools import zip_longest
def tokenize(input_file, encoding):
    """Read *input_file* and return a flat list of lower-cased word tokens.

    Latin letters, digits, underscores and common punctuation are stripped
    first, so only non-ASCII (e.g. Cyrillic) words survive the ``\\w+`` scan.

    Args:
        input_file: path of the text file to read.
        encoding: text encoding used to decode the file.

    Returns:
        list[str]: every token in file order (duplicates kept).
    """
    # Compile once, hoisted out of the per-line loop; raw strings avoid
    # invalid-escape problems.  NOTE: the original class used [A-z], which
    # also matches '[', '\', ']', '^', '_' and '`' -- [A-Za-z] plus an
    # explicit '_' is what was intended -- and contained \B, which is not
    # valid inside a character class on modern Python.
    strip_re = re.compile(r"[A-Za-z0-9_'\"`|/+#,)(?!\-:=;.«»—@]")
    word_re = re.compile(r"\w+")
    tokens = []
    with open(input_file, 'r', encoding=encoding) as f:
        for line in f:
            cleaned = strip_re.sub('', line.lower())
            # extend() instead of a per-word append loop
            tokens.extend(word_re.findall(cleaned))
    return tokens
def ngrams_split(lst, n):
    """Count every n-gram of *lst*, keyed by the space-joined gram.

    Fixes an off-by-one in the original ``range(len(lst) - n)``, which
    silently dropped the final n-gram of the text.

    Args:
        lst: sequence of word tokens.
        n: n-gram size.

    Returns:
        Counter (a dict subclass) mapping 'w1 w2 ... wn' -> occurrence count.
    """
    # Counter counts at C speed; the manual if/else dict update is both
    # slower and noisier.
    return Counter(' '.join(lst[i:i + n]) for i in range(len(lst) - n + 1))
def list_add(counts):
    """Flatten a {ngram: count} mapping into a list of (count, ngram) pairs."""
    return [(frequency, gram) for gram, frequency in counts.items()]
def gram_add(lst, n):
    """Return the space-joined n-grams of *lst* in order, repeats kept.

    Fixes an off-by-one in the original ``range(len(lst) - n)``, which
    dropped the final n-gram.

    Args:
        lst: sequence of word tokens.
        n: n-gram size.

    Returns:
        list[str]: one 'w1 ... wn' string per position.
    """
    return [' '.join(lst[i:i + n]) for i in range(len(lst) - n + 1)]
def two_gram_count(input_file, encoding, n_filter, n):
    """Compute statistics for every bigram occurring at least *n_filter* times.

    Args:
        input_file: path to the corpus file.
        encoding: its text encoding.
        n_filter: minimum occurrence count for a bigram to be reported.
        n: n-gram size (expected to be 2 here).

    Returns:
        list of (log10 relative frequency, squared MI score,
        log10 P(w2|w1), count, 'w1 w2') tuples.
    """
    tokens = tokenize(input_file, encoding)
    n_words = len(tokens)
    counts = ngrams_split(tokens, n)
    # O(1) word-frequency lookups.  The original called list.count() twice
    # per bigram -- O(n) each, O(n^2) overall -- which is the main reason
    # large files took so long.
    word_freq = Counter(tokens)
    output = []
    for gram, freq in counts.items():
        if freq < n_filter:
            continue
        words = gram.split()
        f1 = word_freq[words[0]]
        f2 = word_freq[words[1]]
        # log base 10 throughout, consistent with the 3-/4-gram variants
        # (the original used the natural log for ngram_freq here only).
        ngram_freq = math.log(freq / n_words, 10)
        mi = math.pow(math.log(freq * n_words / (f1 * f2), 10), 2)
        ngram_prob = math.log(freq / f1, 10)
        output.append((ngram_freq, mi, ngram_prob, freq, gram))
    return output
def three_gram_count(input_file, encoding, n_filter, n):
    """Compute statistics for every trigram occurring at least *n_filter* times.

    Args:
        input_file: path to the corpus file.
        encoding: its text encoding.
        n_filter: minimum occurrence count for a trigram to be reported.
        n: n-gram size (expected to be 3 here).

    Returns:
        list of (log10 relative frequency, squared MI score,
        log10 P(w3|w1 w2), count, 'w1 w2 w3') tuples.
    """
    tokens = tokenize(input_file, encoding)
    n_words = len(tokens)
    counts = ngrams_split(tokens, n)
    # Pre-count words and bigrams once.  The original called list.count()
    # four times per trigram (O(n) each), giving O(n^2) total work.
    word_freq = Counter(tokens)
    bigram_freq = Counter(gram_add(tokens, 2))
    output = []
    for gram, freq in counts.items():
        if freq < n_filter:
            continue
        words = gram.split()  # split once, not once per use
        f1 = word_freq[words[0]]
        f2 = word_freq[words[1]]
        f3 = word_freq[words[2]]
        lead_bigram = bigram_freq[words[0] + " " + words[1]]
        ngram_freq = math.log(freq / n_words, 10)
        mi = math.pow(math.log(freq * n_words / (f1 * f2 * f3), 10), 2)
        ngram_prob = math.log(freq / lead_bigram, 10)
        output.append((ngram_freq, mi, ngram_prob, freq, gram))
    return output
def four_grams_count(input_file, encoding, n_filter, n):
    """Compute statistics for every 4-gram occurring at least *n_filter* times.

    Args:
        input_file: path to the corpus file.
        encoding: its text encoding.
        n_filter: minimum occurrence count for a 4-gram to be reported.
        n: n-gram size (expected to be 4 here).

    Returns:
        list of (log10 relative frequency, squared MI score,
        chained log10 conditional probability, count, 'w1 w2 w3 w4') tuples.
    """
    tokens = tokenize(input_file, encoding)
    n_words = len(tokens)
    counts = ngrams_split(tokens, n)
    # Pre-count words and bigrams once.  The original called list.count()
    # seven times per 4-gram (O(n) each), giving O(n^2) total work.
    word_freq = Counter(tokens)
    bigram_freq = Counter(gram_add(tokens, 2))
    output = []
    for gram, freq in counts.items():
        if freq < n_filter:
            continue
        words = gram.split()  # split once, not once per use
        f1 = word_freq[words[0]]
        f2 = word_freq[words[1]]
        f3 = word_freq[words[2]]
        f4 = word_freq[words[3]]
        c1gram = bigram_freq[words[0] + " " + words[1]]
        c2gram = bigram_freq[words[1] + " " + words[2]]
        c3gram = bigram_freq[words[2] + " " + words[3]]
        ngram_freq = math.log(freq / n_words, 10)
        mi = math.pow(math.log(freq * n_words / (f1 * f2 * f3 * f4), 10), 2)
        # chain rule over the three overlapping bigrams
        ngram_prob = (math.log(c1gram / f1, 10)
                      + math.log(c2gram / f2, 10)
                      + math.log(c3gram / f3, 10))
        output.append((ngram_freq, mi, ngram_prob, freq, gram))
    return output
def n_grams_stat(input_file, encoding, n_filter, n):
    """Dispatch to the n-gram statistics routine matching *n* (2, 3 or 4).

    Returns the routine's result as a list; an unsupported *n* yields [].
    """
    dispatch = {
        2: two_gram_count,
        3: three_gram_count,
        4: four_grams_count,
    }
    counter = dispatch.get(n)
    if counter is None:
        return []
    return list(counter(input_file, encoding, n_filter, n))
if __name__ == "__main__":
    start_time = datetime.now()
    stats = n_grams_stat(
        "/home/yan/PycharmProjects/vk/piidsluhano/men_pidsluhano.txt",
        'utf-8', n_filter=3, n=4)
    # Open the output file exactly once.  The original re-opened it for
    # every result row with mode 'dwwaa', which is not a valid mode string
    # and raises ValueError on the first row.
    with open("/home/yan/PycharmProjects/vk/piidsluhano/men_4grams", 'w') as f:
        for a, b, c, d, e in stats:
            print(a, b, c, d, e)
            f.write(str(a) + ", " + str(b) + ', ' + str(c) + ", "
                    + str(d) + ", " + str(e) + '\n ')
    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))
如果代码能正常工作——也就是说你清楚自己在做什么、程序在发生什么——那么这个问题也许更适合发到 Code Review。为了获得更好的性能,你应该使用 set() 而不是 list();在 Python 2 下用 xrange() 代替 range()。另外,你为什么需要把 dict 转成列表?完全可以直接使用 dict。Code Review 社区虽然较小,但由于更贴近你的需求,可能会给你质量更好的回复。你还应该运行 Python 的 profiler,找出代码中哪些部分占用了最多的计算时间。