用Python快速计算频率
我需要计算语料库中单词的频率。通常我使用 collections 包中的 Counter 类:
# Build a word-frequency table with collections.Counter.
from collections import Counter

list_of_words = ['one', 'two', 'three', 'three']
freqs = Counter()
freqs.update(list_of_words)
然而,我正在分析的语料库由数百万个单词组成,因此如果有一种更快的方法来计算这些分数,那该多好啊
以下是用以下文字表示的代码:
from read_cg3 import read_cg3
test = read_cg3('/Users/arashsaidi/Work/Corpus/DUO_Corpus/Bokmaal-tagged-random/DUO_BM_0.txt')
count = 0
word_list = []
for sentence in test:
for word in sentence:
count += 1
word_list.append(word)
print count
read_cg3是一个模块,用于读取解析文件并返回句子列表。以下是模块:
import re
def is_number(s):
    """Return True when *s* parses as a float, False otherwise."""
    try:
        float(s)
    except ValueError:
        return False
    return True
def read_cg3(cg3_file):
    """
    Read a cg3-parsed file and return its sentences.

    Each sentence is a list of lower-cased word forms (the first
    attribute token with surrounding quotes stripped).  Words containing
    '$', pure numbers, and words of 30+ characters are skipped.

    :param cg3_file: path to the cg3 file
    :return: list of sentences, each a list of words
    """
    # Raw strings: the originals relied on "\s" surviving in a normal
    # string literal, which is a deprecation warning in modern Python.
    rx_token = re.compile(r'^"<(.+?)>"$')
    rx_attributes = re.compile(r'^\s+".+?"\s+.+$')
    rx_eos = re.compile(r'^\s*$')

    def _is_number(s):
        # Local copy of the module-level helper so this function is
        # self-contained; True for anything float() can parse.
        try:
            float(s)
            return True
        except ValueError:
            return False

    curr_token = None
    curr_word = []
    curr_sentence = []
    result = []
    # Do not shadow the path argument with the file handle: the original
    # `with open(cg3_file) as cg3_file` made the cleanup messages below
    # print the file object instead of the file name.
    with open(cg3_file) as infile:
        for line in infile:
            token_match = rx_token.match(line)
            if token_match:
                curr_token = '"%s"' % token_match.group(1)
            if rx_attributes.match(line):
                curr_word = line.split()
            if curr_token and curr_word:
                # Skip '$' punctuation markers, numbers and over-long words.
                if '$' not in curr_word[0] and not _is_number(curr_word[0].strip('"').replace('.', '')) \
                        and len(curr_word[0]) < 30:
                    curr_sentence.append(curr_word[0].lower().strip('"'))
                curr_token = None
                curr_word = []
            if rx_eos.match(line):
                if curr_sentence:
                    result.append(curr_sentence)
                curr_sentence = []
                curr_token = None
                curr_word = []
    # Cleanup if the file did not end with an empty (end-of-sentence) line.
    if curr_token and curr_word:
        print('cg3 reached end of file and did some cleanup on file {}'.format(cg3_file))
        # Apply the same filtering/format as the main loop; the original
        # appended the raw [token] + attributes list here, producing a
        # sentence with a different element shape.
        if '$' not in curr_word[0] and not _is_number(curr_word[0].strip('"').replace('.', '')) \
                and len(curr_word[0]) < 30:
            curr_sentence.append(curr_word[0].lower().strip('"'))
    if curr_sentence:
        print('cg3 reached end of file and did some cleanup on file {}'.format(cg3_file))
        # Append the sentence as one list: the original did
        # `result += curr_sentence`, extending result with the individual
        # words and breaking the list-of-sentences return contract.
        result.append(curr_sentence)
    return result
重新导入
def是_编号:
尝试:
浮球
返回真值
除值错误外:
返回错误
def read_cg3(cg3_文件):
"""
读取cg3文件并返回每个句子的列表,其中包含标记、已解析和一个标记
:param cg3_文件:文件的路径
:return:单词+属性列表
"""
rx\u令牌=重新编译(“^\”\“$”)
rx\u attributes=re.compile(“^\s+\”+?\“\s+.+$”)
rx_eos=re.compile(“^\s*$”)
当前令牌=无
curr_word=[]
当前句子=[]
结果=[]
打开(cg3_文件)作为cg3_文件:
对于cg3_文件中的行:
如果rx_令牌匹配(行):
curr\u token=“\%s\”“%rx\u token.match(line).group(1)
#打印货币令牌
如果rx_属性匹配(行):
curr_word=line.split()
#打印当前单词[0],当前单词[1]
#打印当前单词
如果当前标记和当前单词:
#要获取更多标记,请取消注释此标记并在下面进行注释
#货币句子+=[[货币标记]+货币单词]
如果“$”不在当前单词[0]中且不是当前单词编号(当前单词[0]。带(“”)。替换(“,”)\
和len(curr_word[0])<30:
#curr_语句+=[[curr_token.strip(“”)]+
#[curr_-word[0].lower().strip(“”)]+[curr_-word[1]]
curr_语句+=[curr_单词[0]。lower().strip(“”)]
当前令牌=无
curr_word=[]
如果rx_eos.匹配(行):
#打印当前句子
如果是当前句子:
结果+=[当前句子]
当前句子=[]
当前令牌=无
curr_word=[]
#如果最后一句话不是EOL,则清除
如果当前标记和当前单词:
打印“cg3已到达文件末尾,并对文件{}进行了一些清理”。格式(cg3_文件)
货币句子+=[[货币标记]+货币单词]
如果是当前句子:
打印“cg3已到达文件末尾,并对文件{}进行了一些清理”。格式(cg3_文件)
结果+=当前句子
返回结果
以下是read_cg3读取文件的方式:
"<TEKNOLOGI>"
"teknologi" subst appell mask ub ent
"<OG>"
"og" konj <*>
"<UNDERVISNING>"
"undervisning" subst appell fem ub ent <*>
"<|>"
"$|" clb <overskrift> <<<
"<En>"
"en" det mask ent kvant
"<intervjuunders¯kelse>"
"intervjuunders¯kelse" subst appell mask ub ent
"<av>"
"av" prep
"<musikklÊreres>"
"musikklÊrer" subst appell mask ub fl gen
"<didaktiske>"
"didaktisk" adj fl pos
"<bruk>"
"bruk" subst appell mask ub ent
"<av>"
"av" prep
"<digitale>"
"digital" adj fl pos
"<verkt¯y>"
"verkt¯y" subst appell n¯yt ub fl <*¯y>
"<i>"
"i" prep
"<undervisningsfaget>"
"undervisningsfag" subst appell n¯yt be ent
"<komposisjon>"
"komposisjon" subst appell mask ub ent
"<i>"
"i" prep
"<videregÂende>"
"videregÂende" adj ub m/f ent pos
"<skole>"
"skole" subst appell mask ub ent
"<|>"
"$|" clb <overskrift> <<<
"<Markus>"
"Markus" subst prop mask
"<A.>"
"A." subst prop fork <*>
"<SkjÊrstad>"
"SkjÊrstad" subst prop <*stad> <*>
"<|>"
"$|" clb <overskrift> <<<
“”
“技术”副产品
""
“og”konj
""
“欠激励”子项出现
""
您可以使用以 C 实现的列表内置 count 方法:
# Same result as dict((i, test_list.count(i)) ...) written as a dict comprehension.
{i: test_list.count(i) for i in set(test_list)}
为了更好地理解,您可以查看以下基准测试:
from timeit import timeit
s1="""l=[1, 1, 1, 2, 3, 4, 1, 2, 5, 7, 2, 3]
from collections import Counter
Counter(l)"""
s2="""l=[1, 1, 1, 2, 3, 4, 1, 2, 5, 7, 2, 3]
dict((i,l.count(i)) for i in set(l))"""
print 'using Counter : ' ,timeit(stmt=s1, number=1000000)
print 'using built-in : ',timeit(stmt=s2, number=1000000)
结果:
using Counter : 8.78281712532
using built-in : 2.91788387299
看起来您不需要使用令牌,而且您的正则表达式可以被删除。这将计算每个单词在每个文件中出现的次数:
import multiprocessing as mp
import os
import itertools
def wordCounter(qIn, qOut):
    """Consume (fname, words) pairs from qIn until a None sentinel,
    then put the accumulated {fname: {word: count}} dict on qOut."""
    answer = {}
    for fname, words in iter(qIn.get, None):
        for word in words:
            # File entry is created lazily, on the first word seen for it.
            counts = answer.setdefault(fname, {})
            counts[word] = counts.get(word, 0) + 1
    qOut.put(answer)
def getLines(corpusPath, qIn, numProcs):
    """Feed (fname, words) batches from every file in corpusPath onto qIn,
    then send one None sentinel per consumer process."""
    def _is_word_line(line):
        # Keep non-empty lines that are not token lines ("<...>") and whose
        # first field carries no '$' punctuation marker.
        return bool(line) and not line.startswith('"<') and "$" not in line.split(None, 1)[0]

    for fname in os.listdir(corpusPath):
        full_path = os.path.join(corpusPath, fname)
        with open(full_path) as infile:
            stripped = (raw.strip() for raw in infile)
            # groupby batches consecutive word lines; non-word groups are dropped.
            for is_word, grp in itertools.groupby(stripped, _is_word_line):
                if is_word:
                    words = [ln.split(None, 1)[0].strip('"').strip().lower() for ln in grp]
                    qIn.put((fname, words))
    for _ in range(numProcs):
        qIn.put(None)
def main(corpusPath):
    """Count word frequencies per file under corpusPath using one reader
    process plus (cpu_count - 1) counter processes, then print the totals.

    :param corpusPath: directory whose files are cg3-style parse files
    """
    # qIn feeds word batches to the counters; qOut collects their results.
    qIn, qOut = [mp.Queue() for _ in range(2)]
    procs = [mp.Process(target=wordCounter, args=(qIn, qOut)) for _ in range(mp.cpu_count() -1)]
    # Single producer; getLines puts one None sentinel per counter process
    # so every consumer terminates its iter(qIn.get, None) loop.
    lineGetter = mp.Process(target=getLines, args=(corpusPath, qIn, len(procs)))
    lineGetter.start()
    for p in procs:
        p.start()
    # Merge the partial {fname: {word: count}} dicts, one qOut.get() per counter.
    answer = {}
    for _ in range(len(procs)):
        for fname, wdict in qOut.get().items():
            if fname not in answer:
                answer[fname] = {}
            for word,count in wdict.items():
                if word not in answer[fname]:
                    answer[fname][word] = 0
                answer[fname][word] += count
    for fname in sorted(answer):
        for word in sorted(answer[fname]):
            print("{} appeared in {} {} times".format(word, fname, answer[fname][word]))
    # NOTE(review): terminate() is abrupt; by this point every worker has
    # delivered its one result, but join() would be the cleaner shutdown —
    # confirm no queue feeder threads are still flushing before relying on this.
    for p in procs:
        p.terminate()
    lineGetter.terminate()
将多处理导入为mp
导入操作系统
进口itertools
def字计数器(qIn,qOut):
答案={}
对于fname,iter中的文字(qIn.get,无):
用文字表示:
如果fname未在回答中:
答案[fname]={}
如果答案中没有单词[fname]:
答案[fname][word]=0
答案[fname][word]+=1
qOut.put(回答)
def getLines(小体路径、秦、numProcs):
对于os.listdir(微粒路径)中的fname:
以open(os.path.join(corpusPath,fname))作为内嵌:
对于enumerate(itertools.groupby((l.strip()表示infle中的l)中的i,(k,line),lambda line:bool(line)而不是line.startswith(“”),仅使用Python stdlib?使用scikit learn?您分析了代码吗?瓶颈在哪里?您采取了哪些步骤来优化它?如何“更快”"测量?我正在为您开发一个mapreduce解决方案。文件存储在哪里?您如何从文件中读取单词?单词存储在解析文件中,我使用一种方法提取每个单词并在列表中读取这些单词。文件存储在我的硬盘上。因此,我现在所做的基本上是读取解析文件中的单词,得到t请发布读取文件的代码,提取每个单词并在列表中读取这些单词。请注意,Python下的计数器
在 Python 3 下速度要快得多。@dawg 不,Counter 在 Python 3 中比在 Python 2 中快,但仍慢于内置方法:这是 Python 3.4 的结果:timeit(stmt=s1, number=1000000) → 6.76945222500035;timeit(stmt=s2, number=1000000) → 4.520381794999594
谢谢,这起作用了,必须运行一些测试,看看速度有多快!