用Python快速计算频率

用Python快速计算频率。我需要计算语料库中单词的频率。通常我使用collections包中的Counter类 from collections import Counter list_of_words = ['one', 'two', 'three', 'three'] freqs = Counter(list_of_words) 然而,我正在分析的语料库由数百万个单词组成,因此如果有一种更快的方法来计算这些分数,那该多好啊 以下是读取文件的代码: from read_cg3 import read_cg3

我需要计算语料库中单词的频率。通常我使用collections包中的Counter类

# Counter tallies how many times each element occurs in the iterable.
from collections import Counter
list_of_words = ['one', 'two', 'three', 'three']
# freqs maps each word to its occurrence count, e.g. freqs['three'] == 2.
freqs = Counter(list_of_words)
然而,我正在分析的语料库由数百万个单词组成,因此如果有一种更快的方法来计算这些分数,那该多好啊

以下是我使用的代码:

from read_cg3 import read_cg3

# Read one parsed corpus file; read_cg3 returns a list of sentences,
# each sentence being a list of word tokens.
test = read_cg3('/Users/arashsaidi/Work/Corpus/DUO_Corpus/Bokmaal-tagged-random/DUO_BM_0.txt')

# Flatten the sentences into a single word list and count the tokens.
word_list = []
for sentence in test:
    word_list.extend(sentence)
count = len(word_list)

# print() works under both Python 2 and Python 3; the original
# `print count` statement is a SyntaxError on Python 3.
print(count)
read_cg3是一个模块,用于读取解析文件并返回句子列表。以下是模块:

import re


def is_number(s):
    """Return True when *s* parses as a float, False otherwise."""
    try:
        float(s)
    except ValueError:
        return False
    return True


def read_cg3(cg3_file):
    """
    Read a cg3-parsed file and return its sentences.

    Each sentence is a list of lower-cased word tokens; sentences are
    separated in the input by blank lines.

    :param cg3_file: path to the cg3 file
    :return: list of sentences (each a list of word strings)
    """
    # Raw strings so the regex escapes are not interpreted by Python first.
    rx_token = re.compile(r'^"<(.+?)>"$')
    rx_attributes = re.compile(r'^\s+".+?"\s+.+$')
    rx_eos = re.compile(r'^\s*$')

    def _is_numeric(s):
        # Local equivalent of the module-level is_number(); keeps this
        # function self-contained.
        try:
            float(s)
            return True
        except ValueError:
            return False

    curr_token = None
    curr_word = []
    curr_sentence = []
    result = []

    # Use a distinct name for the file object: the original shadowed the
    # `cg3_file` path argument, so the cleanup messages below printed a
    # closed file object instead of the path.
    with open(cg3_file) as infile:
        for line in infile:
            if rx_token.match(line):
                curr_token = '"%s"' % rx_token.match(line).group(1)

            if rx_attributes.match(line):
                curr_word = line.split()
                if curr_token and curr_word:
                    # Keep only real words: skip '$' punctuation markers,
                    # pure numbers, and absurdly long tokens (>= 30 chars).
                    if ('$' not in curr_word[0]
                            and not _is_numeric(curr_word[0].strip('"').replace('.', ''))
                            and len(curr_word[0]) < 30):
                        curr_sentence.append(curr_word[0].lower().strip('"'))
                    curr_token = None
                    curr_word = []

            if rx_eos.match(line):
                # Blank line marks the end of a sentence.
                if curr_sentence:
                    result.append(curr_sentence)
                curr_sentence = []
                curr_token = None
                curr_word = []

    # Cleanup when the file does not end with a blank line.
    if curr_token and curr_word:
        print('cg3 reached end of file and did some cleanup on file {}'.format(cg3_file))
        # Bug fix: append the word in the same plain-string format used
        # above (the original appended a nested [token] + attrs list).
        curr_sentence.append(curr_word[0].lower().strip('"'))

    if curr_sentence:
        print('cg3 reached end of file and did some cleanup on file {}'.format(cg3_file))
        # Bug fix: the original did `result += curr_sentence`, which
        # extended result with the individual words of the last sentence
        # instead of appending the sentence as one list.
        result.append(curr_sentence)

    return result
import re


def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False


def read_cg3(cg3_file):
    """
    Reads a cg3 file and returns a list of each sentence with Token, parsed, and one tag
    :param cg3_file: path to file
    :return: list of words + attributes
    """
    rx_token = re.compile("^\"<(.+?)>\"$")
    rx_attributes = re.compile("^\s+\".+?\"\s+.+$")
    rx_eos = re.compile("^\s*$")

    curr_token = None
    curr_word = []
    curr_sentence = []
    result = []

    with open(cg3_file) as cg3_file:
        for line in cg3_file:

            if rx_token.match(line):
                curr_token = "\"%s\"" % rx_token.match(line).group(1)

            if rx_attributes.match(line):
                curr_word = line.split()
                if curr_token and curr_word:
                    if '$' not in curr_word[0] and not is_number(curr_word[0].strip('"').replace('.', '')) \
                            and len(curr_word[0]) < 30:
                        curr_sentence += [curr_word[0].lower().strip('"')]
                    curr_token = None
                    curr_word = []

            if rx_eos.match(line):
                if curr_sentence:
                    result += [curr_sentence]
                curr_sentence = []
                curr_token = None
                curr_word = []

    if curr_token and curr_word:
        print 'cg3 reached end of file and did some cleanup on file {}'.format(cg3_file)
        curr_sentence += [[curr_token] + curr_word]

    if curr_sentence:
        print 'cg3 reached end of file and did some cleanup on file {}'.format(cg3_file)
        result += curr_sentence

    return result
以下是read_cg3读取文件的方式:

"<TEKNOLOGI>"
    "teknologi" subst appell mask ub ent 
"<OG>"
    "og" konj <*> 
"<UNDERVISNING>"
    "undervisning" subst appell fem ub ent <*> 
"<|>"
    "$|" clb <overskrift> <<< 

"<En>"
    "en" det mask ent kvant 
"<intervjuunders¯kelse>"
    "intervjuunders¯kelse" subst appell mask ub ent 
"<av>"
    "av" prep 
"<musikklÊreres>"
    "musikklÊrer" subst appell mask ub fl gen 
"<didaktiske>"
    "didaktisk" adj fl pos 
"<bruk>"
    "bruk" subst appell mask ub ent 
"<av>"
    "av" prep 
"<digitale>"
    "digital" adj fl pos 
"<verkt¯y>"
    "verkt¯y" subst appell n¯yt ub fl <*¯y> 
"<i>"
    "i" prep 
"<undervisningsfaget>"
    "undervisningsfag" subst appell n¯yt be ent 
"<komposisjon>"
    "komposisjon" subst appell mask ub ent 
"<i>"
    "i" prep 
"<videregÂende>"
    "videregÂende" adj ub m/f ent pos 
"<skole>"
    "skole" subst appell mask ub ent 
"<|>"
    "$|" clb <overskrift> <<< 

"<Markus>"
    "Markus" subst prop mask 
"<A.>"
    "A." subst prop fork <*> 
"<SkjÊrstad>"
    "SkjÊrstad" subst prop <*stad> <*> 
"<|>"
    "$|" clb <overskrift> <<< 
"<TEKNOLOGI>"
    "teknologi" subst appell mask ub ent
"<OG>"
    "og" konj <*>
"<UNDERVISNING>"
    "undervisning" subst appell fem ub ent <*>
"<|>"
    "$|" clb <overskrift> <<<

您可以使用用C编译的内置
count
函数:

dict((i,test_list.count(i)) for i in set(test_list))
为了更好地理解,您可以查看以下基准测试:

from timeit import timeit

# Benchmark: collections.Counter vs. a dict built with list.count
# over a small list of integers.
s1="""l=[1, 1, 1, 2, 3, 4, 1, 2, 5, 7, 2, 3]
from collections import Counter
Counter(l)"""
s2="""l=[1, 1, 1, 2, 3, 4, 1, 2, 5, 7, 2, 3]
dict((i,l.count(i)) for i in set(l))"""

# print() calls work on both Python 2 and 3; the original used
# Python-2-only print statements, which are a SyntaxError on Python 3.
print('using Counter : {}'.format(timeit(stmt=s1, number=1000000)))
print('using built-in : {}'.format(timeit(stmt=s2, number=1000000)))
结果:

using Counter :  8.78281712532
using built-in :  2.91788387299

看起来您不需要使用令牌,而且您的正则表达式可以被删除。这将计算每个单词在每个文件中出现的次数:

import multiprocessing as mp
import os
import itertools

def wordCounter(qIn, qOut):
    """Worker: drain (fname, words) pairs from qIn until a None sentinel,
    tally per-file word counts, then put the resulting dict on qOut."""
    counts = {}
    for fname, words in iter(qIn.get, None):
        per_file = counts.setdefault(fname, {})
        for w in words:
            per_file[w] = per_file.get(w, 0) + 1
    qOut.put(counts)


def getLines(corpusPath, qIn, numProcs):
    """Producer: for every file in corpusPath, stream runs of attribute
    lines (cleaned word tokens) onto qIn, then send one None sentinel
    per consumer process."""
    def is_word_line(line):
        # True for non-blank attribute lines that are not '"<token>"'
        # headers and whose first field is not a '$' punctuation marker.
        return bool(line) and not line.startswith('"<') and "$" not in line.split(None, 1)[0]

    for fname in os.listdir(corpusPath):
        with open(os.path.join(corpusPath, fname)) as infile:
            stripped = (raw.strip() for raw in infile)
            for keep, run in itertools.groupby(stripped, key=is_word_line):
                if keep:
                    words = [l.split(None, 1)[0].strip('"').strip().lower() for l in run]
                    qIn.put((fname, words))

    for _ in range(numProcs):
        qIn.put(None)


def main(corpusPath):
    """Count per-file word frequencies under corpusPath using one
    producer process and cpu_count()-1 counting workers, then print
    the merged, sorted results."""
    qIn = mp.Queue()
    qOut = mp.Queue()
    nWorkers = mp.cpu_count() - 1
    workers = [mp.Process(target=wordCounter, args=(qIn, qOut))
               for _ in range(nWorkers)]

    producer = mp.Process(target=getLines, args=(corpusPath, qIn, nWorkers))
    producer.start()

    for w in workers:
        w.start()

    # Merge one partial result per worker into a single nested dict:
    # {fname: {word: count}}.
    totals = {}
    for _ in workers:
        for fname, wdict in qOut.get().items():
            fileTotals = totals.setdefault(fname, {})
            for word, cnt in wdict.items():
                fileTotals[word] = fileTotals.get(word, 0) + cnt

    for fname in sorted(totals):
        for word in sorted(totals[fname]):
            print("{} appeared in {} {} times".format(word, fname, totals[fname][word]))

    for w in workers:
        w.terminate()
    producer.terminate()
import multiprocessing as mp
import os
import itertools

def wordCounter(qIn, qOut):
    answer = {}
    for fname, words in iter(qIn.get, None):
        for word in words:
            if fname not in answer:
                answer[fname] = {}
            if word not in answer[fname]:
                answer[fname][word] = 0
            answer[fname][word] += 1
    qOut.put(answer)


def getLines(corpusPath, qIn, numProcs):
    for fname in os.listdir(corpusPath):
        with open(os.path.join(corpusPath, fname)) as infile:
            for i, (k, lines) in enumerate(itertools.groupby((l.strip() for l in infile), lambda line: bool(line) and not line.startswith('"<') and "$" not in line.split(None, 1)[0])):
                if not k:
                    continue
                qIn.put((fname, [line.split(None, 1)[0].strip('"').strip().lower() for line in lines]))

    for _ in range(numProcs):
        qIn.put(None)

仅使用Python标准库?还是使用scikit-learn?您分析过代码吗?瓶颈在哪里?您采取了哪些步骤来优化它?“更快”如何衡量?我正在为您开发一个mapreduce解决方案。文件存储在哪里?您如何从文件中读取单词?单词存储在解析文件中,我使用一种方法提取每个单词并将这些单词读入列表。文件存储在我的硬盘上。因此,我现在所做的基本上是读取解析文件中的单词。请发布读取文件的代码,
提取每个单词并在列表中读取这些单词。请注意,Python下的
计数器
速度要快得多3@dawg不,它比Python2快,但不是内置的:这是Python3.4的结果(stmt=s1,number=1000000)6.76945222500035>>timeit(stmt=s2,number=1000000)4.520381794999594
谢谢,这起作用了,必须运行一些测试,看看速度有多快!