删除python行中n个gram中的重复项

删除python行中n个gram中的重复项,python,n-gram,Python,N Gram,此代码生成n克,并显示n克的计数数。 我有一个csv文件,其中包含行和一列,每行包含字符串。 例如,当它搜索得到一个4克的“这是我的小狗”时,它也会计算它在同一行中出现的次数。 我的意图是,当它在一行中出现n-gram时,它应该在另一行中计数一次,然后再计数第二次,以此类推 e.g row Word 1 this is my puppy what this is my puppy 2 this is my puppy

此代码生成 n-gram,并显示每个 n-gram 的出现次数。我有一个 csv 文件,其中包含若干行和一列,每行包含字符串。例如,当它搜索到一个 4-gram“这是我的小狗”时,它也会把它在同一行中的重复出现都计入次数。我的意图是:某个 n-gram 无论在一行中出现多少次,都只计数一次;它在另一行出现时再计第二次,以此类推

e.g  row         Word
      1          this is my puppy what this is my puppy
      2          this is my puppy
所以这个代码把“这是我的小狗”计算为 3 次,但我希望它只计为 2 次。

这是python代码

import collections
import re
import sys
import time


def tokenize(string):
    """Lowercase *string* and return its words as a list.

    A "word" is any maximal run of \\w characters, so punctuation is
    skipped entirely.
    """
    matches = re.finditer(r'\w+', string.lower())
    return [m.group(0) for m in matches]


def count_ngrams(lines, min_length=4, max_length=5):
    """Tally n-gram frequencies over an iterable of lines.

    Returns a dict mapping each n in [min_length, max_length] to a
    collections.Counter keyed by n-gram tuples, whose values are the
    number of times that n-gram occurred. Note that the sliding window
    is NOT reset between lines, so n-grams may span line boundaries,
    and every occurrence (including repeats within a line) is counted.
    """
    lengths = range(min_length, max_length + 1)
    counters = {n: collections.Counter() for n in lengths}
    window = collections.deque(maxlen=max_length)

    # Count every n-gram that starts at the front of the current window.
    def record_window():
        snapshot = tuple(window)
        for n in lengths:
            if n <= len(snapshot):
                counters[n][snapshot[:n]] += 1

    # Slide the window over all words from all lines.
    for line in lines:
        for token in tokenize(line):
            window.append(token)
            if len(window) >= max_length:
                record_window()

    # Flush the n-grams remaining at the tail of the final window.
    while len(window) > min_length:
        window.popleft()
        record_window()

    return counters


def print_most_frequent(ngrams, num=10):
    """Print the num most common n-grams for each length in ngrams.

    ngrams is a dict mapping n-gram length to a collections.Counter of
    n-gram tuples; lengths are printed in ascending order.
    """
    for length in sorted(ngrams):
        header = '----- {} most common {}-grams -----'.format(num, length)
        print(header)
        for gram, count in ngrams[length].most_common(num):
            line = '{0}: {1}'.format(' '.join(gram), count)
            print(line)
        print('')


if __name__ == '__main__':
    # Require a filename argument. The original validated sys.argv but
    # then opened a hard-coded "PWorm.csv", silently ignoring the
    # argument; open the requested file instead.
    if len(sys.argv) < 2:
        print('Usage: python ngrams.py filename')
        sys.exit(1)

    start_time = time.time()
    with open(sys.argv[1]) as f:
        ngrams = count_ngrams(f)
    print_most_frequent(ngrams)
    elapsed_time = time.time() - start_time
    print('Took {:.03f} seconds'.format(elapsed_time))
(以下为上述程序的原文,修复了机器翻译造成的乱码,例如 “进口稀土” 实为 `import re`:)

import collections
import re
import sys
import time


def tokenize(string):
    """Convert string to lowercase and split into words (ignoring
    punctuation), returning list of words.
    """
    return re.findall(r'\w+', string.lower())


def count_ngrams(lines, min_length=4, max_length=5):
    """Iterate through given lines iterator (file object or list of
    lines) and return n-gram frequencies. The return value is a dict
    mapping the length of the n-gram to a collections.Counter
    object of n-gram tuple and number of times that n-gram occurred.
    Returned dict includes n-grams of length min_length to max_length.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    queue = collections.deque(maxlen=max_length)

    # Helper function to add n-grams at start of current queue to dict
    def add_queue():
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Loop through all lines and words and add n-grams to dict
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # Make sure we get the n-grams at the tail end of the queue
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return ngrams


def print_most_frequent(ngrams, num=10):
    """Print num most common n-grams of each length in n-grams dict."""
    for n in sorted(ngrams):
        print('----- {} most common {}-grams -----'.format(num, n))
        for gram, count in ngrams[n].most_common(num):
            print('{0}: {1}'.format(' '.join(gram), count))
        print('')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: python ngrams.py filename')
        sys.exit(1)

    start_time = time.time()
    with open("PWorm.csv") as f:
        ngrams = count_ngrams(f)
    print_most_frequent(ngrams)
    elapsed_time = time.time() - start_time
    print('Took {:.03f} seconds'.format(elapsed_time))
我们将非常感谢你的帮助。
谢谢

您可以使用
defaultdict

为了防止一行中的同一个ngram计数两次,您必须为每行生成一个ngram dict,然后将其与通用ngram dict相结合

def count_ngrams(lines, min_length=4, max_length=5):
    """Iterate through given lines iterator (file object or list of
    lines) and return n-gram frequencies. The return value is a dict
    mapping the length of the n-gram to a collections.Counter
    object of n-gram tuple and number of times that n-gram occurred.
    Returned dict includes n-grams of length min_length to max_length.

    Each distinct n-gram is counted at most once per line, no matter
    how many times it repeats within that line.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = collections.defaultdict(collections.Counter)

    # Record every n-gram at the front of the current window into the
    # per-line dict. Assignment (= 1) rather than += 1 is what makes a
    # repeated n-gram count only once within a single line.
    def add_queue(queue, ngrams_line):
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams_line[length][current[:length]] = 1

    # Merge a per-line dict of Counters into the overall dict.
    # (Bug fix: the original's parameters were `(ngram, ngramline)` but
    # its body referenced the undefined name `ngramsline` -> NameError.)
    def combine_ngrams(ngrams, ngrams_line):
        for length, counter in ngrams_line.items():
            ngrams[length] += counter
        return ngrams

    # Loop through all lines and words and add n-grams to dict.
    # Bug fix: the queue is now re-created for every line so n-grams
    # never span a line boundary, and the tail-end flush happens per
    # line so the last n-grams of each line are not dropped.
    for line in lines:
        queue = collections.deque(maxlen=max_length)
        ngrams_line = collections.defaultdict(collections.Counter)
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue(queue, ngrams_line)

        # Make sure we get the n-grams at the tail end of this line.
        while len(queue) > min_length:
            queue.popleft()
            add_queue(queue, ngrams_line)

        ngrams = combine_ngrams(ngrams, ngrams_line)

    return ngrams

我不能 100% 理解 `while len(queue) > min_length:` 之后的那部分代码,也不确定为什么 `queue` 没有在每一行都重置,因此您可能需要稍微调整我的答案。

非常感谢您的帮助。它按照我需要的方式工作。再次非常感谢