在 Python 中避免对同一行内重复出现的 n-gram 重复计数
此代码生成 n-gram,并显示每个 n-gram 的出现次数。我有一个 csv 文件,其中包含多行和一列,每行包含一个字符串。例如,当它找到一个 4-gram “this is my puppy” 时,它也会把该 n-gram 在同一行中的重复出现计算在内。我的意图是:一个 n-gram 在某一行中无论出现多少次都只计数一次;当它在另一行中再次出现时,才计数第二次,以此类推。 在 Python 中避免对同一行内重复出现的 n-gram 重复计数,python,n-gram,Python,N Gram,此代码生成 n-gram,并显示每个 n-gram 的出现次数。我有一个 csv 文件,其中包含多行和一列,每行包含一个字符串。例如,当它找到一个 4-gram “this is my puppy” 时,它也会把该 n-gram 在同一行中的重复出现计算在内。我的意图是:一个 n-gram 在某一行中无论出现多少次都只计数一次;当它在另一行中再次出现时,才计数第二次,以此类推。 e.g row Word 1 this is my puppy what this is my puppy 2 this is my puppy
e.g row Word
1 this is my puppy what this is my puppy
2 this is my puppy
所以这段代码将 “this is my puppy” 计为 3 次,但我希望结果是 2 次。
这是python代码
import collections
import re
import sys
import time
def tokenize(string):
    """Split a string into lowercase words.

    The string is lowercased and every maximal run of ``\\w`` characters
    (letters, digits and underscore) becomes one word; everything else,
    including punctuation and whitespace, acts as a separator.

    Args:
        string: The text to split.

    Returns:
        A list of lowercase words in order of appearance (empty when the
        string contains no word characters).
    """
    return re.findall(r'\w+', string.lower())
def count_ngrams(lines, min_length=4, max_length=5):
    """Count n-gram frequencies over an iterable of lines.

    Words are produced by ``tokenize`` and fed through one sliding window
    that is NOT reset between lines, so n-grams may span line boundaries
    and an n-gram repeated inside a single line is counted every time it
    occurs.

    Args:
        lines: Iterable of strings (a file object or a list of lines).
        min_length: Shortest n-gram length to count.
        max_length: Longest n-gram length to count.

    Returns:
        A dict mapping each length from min_length to max_length to a
        collections.Counter of n-gram tuple -> number of occurrences.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    # Sliding window over the most recent max_length words.
    queue = collections.deque(maxlen=max_length)

    def add_queue():
        """Count every n-gram that starts at the front of the window."""
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Once the window is full, each new word shifts it by one position and
    # the n-grams starting at its front are counted.
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # With fewer than max_length words in total the window never filled,
    # so the n-grams starting at the very first word are still uncounted.
    if len(queue) < max_length:
        add_queue()

    # Drain the window so the shorter n-grams at the tail are counted too.
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return ngrams
def print_most_frequent(ngrams, num=10):
    """Print the num most common n-grams of each length.

    Args:
        ngrams: Mapping of n-gram length -> collections.Counter keyed by
            n-gram tuples, as returned by count_ngrams.
        num: How many n-grams to print per length.

    For every length (in ascending order) a header line is printed,
    followed by one "words: count" line per n-gram and a blank line.
    """
    for n in sorted(ngrams):
        print('----- {} most common {}-grams -----'.format(num, n))
        for gram, count in ngrams[n].most_common(num):
            print('{0}: {1}'.format(' '.join(gram), count))
        print('')
if __name__ == '__main__':
    # Script entry point: count the n-grams of the file named on the
    # command line and print the most frequent ones plus the elapsed time.
    if len(sys.argv) < 2:
        print('Usage: python ngrams.py filename')
        sys.exit(1)

    start_time = time.time()
    # Open the file given as the first argument, as the usage message
    # promises, instead of a hard-coded path.
    with open(sys.argv[1]) as f:
        ngrams = count_ngrams(f)
    print_most_frequent(ngrams)
    elapsed_time = time.time() - start_time
    print('Took {:.03f} seconds'.format(elapsed_time))
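import collections
import re
import sys
import time


def tokenize(string):
    """Convert string to lowercase and split into words (ignoring
    punctuation), returning list of words.
    """
    return re.findall(r'\w+', string.lower())


def count_ngrams(lines, min_length=4, max_length=5):
    """Iterate through given lines iterator (file object or list of
    lines) and return n-gram frequencies. The return value is a dict
    mapping the length of the n-gram to a collections.Counter
    object of n-gram tuple and number of times that n-gram occurred.
    Returned dict includes n-grams of length min_length to max_length.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    queue = collections.deque(maxlen=max_length)

    # Helper function to add n-grams at start of current queue to dict
    def add_queue():
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Loop through all lines and words and add n-grams to dict
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # Make sure we get the n-grams at the tail end of the queue
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return ngrams


def print_most_frequent(ngrams, num=10):
    """Print num most common n-grams of each length in n-grams dict."""
    for n in sorted(ngrams):
        print('----- {} most common {}-grams -----'.format(num, n))
        for gram, count in ngrams[n].most_common(num):
            print('{0}: {1}'.format(' '.join(gram), count))
        print('')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: python ngrams.py filename')
        sys.exit(1)

    start_time = time.time()
    with open("PWorm.csv") as f:
        ngrams = count_ngrams(f)
    print_most_frequent(ngrams)
    elapsed_time = time.time() - start_time
    print('Took {:.03f} seconds'.format(elapsed_time))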
我们将非常感谢你的帮助。
谢谢。
您可以使用 defaultdict。
为了防止同一行中的同一个 n-gram 被重复计数,您必须为每一行单独生成一个 n-gram 字典,然后将其合并到总的 n-gram 字典中。
def count_ngrams(lines, min_length=4, max_length=5):
    """Count in how many lines each n-gram occurs.

    Every line is tokenized with ``tokenize`` and handled on its own: an
    n-gram is counted at most once per line no matter how often it repeats
    inside that line, and n-grams never span a line boundary.

    Args:
        lines: Iterable of strings (a file object or a list of lines).
        min_length: Shortest n-gram length to count.
        max_length: Longest n-gram length to count.

    Returns:
        A collections.defaultdict mapping n-gram length to a
        collections.Counter of n-gram tuple -> number of lines containing
        that n-gram. Only lengths for which at least one n-gram was found
        are present as keys.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = collections.defaultdict(collections.Counter)

    for line in lines:
        words = tokenize(line)
        for length in lengths:
            # A set collapses repeats of the same n-gram within this line,
            # so the line contributes at most 1 to each n-gram's count.
            seen_in_line = {
                tuple(words[start:start + length])
                for start in range(len(words) - length + 1)
            }
            # Skip empty sets so no empty Counter is created for a length
            # that never occurs.
            if seen_in_line:
                ngrams[length].update(seen_in_line)

    return ngrams
我不能 100% 理解 `while len(queue) > min_length:` 之后的部分,也不明白为什么 `queue`
没有在每一行开始时重置,因此您可能需要稍微调整我的答案。
非常感谢您的帮助。它按照我需要的方式工作。再次非常感谢。