Python code to count the number of sentences, words, and characters in an input file
I wrote the following code to count the number of sentences, words, and characters in the input file sample.txt, which contains a paragraph of text. It works fine for the number of sentences and words, but it does not give the precise and correct number of characters (without whitespace and punctuation):

textf.close()
print '-'*50
print "Lines:", lines
print "Blank lines:", blanklines
print "Sentences:", sentences
print "Words:", words

import nltk
import nltk.data
import nltk.tokenize

with open('sample.txt', 'r') as f:
    for line in f:
        num_chars += len(line)

num_chars = num_chars - (words + 1)

pcount = 0
from nltk.tokenize import TreebankWordTokenizer
with open('sample.txt', 'r') as f1:
    for line in f1:
        #tokenized_words = nltk.tokenize.word_tokenize(line)
        tokenizer = TreebankWordTokenizer()
        tokenized_words = tokenizer.tokenize(line)
        for w in tokenized_words:
            if ((w == '.') | (w == ';') | (w == '!') | (w == '?')):
                pcount = pcount + 1

print "pcount:", pcount
num_chars = num_chars - pcount
print "characters:", num_chars
pcount is the number of punctuation marks. Can somebody suggest what changes I need to make in order to find out the exact number of characters without spaces and punctuation?

One thing you can do is, as you read each line, iterate over it and increment per-character counts:
import string

#
# Per-line counting functions
#
def countLines(ln): return 1
def countBlankLines(ln): return 0 if ln.strip() else 1
def countWords(ln): return len(ln.split())

def charCounter(validChars):
    vc = set(validChars)
    def counter(ln):
        return sum(1 for ch in ln if ch in vc)
    return counter

countSentences = charCounter('.!?')
countLetters = charCounter(string.ascii_letters)   # string.letters in Python 2
countPunct = charCounter(string.punctuation)

#
# do counting
#
class FileStats(object):
    def __init__(self, countFns, labels=None):
        super(FileStats, self).__init__()
        self.fns = countFns
        self.labels = labels if labels else [fn.__name__ for fn in countFns]
        self.reset()

    def reset(self):
        self.counts = [0] * len(self.fns)

    def doFile(self, fname):
        try:
            with open(fname) as inf:
                for line in inf:
                    for i, fn in enumerate(self.fns):
                        self.counts[i] += fn(line)
        except IOError:
            print('Could not open file {0} for reading'.format(fname))

    def __str__(self):
        return '\n'.join(
            '{0:20} {1:>6}'.format(label, count)
            for label, count in zip(self.labels, self.counts)
        )

fs = FileStats(
    (countLines, countBlankLines, countSentences, countWords, countLetters, countPunct),
    ("Lines", "Blank Lines", "Sentences", "Words", "Letters", "Punctuation")
)
fs.doFile('sample.txt')
print(fs)
Also, you may want to change the character sets to fit your special needs, for example if you want to count '$' as well.
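As a minimal sketch of that tweak, reusing the charCounter helper from the answer above (repeated here so the sketch is self-contained), counting '$' just means adding it to the character set:

```python
import string

# charCounter as defined in the answer above
def charCounter(validChars):
    vc = set(validChars)
    def counter(ln):
        return sum(1 for ch in ln if ch in vc)
    return counter

# count '$' on its own; note that string.punctuation already includes '$',
# so countPunct above picks it up without any change
countDollars = charCounter('$')
print(countDollars("lunch was $8, coffee was $3"))  # 2
```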
You could also use a regular expression to replace all non-alphanumeric characters, and then count the number of characters in each line.
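A minimal sketch of that regex approach (assuming "characters" means ASCII letters and digits):

```python
import re

def count_chars(line):
    # remove everything that is not an ASCII letter or digit, count the rest
    return len(re.sub(r'[^A-Za-z0-9]', '', line))

print(count_chars("It works fine, mostly!"))  # 17
```

Summing count_chars over the lines of sample.txt then gives the character count without spaces or punctuation.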
Running this against a sample file results in:

Lines                   101
Blank Lines              12
Sentences                48
Words                   339
Letters                1604
Punctuation             455
Another approach uses NLTK's word and sentence tokenizers directly:

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

with open("..//..//static//output.txt", "r") as text_file:
    lines = text_file.readlines()

x = 0
tokenized_words = [word_tokenize(i) for i in lines]
for i in tokenized_words:
    print(i)            # list of tokens for this line
    print(str(len(i)))  # word count for this line
    for j in i:
        if j == 'words':  # simple counter for occurrences of the token 'words'
            x = x + 1

tokenized_sents = [sent_tokenize(k) for k in lines]
for k in tokenized_sents:
    print("Sentences: " + str(k))                # list of sentences for this line
    print("number of sentences " + str(len(k)))

print("number of 'words' " + str(x))
# note: len(i) here is the token count of the last line only
print("Probability of 'words' in text file " + str(x / len(i)))
This tries to count the number of words and sentences, and to get the probability of a given word occurring in the file.
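Note that the probability line above divides by the token count of only the last line, which is usually not what you want. A stdlib-only sketch of the same idea, dividing by the total token count (using whitespace splitting instead of NLTK, as an assumption):

```python
from collections import Counter

text = "the cat sat on the mat"   # stand-in for the file contents
tokens = text.split()             # crude whitespace tokenization
counts = Counter(tokens)

# relative frequency of one word across all tokens in the file
word = "the"
prob = counts[word] / len(tokens)
print(prob)  # 2 occurrences out of 6 tokens -> 0.333...
```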
Is this homework? If not, I'm pretty sure you could get this answer with just a couple of lines of shell script.
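For reference, the shell version that comment alludes to might look like this (a sketch assuming POSIX tools; the sentence count is only a rough approximation based on terminal punctuation):

```shell
file=sample.txt
wc -l < "$file"                                 # lines
wc -w < "$file"                                 # words
grep -o '[.!?]' "$file" | wc -l                 # rough sentence count
tr -d '[:space:][:punct:]' < "$file" | wc -c    # characters, no spaces or punctuation
```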