Python 3.x 查找文件中最常用的单词
我有一个文件,我想找出其中最常见的10个单词。我去掉了停用词和标点符号,然后把结果放入一个列表中。每行包含一个波斯语句子、一个制表符,然后是一个英语单词。问题是,下面的代码会为每一行各返回一个单词。例如,如果文件有12行,就返回12个单词。我认为是缩进有问题。我该怎样修复它?Python 3.x 查找文件中最常用的单词,python-3.x,nlp,Python 3.x,Nlp,我有一个文件,我想找出其中最常见的10个单词。我去掉了停用词和标点符号,然后把结果放入一个列表中。每行包含一个波斯语句子、一个制表符,然后是一个英语单词。问题是,下面的代码会为每一行各返回一个单词。例如,如果文件有12行,就返回12个单词。我认为是缩进有问题。我该怎样修复它? . . . def train (): RemStopWords (file1, file2) # the function for removing stop words and punctuation at the start
.
.
.
def train ():
    """Print the 10 most frequent words found in the first column of the data.

    Each entry of ``witoutStops`` is split on a tab and only the text before
    the first tab is tokenised on whitespace. Words are tallied over the whole
    input, then printed as ``<word> <count>`` lines in descending count order
    (ties are broken by descending word order, as in the original sort).
    """
    RemStopWords (file1, file2) # the function for removing stop words and punctuation at the start of the code
    # NOTE(review): presumably RemStopWords fills the module-level
    # `witoutStops`; the name is kept exactly as the surrounding code spells it.

    # The tally must live OUTSIDE the loop: rebuilding it for every line is
    # what made the original print a separate ranking per input line.
    counts = {}
    for line in witoutStops:
        line = line.strip().split("\t")
        for word in line[0].split():
            counts[word] = counts.get(word, 0) + 1

    # (count, word) tuples sorted in reverse give the most frequent word first.
    ranked = sorted(((count, word) for word, count in counts.items()), reverse=True)
    for count, word in ranked[:10]:
        print('%s %d' % (word, count))
您可以为此使用 collections.Counter:
from collections import Counter
def train ():
    """Print a list with the 10 most common words of the first column.

    Every entry of ``withoutStops`` is stripped and split on tabs; the words
    of the part before the first tab are fed into one shared ``Counter``.
    """
    RemStopWords (file1, file2) # the function for removing stop words and punctuation at the start of the code
    tally = Counter()
    for raw_line in withoutStops:
        sentence = raw_line.strip().split("\t")[0]
        tally.update(sentence.split())
    # most_common yields (word, count) pairs; only the words are kept.
    top10 = [token for token, _ in tally.most_common(10)]
    print(top10)
基于优先级队列、映射和字典树(Trie)的C++解决方案 下面是使用优先级队列、map 和 Trie 的类似C++代码。为了简单起见,它从字符串向量中读取单词,但可以很容易地修改为从文件中读取单词
在文件或流中找出出现频率最高的 k 个单词(C++) 这是一个基于优先级队列的可运行解决方案,供您参考
#include <iostream>
#include <vector>
#include <queue>
#include <unordered_map>
using namespace std;
#define K_TH 3
class TrieNode;

/// One entry of the top-K heap: a word, its occurrence count, and a
/// non-owning back-pointer to the trie node that terminates the word.
typedef struct HeapNode
{
    std::string word;      ///< The word this entry stands for.
    int frequency;         ///< Occurrence count recorded for the word.
    TrieNode *trieNode;    ///< Back-pointer into the trie; not owned.

    /// Creates an empty entry. Every member is initialised (in declaration
    /// order) so that `trieNode` never holds an indeterminate value.
    HeapNode() : word(), frequency(0), trieNode(nullptr) {}
} HeapNode;
/// A single node of the word trie.
///
/// A node owns its child nodes (one per next character) and frees them in its
/// destructor. It also carries a leaf flag, a word, a frequency counter and a
/// non-owning pointer to a HeapNode, all managed through the accessors below.
class TrieNode
{
private:
    int frequency = 0;
    bool m_isLeaf = false;
    std::string word = "";
    std::unordered_map<char, TrieNode*> children;  // owned child nodes
    HeapNode *heapNode = nullptr;                  // not owned
public:
    TrieNode() {}

    /// Creates a node that already has one empty child for character `c`.
    TrieNode(char c)
    {
        children[c] = new TrieNode();
        this->m_isLeaf = false;
    }

    /// Frees the whole subtree below this node.
    ~TrieNode()
    {
        for (auto &entry : children)
            delete entry.second;
    }

    // The node owns raw child pointers, so a shallow copy would double-free.
    TrieNode(const TrieNode&) = delete;
    TrieNode& operator=(const TrieNode&) = delete;

    void setWord(std::string word)
    {
        this->word = word;
    }

    std::string getWord() const
    {
        return this->word;
    }

    bool isLeaf(void) const
    {
        return this->m_isLeaf;
    }

    void setLeaf(bool leaf)
    {
        this->m_isLeaf = leaf;
    }

    /// Returns the child for `c`, or a null pointer when there is none.
    /// Uses find() so that a miss does not insert an empty map entry.
    TrieNode* getChild(char c) const
    {
        const auto it = children.find(c);
        return it == children.end() ? nullptr : it->second;
    }

    /// Adds an empty child for `c`. An existing child is kept as is, so its
    /// subtree is neither leaked nor lost.
    void insert(char c)
    {
        TrieNode *&slot = children[c];
        if (slot == nullptr)
            slot = new TrieNode();
    }

    int getFrequency() const
    {
        return this->frequency;
    }

    void setFrequency(int frequency)
    {
        this->frequency = frequency;
    }

    void setHeapNode(HeapNode *heapNode)
    {
        this->heapNode = heapNode;
    }

    HeapNode* getHeapNode() const
    {
        return heapNode;
    }

    /// Comparator: true when `a` has a strictly greater frequency than `b`.
    bool operator()(HeapNode* &a, HeapNode* &b)
    {
        return (a->frequency > b->frequency);
    }
};
class Trie
{
private:
TrieNode *root = NULL;
public:
Trie()
{
if (!root)
{
this->root = new TrieNode();
}
}
TrieNode* insert(string word)
{
if (!root)
root = new TrieNode();
TrieNode* current = root;
int length = word.length();
//insert "abc"
for(int i = 0; i < length; ++i)
{
if (current->getChild(word.at(i)) == NULL)
{
current->insert(word.at(i));
}
current = current->getChild(word.at(i));
}
current->setLeaf(true);
current->setWord(word);
current->setFrequency(current->getFrequency() + 1);
return current;
}
};
struct cmp
{
bool operator()(HeapNode* &a, HeapNode* &b)
{
return (a->frequency > b->frequency);
}
};
typedef priority_queue<HeapNode*, vector<HeapNode*>, cmp > MinHeap;
/// Records one occurrence of `word` in the trie and keeps `pq` holding the
/// K_TH most frequent words seen so far.
///
/// @param root  Trie that counts every word; nothing happens when it is null.
/// @param pq    Min-heap of the current top words (least frequent on top).
/// @param word  The word that was just read.
void insertUtils(Trie *root, MinHeap &pq, string word )
{
    if (!root)
        return;
    TrieNode* current = root->insert(word);
    HeapNode *heapNode = current->getHeapNode();
    if (heapNode) // word already present in heap
    {
        // A key must not be changed while the node sits inside the
        // priority_queue, otherwise the heap order is silently broken.
        // Take the nodes out, update the count, and push them back.
        // The heap never holds more than K_TH nodes, so this stays cheap.
        std::vector<HeapNode*> nodes;
        nodes.reserve(pq.size());
        while (!pq.empty())
        {
            nodes.push_back(pq.top());
            pq.pop();
        }
        heapNode->frequency += 1;
        for (HeapNode *node : nodes)
            pq.push(node);
    }
    else if (pq.empty() || pq.size() < static_cast<std::size_t>(K_TH))
    { // word not present in heap and heap is not full
        heapNode = new HeapNode();
        heapNode->word = word;
        heapNode->frequency = 1;
        heapNode->trieNode = current;
        current->setHeapNode(heapNode);
        pq.push(heapNode);
    }
    else if (pq.top()->frequency < current->getFrequency())
    { // word not present, heap full, and the word now beats the weakest entry
        HeapNode *evicted = pq.top();
        pq.pop();
        // Detach the evicted word's trie node first; otherwise it would keep
        // a pointer to freed memory and dereference it on its next occurrence.
        if (evicted->trieNode)
            evicted->trieNode->setHeapNode(NULL);
        delete evicted;

        // The replacement entry must carry the word and its real count.
        heapNode = new HeapNode();
        heapNode->word = word;
        heapNode->frequency = current->getFrequency();
        heapNode->trieNode = current;
        current->setHeapNode(heapNode);
        pq.push(heapNode);
    }
}
void printKMostFrequentWords(vector<std::string> input)
{
Trie *root = new Trie();
MinHeap minHeap;
for (vector<string>::iterator it = input.begin(); it != input.end(); ++it)
{
insertUtils(root, minHeap, *it);
}
while(!minHeap.empty())
{
HeapNode *heapNode = minHeap.top();
cout << heapNode->word << ":" << heapNode->frequency << endl;
minHeap.pop();
}
}
int main() {
vector<std::string>input( {
"abc", "def", "ghi",
"jkl", "abc", "def",
"mno", "xyz", "abc"
} ) ;
printKMostFrequentWords(input);
}
#包括
#包括
#包括
#包括
使用名称空间std;
#定义K_TH 3
类三元组;
类型定义结构HeapNode
{
字符串字;
整数频率;
HeapNode():频率(0),单词(“”{};
三元组*三元组;
}HeapNode;
类三节点
{
私人:
整数频率=0;
bool m_isLeaf=false;
字串=”;
无序映射子对象;
HeapNode*HeapNode=NULL;
公众:
三节点(){}
三节点(字符c)
{
儿童[c]=新三元组();
此->m_isLeaf=false;
}
无效设置字(字符串字)
{
这个->字=字;
}
字符串getWord()
{
返回此->单词;
}
布尔岛(空)
{
返回此->m_isLeaf;
}
无效设置叶(布尔叶)
{
这->m_isLeaf=leaf;
}
三元组*getChild(字符c)
{
if(children[c]!=NULL)
返回儿童[c];
返回NULL;
}
空白插入(字符c)
{
儿童[c]=新三元组();
}
int getFrequency()
{
返回此->频率;
}
无效设置频率(整数频率)
{
这个->频率=频率;
}
void setHeapNode(HeapNode*HeapNode)
{
此->heapNode=heapNode;
}
HeapNode*getHeapNode()
{
返回heapNode;
}
布尔运算符()(HeapNode*&a、HeapNode*&b)
{
返回(a->frequency>b->frequency);
}
};
三类
{
私人:
三节点*根=空;
公众:
Trie()
{
如果(!root)
{
此->根=新的三节点();
}
}
三节点*插入(字符串字)
{
如果(!root)
根=新的三节点();
三极*电流=根;
int length=word.length();
//插入“abc”
对于(int i=0;igetChild(word.at(i))==NULL)
{
当前->插入(word.at(i));
}
current=current->getChild(word.at(i));
}
当前->设置叶(真);
当前->设置字(字);
当前->设置频率(当前->获取频率()+1);
回流;
}
};
结构cmp
{
布尔运算符()(HeapNode*&a、HeapNode*&b)
{
返回(a->frequency>b->frequency);
}
};
typedef优先级_队列MinHeap;
void insertUtils(Trie*root、MinHeap和pq、字符串字)
{
如果(!root)
返回;
三节点*当前=根->插入(字);
HeapNode*HeapNode=current->getHeapNode();
if(heapNode)//如果堆中已存在单词
{
heapNode->频率+=1;
}else if(pq.empty()| | pq.size()word=word;
heapNode->frequency=1;
heapNode->trieNode=当前;
当前->设置heapNode(heapNode);
pq.push(heapNode);
}否则如果(pq.top()->频率<当前->获取频率())
{//如果word不存在且堆已满;
HeapNode*temp=pq.top();
//删除第一个元素并添加当前单词
pq.pop();
删除临时文件;
heapNode=新的heapNode();
当前->设置heapNode(heapNode);
pq.push(heapNode);
}
}
无效printKMostFrequentWords(矢量输入)
{
Trie*root=新的Trie();
小堆小堆;
for(vector::iterator it=input.begin();it!=input.end();++it)
{
insertUtils(root、minHeap、*it);
}
而(!minHeap.empty())
{
HeapNode*HeapNode=minHeap.top();
这个方法可行,但我只需要这些单词本身。我怎样才能把这些单词添加到列表中呢?谢谢。 我已经把它添加到我的答案中了,Sarah