C++11:用 C++ 实现文本分类的朴素贝叶斯分类器
标签:c++11, naivebayes。我正在编写一段代码,实现用于文本分类的朴素贝叶斯分类器。我用一个很小的例子测试过,它看起来能正常工作,但我想知道这个实现是否正确,它是否也适用于其他训练集和测试集?我并不是要实现一个商业级的朴素贝叶斯,这只是一个用来学习 C++ 的小练习。我还想知道代码写得怎么样:我编写代码的方式是否符合良好的 C++ 实践?我知道还有很多可以改进的地方,比如现在我只测试一个测试文件,将来希望能够测试多个文件;另外目前只做二分类,将来可能会做多分类。除此之外,代码方面还有其他可以改进的地方吗?以下是代码,NB.h 文件:
#pragma once
#include<iostream>
#include<fstream>
#include<string>
#include<vector>
#include<map>
using namespace std;
// Word-count model for one text corpus, used as a building block of a
// two-class naive Bayes text classifier.
//
// An NB object is built in one of three ways (see the constructors): from a
// set of training files for one class, from a single file to be classified,
// or as the union of two class objects. All constructors perform file I/O and
// the file-name constructors also prompt on standard input.
//
// The class holds stream members, so objects cannot be copied.
class NB
{
public:
    // Builds the combined corpus of cl1 and cl2, writes it to
    // "combineAll.txt" and counts its words with isTotal == true.
    NB(NB& cl1, NB& cl2, std::string className);

    // Reads file names from standard input until "q" is entered, registers
    // them under className, then merges and counts their words.
    NB(std::string className);

    // Reads one file name from standard input, registers it under className,
    // then merges and counts its words. classType is stored as given.
    NB(std::string className, int classType);

    // Concatenates the words of every registered file into
    // "<className>.txt" and returns them in reading order.
    std::vector<std::string> combineClassText();

    // Counts the words found in the file classCombine and writes the counts
    // to "<className>_bow.txt". With isTotal set the number of distinct words
    // is added to the vocabulary size, otherwise the word total is updated.
    void bagOfWords(std::string classCombine, bool isTotal = false);

    // Computes a smoothed probability for every word of total's bag of words
    // and writes the table to "<className>_prob.txt".
    void calcProb(NB& total);

    // Scores this object's words against the probabilities stored in prob,
    // using total for the vocabulary size and the document total.
    float totalProb(NB& prob, NB& total);

    // Caller-defined tag. Only the (className, classType) constructor sets
    // it, so it is given a defined value here for the other constructors.
    int classType = 0;

private:
    int _len = 0;               // number of words held in _combined
    float _prob = 1.0f;         // result of the last totalProb() call
    int _voc = 0;               // distinct-word count (filled when isTotal)
    int _nOfClass = 0;          // number of files entered for this class
    int _tnClass = 0;           // number of files, summed for the total object
    int _totalWordsinC = 0;     // sum of all word counts in _bow
    int _wordCounter = 0;       // scratch counter used by totalProb()
    bool _isDone = false;       // set once "q" ends the file-name prompt
    std::ifstream _in;          // reused for every input file
    std::ofstream _out;         // reused for every output file
    //string _classCombine;
    std::string _className;     // also the stem of all generated file names
    std::string _fileName;      // last file name read from standard input
    std::vector<std::string> _combined;            // words of both classes
    std::map<std::string, std::string> _category;  // file name -> class name
    std::map<std::string, int> _bow;               // word -> occurrence count
    std::map<std::string, float> _probCalc;        // word -> probability
};
#include "NB.h"
#include<cmath>
// Builds the "total" object from two class objects.
//
// The words of cl1 followed by the words of cl2 are stored in _combined and
// written, one per line, to "combineAll.txt". That file is then counted with
// isTotal == true so that _voc receives the number of distinct words.
// _tnClass becomes the sum of both classes' document counts.
//
// cl1, cl2   class objects whose registered files are re-read
// className  name of this object, used as stem for generated files
NB::NB(NB& cl1, NB& cl2, string className)
{
    _className = className;
    _out.open("combineAll.txt");
    if (_out.fail()) {
        perror("cannot write to combineAll.txt");
    }

    // combineClassText() re-reads every input file and rewrites the class
    // file on each call, so call it exactly once per class instead of once
    // per word.
    const vector<string> firstWords = cl1.combineClassText();
    const vector<string> secondWords = cl2.combineClassText();
    _combined.insert(_combined.end(), firstWords.begin(), firstWords.end());
    _combined.insert(_combined.end(), secondWords.begin(), secondWords.end());
    _len = static_cast<int>(_combined.size());

    for (const string& word : _combined) {
        _out << word << '\n';
    }
    _out.close();

    _tnClass = cl1._tnClass + cl2._tnClass;
    bagOfWords("combineAll.txt", true);
}
// Builds an object from a single file whose name is read from standard input.
//
// The file is registered under className, merged into "<className>.txt" and
// counted. The document counters are left untouched by this constructor.
//
// className  name of this object, used as stem for generated files
// classType  caller-defined tag, stored in the public member of the same name
NB::NB(string className, int classType)
    : classType(classType), _className(className)
{
    cout << "Enter a filename for " + _className << endl;
    cin >> _fileName;

    _category[_fileName] = _className;
    combineClassText();

    const string mergedFile = _className + ".txt";
    bagOfWords(mergedFile);
}
// Builds a training class from files named on standard input.
//
// The user is prompted repeatedly; every name other than "q" is registered
// under className and counted as one document in _nOfClass and _tnClass.
// Entering "q" ends the prompt. The registered files are then merged into
// "<className>.txt" and counted.
//
// className  name of this class, used as stem for generated files
NB::NB(string className)
{
    _className = className;
    while (!_isDone) {
        cout << "Enter a filename for " + _className << endl;
        // A failed read (end of input, closed stream) leaves _fileName
        // unchanged; treat it like "q" so the loop cannot spin forever and
        // inflate the document counters.
        if (!(cin >> _fileName) || _fileName == "q") {
            _isDone = true;
        } else {
            _category[_fileName] = _className;
            _nOfClass++;
            _tnClass++;
        }
    }
    combineClassText();
    bagOfWords(_className + ".txt");
}
// Merges all files registered in _category into "<className>.txt".
//
// Each file is read as whitespace-separated words; every word is written on
// its own line of the merged file and collected in the returned vector, in
// the iteration order of _category. A file that cannot be opened is reported
// with perror and contributes no words.
//
// Returns the words of all registered files.
vector<string> NB::combineClassText() {
    const string mergedPath = _className + ".txt";
    vector<string> words;
    string word;

    _out.open(mergedPath);
    if (_out.fail()) {
        perror("cannot write to");
    }

    for (const auto& entry : _category) {
        // The key of _category is the input file name.
        _in.open(entry.first);
        if (_in.fail()) {
            perror("cannot read from");
        }
        while (_in >> word) {
            _out << word << endl;
            words.push_back(word);
        }
        _in.close();
    }

    _out.close();
    return words;
}
// Counts the words of a text file into _bow and dumps the counts.
//
// The file is read as whitespace-separated tokens. '.' and ',' act as word
// separators, so "good." counts as "good" and "a,b" counts as "a" and "b";
// a token made only of punctuation contributes nothing. (Previously the
// punctuation was turned into a space inside the token, which produced
// distinct words such as "good ".)
//
// Counts for words found in this file replace any existing entry in _bow.
// All entries of _bow are then written to "<className>_bow.txt" as
// "word: count" lines.
//
// classCombine  path of the file to count
// isTotal       true:  add the number of distinct words to _voc
//               false: add the sum of all counts to _totalWordsinC
void NB::bagOfWords(string classCombine, bool isTotal) {
    _in.open(classCombine);
    if (_in.fail()) {
        perror("cannot read from");
    }
    const string name = _className + "_bow.txt";
    _out.open(name);
    if (_out.fail()) {
        perror("cannot write to");
    }

    // Count in a single pass; the map does the duplicate detection.
    map<string, int> counts;
    string token;
    while (_in >> token) {
        string word;
        for (const char ch : token) {
            if (ch == '.' || ch == ',') {
                if (!word.empty()) {
                    ++counts[word];
                    word.clear();
                }
            } else {
                word += ch;
            }
        }
        if (!word.empty()) {
            ++counts[word];
        }
    }

    // Assign rather than add, so a repeated call overwrites the counts of
    // words seen in this file instead of accumulating them.
    for (const auto& entry : counts) {
        _bow[entry.first] = entry.second;
    }

    for (const auto& entry : _bow) {
        _out << entry.first << ": " << entry.second << '\n';
    }

    if (isTotal) {
        _voc += static_cast<int>(_bow.size());
    } else {
        for (const auto& entry : _bow) {
            _totalWordsinC += entry.second;
        }
    }

    _in.close();
    _out.close();
}
// Computes the add-one smoothed word probabilities of this class.
//
// For every word w in total's bag of words:
//     _probCalc[w] = (count of w in this class + 1)
//                    / (_totalWordsinC + total._voc)
// where the count is 0 for words this class never saw. The resulting table
// is written to "<className>_prob.txt" as "word: probability" lines.
//
// total  object holding the combined vocabulary (_bow and _voc are read)
void NB::calcProb(NB& total) {
    _out.open(_className + "_prob.txt");
    if (_out.fail()) {
        perror("cannot write to");
    }

    const int denominator = _totalWordsinC + total._voc;
    for (const auto& vocabEntry : total._bow) {
        // Direct lookup instead of scanning the whole map; this also covers
        // an empty _bow, for which every word gets the smoothed minimum.
        const auto found = _bow.find(vocabEntry.first);
        const int countInClass = (found != _bow.end()) ? found->second : 0;
        _probCalc[vocabEntry.first] =
            static_cast<float>(countInClass + 1) / denominator;
    }

    for (const auto& entry : _probCalc) {
        _out << entry.first << ": " << entry.second << '\n';
    }
    _out.close();
}
// Scores this object's words against one class.
//
// Starting from 1, the score is multiplied
//   - by P(w)^count for every word w of _bow that has an entry in
//     prob._probCalc, and
//   - once by 1 / (prob._totalWordsinC + total._voc) for every distinct word
//     that has no entry,
// and finally by the class prior prob._nOfClass / total._tnClass. The result
// is stored in _prob, printed to standard output and written to
// "<className>_<prob className>_prob.txt".
//
// NOTE(review): the product is kept in a float and will underflow to 0 for
// long inputs; summing logarithms would avoid that but changes the meaning of
// the returned value.
//
// prob   class whose probability table is used
// total  combined object supplying vocabulary size and document total
// Returns the score described above.
float NB::totalProb(NB& prob, NB& total) {
    _out.open(_className + "_" + prob._className + "_prob.txt");
    if (_out.fail()) {
        perror("cannot write to");
    }

    const float unseenWordProb =
        static_cast<float>(1) / (prob._totalWordsinC + total._voc);

    _prob = 1.0f;
    for (const auto& docEntry : _bow) {
        // Look the word up directly. The earlier miss counter was not reset
        // after an unseen word, so following unseen words were skipped.
        const auto known = prob._probCalc.find(docEntry.first);
        if (known != prob._probCalc.end()) {
            _prob = static_cast<float>(
                _prob * std::pow(known->second, docEntry.second));
        } else {
            _prob = _prob * unseenWordProb;
        }
    }
    _wordCounter = 0;

    _prob = _prob * (static_cast<float>(prob._nOfClass) / total._tnClass);
    cout << _prob << endl;
    _out << "The probalility of the " << _className << " beloning to " << prob._className << " is: " << _prob << endl;
    _out.close();
    return _prob;
}
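#pragma once
#include<iostream>
#include<fstream>
#include<string>
#include<vector>
#include<map>
using namespace std;
class NB
{
public:
NB(NB& cl1, NB& cl2, string className);
NB(string className);
NB(string className, int classType);
vector <string> combineClassText();
void bagOfWords(string classCombine, bool isTotal = false);
void calcProb(NB& total);
float totalProb(NB& prob, NB& total);
int classType;
private:
int _len = 0;
float _prob = 1.0f;
int _voc = 0;
int _nOfClass = 0;
int _tnClass = 0;
int _totalWordsinC = 0;
int _wordCounter = 0;
bool _isDone = false;
ifstream _in;
ofstream _out;
//string _classCombine;
string _className;
string _fileName;
vector <string> _combined;
map<string, string> _category;
map<string, int> _bow;
map<string, float> _probCalc;
};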
NB.cpp文件:
#pragma once
#include<iostream>
#include<fstream>
#include<string>
#include<vector>
#include<map>
using namespace std;
// Word-count model for one text corpus, used as a building block of a
// two-class naive Bayes text classifier.
//
// An NB object is built in one of three ways (see the constructors): from a
// set of training files for one class, from a single file to be classified,
// or as the union of two class objects. All constructors perform file I/O and
// the file-name constructors also prompt on standard input.
//
// The class holds stream members, so objects cannot be copied.
class NB
{
public:
    // Builds the combined corpus of cl1 and cl2, writes it to
    // "combineAll.txt" and counts its words with isTotal == true.
    NB(NB& cl1, NB& cl2, std::string className);

    // Reads file names from standard input until "q" is entered, registers
    // them under className, then merges and counts their words.
    NB(std::string className);

    // Reads one file name from standard input, registers it under className,
    // then merges and counts its words. classType is stored as given.
    NB(std::string className, int classType);

    // Concatenates the words of every registered file into
    // "<className>.txt" and returns them in reading order.
    std::vector<std::string> combineClassText();

    // Counts the words found in the file classCombine and writes the counts
    // to "<className>_bow.txt". With isTotal set the number of distinct words
    // is added to the vocabulary size, otherwise the word total is updated.
    void bagOfWords(std::string classCombine, bool isTotal = false);

    // Computes a smoothed probability for every word of total's bag of words
    // and writes the table to "<className>_prob.txt".
    void calcProb(NB& total);

    // Scores this object's words against the probabilities stored in prob,
    // using total for the vocabulary size and the document total.
    float totalProb(NB& prob, NB& total);

    // Caller-defined tag. Only the (className, classType) constructor sets
    // it, so it is given a defined value here for the other constructors.
    int classType = 0;

private:
    int _len = 0;               // number of words held in _combined
    float _prob = 1.0f;         // result of the last totalProb() call
    int _voc = 0;               // distinct-word count (filled when isTotal)
    int _nOfClass = 0;          // number of files entered for this class
    int _tnClass = 0;           // number of files, summed for the total object
    int _totalWordsinC = 0;     // sum of all word counts in _bow
    int _wordCounter = 0;       // scratch counter used by totalProb()
    bool _isDone = false;       // set once "q" ends the file-name prompt
    std::ifstream _in;          // reused for every input file
    std::ofstream _out;         // reused for every output file
    //string _classCombine;
    std::string _className;     // also the stem of all generated file names
    std::string _fileName;      // last file name read from standard input
    std::vector<std::string> _combined;            // words of both classes
    std::map<std::string, std::string> _category;  // file name -> class name
    std::map<std::string, int> _bow;               // word -> occurrence count
    std::map<std::string, float> _probCalc;        // word -> probability
};
#include "NB.h"
#include<cmath>
// Builds the "total" object from two class objects.
//
// The words of cl1 followed by the words of cl2 are stored in _combined and
// written, one per line, to "combineAll.txt". That file is then counted with
// isTotal == true so that _voc receives the number of distinct words.
// _tnClass becomes the sum of both classes' document counts.
//
// cl1, cl2   class objects whose registered files are re-read
// className  name of this object, used as stem for generated files
NB::NB(NB& cl1, NB& cl2, string className)
{
    _className = className;
    _out.open("combineAll.txt");
    if (_out.fail()) {
        perror("cannot write to combineAll.txt");
    }

    // combineClassText() re-reads every input file and rewrites the class
    // file on each call, so call it exactly once per class instead of once
    // per word.
    const vector<string> firstWords = cl1.combineClassText();
    const vector<string> secondWords = cl2.combineClassText();
    _combined.insert(_combined.end(), firstWords.begin(), firstWords.end());
    _combined.insert(_combined.end(), secondWords.begin(), secondWords.end());
    _len = static_cast<int>(_combined.size());

    for (const string& word : _combined) {
        _out << word << '\n';
    }
    _out.close();

    _tnClass = cl1._tnClass + cl2._tnClass;
    bagOfWords("combineAll.txt", true);
}
// Builds an object from a single file whose name is read from standard input.
//
// The file is registered under className, merged into "<className>.txt" and
// counted. The document counters are left untouched by this constructor.
//
// className  name of this object, used as stem for generated files
// classType  caller-defined tag, stored in the public member of the same name
NB::NB(string className, int classType)
    : classType(classType), _className(className)
{
    cout << "Enter a filename for " + _className << endl;
    cin >> _fileName;

    _category[_fileName] = _className;
    combineClassText();

    const string mergedFile = _className + ".txt";
    bagOfWords(mergedFile);
}
// Builds a training class from files named on standard input.
//
// The user is prompted repeatedly; every name other than "q" is registered
// under className and counted as one document in _nOfClass and _tnClass.
// Entering "q" ends the prompt. The registered files are then merged into
// "<className>.txt" and counted.
//
// className  name of this class, used as stem for generated files
NB::NB(string className)
{
    _className = className;
    while (!_isDone) {
        cout << "Enter a filename for " + _className << endl;
        // A failed read (end of input, closed stream) leaves _fileName
        // unchanged; treat it like "q" so the loop cannot spin forever and
        // inflate the document counters.
        if (!(cin >> _fileName) || _fileName == "q") {
            _isDone = true;
        } else {
            _category[_fileName] = _className;
            _nOfClass++;
            _tnClass++;
        }
    }
    combineClassText();
    bagOfWords(_className + ".txt");
}
// Merges all files registered in _category into "<className>.txt".
//
// Each file is read as whitespace-separated words; every word is written on
// its own line of the merged file and collected in the returned vector, in
// the iteration order of _category. A file that cannot be opened is reported
// with perror and contributes no words.
//
// Returns the words of all registered files.
vector<string> NB::combineClassText() {
    const string mergedPath = _className + ".txt";
    vector<string> words;
    string word;

    _out.open(mergedPath);
    if (_out.fail()) {
        perror("cannot write to");
    }

    for (const auto& entry : _category) {
        // The key of _category is the input file name.
        _in.open(entry.first);
        if (_in.fail()) {
            perror("cannot read from");
        }
        while (_in >> word) {
            _out << word << endl;
            words.push_back(word);
        }
        _in.close();
    }

    _out.close();
    return words;
}
// Counts the words of a text file into _bow and dumps the counts.
//
// The file is read as whitespace-separated tokens. '.' and ',' act as word
// separators, so "good." counts as "good" and "a,b" counts as "a" and "b";
// a token made only of punctuation contributes nothing. (Previously the
// punctuation was turned into a space inside the token, which produced
// distinct words such as "good ".)
//
// Counts for words found in this file replace any existing entry in _bow.
// All entries of _bow are then written to "<className>_bow.txt" as
// "word: count" lines.
//
// classCombine  path of the file to count
// isTotal       true:  add the number of distinct words to _voc
//               false: add the sum of all counts to _totalWordsinC
void NB::bagOfWords(string classCombine, bool isTotal) {
    _in.open(classCombine);
    if (_in.fail()) {
        perror("cannot read from");
    }
    const string name = _className + "_bow.txt";
    _out.open(name);
    if (_out.fail()) {
        perror("cannot write to");
    }

    // Count in a single pass; the map does the duplicate detection.
    map<string, int> counts;
    string token;
    while (_in >> token) {
        string word;
        for (const char ch : token) {
            if (ch == '.' || ch == ',') {
                if (!word.empty()) {
                    ++counts[word];
                    word.clear();
                }
            } else {
                word += ch;
            }
        }
        if (!word.empty()) {
            ++counts[word];
        }
    }

    // Assign rather than add, so a repeated call overwrites the counts of
    // words seen in this file instead of accumulating them.
    for (const auto& entry : counts) {
        _bow[entry.first] = entry.second;
    }

    for (const auto& entry : _bow) {
        _out << entry.first << ": " << entry.second << '\n';
    }

    if (isTotal) {
        _voc += static_cast<int>(_bow.size());
    } else {
        for (const auto& entry : _bow) {
            _totalWordsinC += entry.second;
        }
    }

    _in.close();
    _out.close();
}
// Computes the add-one smoothed word probabilities of this class.
//
// For every word w in total's bag of words:
//     _probCalc[w] = (count of w in this class + 1)
//                    / (_totalWordsinC + total._voc)
// where the count is 0 for words this class never saw. The resulting table
// is written to "<className>_prob.txt" as "word: probability" lines.
//
// total  object holding the combined vocabulary (_bow and _voc are read)
void NB::calcProb(NB& total) {
    _out.open(_className + "_prob.txt");
    if (_out.fail()) {
        perror("cannot write to");
    }

    const int denominator = _totalWordsinC + total._voc;
    for (const auto& vocabEntry : total._bow) {
        // Direct lookup instead of scanning the whole map; this also covers
        // an empty _bow, for which every word gets the smoothed minimum.
        const auto found = _bow.find(vocabEntry.first);
        const int countInClass = (found != _bow.end()) ? found->second : 0;
        _probCalc[vocabEntry.first] =
            static_cast<float>(countInClass + 1) / denominator;
    }

    for (const auto& entry : _probCalc) {
        _out << entry.first << ": " << entry.second << '\n';
    }
    _out.close();
}
// Scores this object's words against one class.
//
// Starting from 1, the score is multiplied
//   - by P(w)^count for every word w of _bow that has an entry in
//     prob._probCalc, and
//   - once by 1 / (prob._totalWordsinC + total._voc) for every distinct word
//     that has no entry,
// and finally by the class prior prob._nOfClass / total._tnClass. The result
// is stored in _prob, printed to standard output and written to
// "<className>_<prob className>_prob.txt".
//
// NOTE(review): the product is kept in a float and will underflow to 0 for
// long inputs; summing logarithms would avoid that but changes the meaning of
// the returned value.
//
// prob   class whose probability table is used
// total  combined object supplying vocabulary size and document total
// Returns the score described above.
float NB::totalProb(NB& prob, NB& total) {
    _out.open(_className + "_" + prob._className + "_prob.txt");
    if (_out.fail()) {
        perror("cannot write to");
    }

    const float unseenWordProb =
        static_cast<float>(1) / (prob._totalWordsinC + total._voc);

    _prob = 1.0f;
    for (const auto& docEntry : _bow) {
        // Look the word up directly. The earlier miss counter was not reset
        // after an unseen word, so following unseen words were skipped.
        const auto known = prob._probCalc.find(docEntry.first);
        if (known != prob._probCalc.end()) {
            _prob = static_cast<float>(
                _prob * std::pow(known->second, docEntry.second));
        } else {
            _prob = _prob * unseenWordProb;
        }
    }
    _wordCounter = 0;

    _prob = _prob * (static_cast<float>(prob._nOfClass) / total._tnClass);
    cout << _prob << endl;
    _out << "The probalility of the " << _className << " beloning to " << prob._className << " is: " << _prob << endl;
    _out.close();
    return _prob;
}
#include "NB.h"
#include<cmath>
NB::NB(NB& cl1, NB& cl2, string className)
{
_className = className;
_out.open("combineAll.txt");
if (_out.fail()) {
perror("cannot write to combineAll.txt");
}
_len = cl1.combineClassText().size();
for (int i = 0; i < _len; i++) {
_combined.push_back(cl1.combineClassText()[i]);
}
_len = cl2.combineClassText().size();
for (int i = 0; i < _len; i++) {
_combined.push_back(cl2.combineClassText()[i]);
}
_len = _combined.size();
for (int i = 0; i < _len; i++) {
你应该在 Code Review 上提问,而不是在 Stack Overflow 上……`我想知道代码写得怎么样?我编写代码的方式是否符合良好的 C++ 实践?`