First train the model, then test it several times. I have been trying to connect a Python NLP script with a Qt-GUI-based C++ application. Basically, in the application I try to invoke the NLP script through the command line:

QString path = "D:/DS Project/Treegramming";
QString command("py");
QStringList params = QStringList() << "nlp.py";
params << text;

QProcess *process = new QProcess();
process->setWorkingDirectory(path);
process->start(command, params);
process->waitForFinished();
QString result = process->readAll();

You need to create two Python scripts:

  • the first trains and saves the NaiveBayesClassifier
  • the second loads and tests the model

To avoid code duplication, I would put the shared helper functions in a script of their own, called utils.py, which looks like this:

import re
import string

from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    # Lemmatize each token according to its part-of-speech tag.
    sentence = []
    lemmatizer = WordNetLemmatizer()
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        sentence.append(lemmatizer.lemmatize(word, pos))
    return sentence

def remove_noise(tokens, stop_words=()):
    # Strip URLs, @mentions, punctuation and stop words, then lemmatize.
    sentence = []
    lemmatizer = WordNetLemmatizer()
    for token, tag in pos_tag(tokens):
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r'(@[A-Za-z0-9_]+)', '', token)

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            sentence.append(token.lower())
    return sentence

def get_all_words(tokens_list):
    # Flatten a list of token lists into a single stream of tokens.
    for tokens in tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(tokens_list):
    # Convert each token list into the {token: True} feature dict NLTK expects.
    for tweets in tokens_list:
        yield dict([token, True] for token in tweets)
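
For a quick sanity check of these helpers (assuming the NLTK punkt, averaged_perceptron_tagger and wordnet data packages are installed), a hypothetical sentence can be cleaned like this:

from nltk.tokenize import word_tokenize
from utils import remove_noise

# Hypothetical input; the exact output depends on the tokenizer and tagger data.
tokens = word_tokenize("I really loved this movie")
print(remove_noise(tokens, stop_words=('i', 'this')))
# e.g. ['really', 'love', 'movie']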


Then let's create the training script. I named it train.py, and it should look like this:

import random
import pickle

from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.corpus import twitter_samples

from utils import *

# Pre-tokenized tweets from the NLTK twitter_samples corpus.
stop_words = stopwords.words('english')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Clean every tweet: strip URLs, mentions, punctuation and stop words.
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

# Optional: frequency distributions, useful for inspecting the vocabulary.
all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)

freq_dis_pos = FreqDist(all_pos_words)
freq_dis_neg = FreqDist(all_neg_words)

# Turn the cleaned tweets into labelled feature dicts and shuffle them.
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

pos_dataset = [(tweets, "Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets, "Negative") for tweets in negative_tokens_for_model]

dataset = pos_dataset + neg_dataset
random.shuffle(dataset)

# 7000 tweets for training, the remaining 3000 for testing.
train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

#### ADD THESE TO SAVE THE CLASSIFIER ####
with open("model.pickle", "wb") as fout:
    pickle.dump(classifier, fout)
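
Since test_data is split off above but never used, it may be worth evaluating the model before saving it. A minimal sketch (not in the original script) using NLTK's built-in accuracy helper:

from nltk import classify

# Optional sanity check on the held-out 3000 tweets.
print("Accuracy:", classify.accuracy(classifier, test_data))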
Finally, the test script test.py looks like this:

import sys
import pickle

from nltk.tokenize import word_tokenize

from utils import remove_noise

#### ADD THESE TO LOAD THE CLASSIFIER ####
with open('model.pickle', 'rb') as fin:
    classifier = pickle.load(fin)


def test(custom_tweet):
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify(dict([token, True] for token in custom_tokens))
    print(res)
    # Write the label out so the C++ application can read it back.
    with open("result.txt", "w") as f:
        f.write(res)


# The tweet text arrives as the first command-line argument from QProcess.
test(sys.argv[1])
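
Before wiring it into Qt, test.py can be tried directly from a terminal, passing a hypothetical tweet as the argument (the py launcher matches the command used in the question):

py test.py "I really loved this movie"

This prints Positive or Negative and writes the same label to result.txt.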

Now run train.py once to train the Naive Bayes classifier. It will create a new file named model.pickle that holds the trained classifier. After that, run test.py from the C++ application on your custom tweet: test.py should load the trained model from model.pickle and apply it to the given tweet.
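
To check the whole pipeline outside Qt, a small Python sketch can mirror the QProcess call from the question (the path is the asker's and the tweet text is a hypothetical input):

import subprocess

# Mirrors QProcess: working directory, "py" launcher, script name, tweet text.
result = subprocess.run(
    ["py", "test.py", "I really loved this movie"],
    cwd="D:/DS Project/Treegramming",
    capture_output=True,
    text=True,
)
print(result.stdout)  # expected: "Positive" or "Negative"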
