How to tweak the NLTK Python code so that I train the classifier only once


I am trying to do sentiment analysis on a fairly large dataset of about 10000 sentences. Right now, when I use the NLTK Python code to train and test with Naive Bayes, I end up retraining the classifier every time I need to classify a new set of sentences, and that takes a lot of time. Is there a way to take the output of the training part and then reuse it for classification, so that I save all that time? This is the NLTK code I am using:

import nltk
import re
import csv
#Read the tweets one by one and process them



def processTweet(tweet):
    # process the tweets
    #convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    #Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    #Remove additional white spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords file and build a list
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('url')
    stopWords.append('URL')
    stopWords.append('rt')

    fp = open(stopWordListFileName)
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end

#start getfeatureVector
def getFeatureVector(tweet):
    featureVector = []
    #split tweet into words
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #keep only words that start with a letter
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)

        #ignore if it is a stop word
        if(w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end

def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

inpTweets = csv.reader(open('sheet3.csv', 'rb'), delimiter=',')
stopWords = getStopWordList('stopwords.txt')
featureList = []



# Get tweet words
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
#end loop

# Remove featureList duplicates
featureList = list(set(featureList))

# Extract feature vectors for all tweets in one shot
training_set = nltk.classify.util.apply_features(extract_features, tweets)

NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

ft = open("april2.tsv")
line = ft.readline()

fo = open("dunno.tsv", "w")

fo.seek(0,0)
while line:
    testTweet = line
    processedTestTweet = processTweet(testTweet)
    fo.write(NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))) + "\n")
    line = ft.readline()

fo.close()
ft.close()

The Naive Bayes classifier module in NLTK is remarkably slow because it is a pure Python implementation. For that reason, consider using a different machine learning (ML) library, for example:
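For instance, sklearn's Naive Bayes can be plugged in through NLTK's SklearnClassifier wrapper, so the feature-extraction code above stays unchanged. A minimal sketch, assuming the training_set, extract_features, getFeatureVector and processTweet defined in the question:

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

# Wrap a scikit-learn estimator; BernoulliNB suits the boolean contains(...) features.
sk_classifier = SklearnClassifier(BernoulliNB())
sk_classifier.train(training_set)

# Classify a new sentence with the same helpers as before.
sample = extract_features(getFeatureVector(processTweet("I love this movie")))
print(sk_classifier.classify(sample))

The trained wrapper object can be pickled and reloaded just like any other Python object.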


YS-L's tip about using cPickle is helpful for the time being, but if you ever have to retrain the classifier, it would be better to switch to a different Naive Bayes implementation.
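
That tip amounts to serializing the trained classifier once and reloading it in later runs. A minimal sketch, assuming the NBClassifier trained above (the file name my_classifier.pickle is just an example):

import pickle  # on Python 2, cPickle is a faster drop-in replacement

# Save the trained classifier to disk once, right after training.
with open('my_classifier.pickle', 'wb') as f:
    pickle.dump(NBClassifier, f)

# In later runs, load it instead of calling nltk.NaiveBayesClassifier.train again.
with open('my_classifier.pickle', 'rb') as f:
    NBClassifier = pickle.load(f)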

If you want to stick with NLTK, try pickle; for example, see:


Otherwise, try other machine learning libraries such as sklearn or

Have you tried pickling the classifier object? It might help.
Thanks a lot, this helps!
#-*- coding: utf8 -*-

from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from cPickle import dump,load

def loadtagger(taggerfilename):
    infile = open(taggerfilename,'rb')
    tagger = load(infile); infile.close()
    return tagger

def traintag(corpusname, corpus):
    # Function to save tagger.
    def savetagger(tagfilename,tagger):
        outfile = open(tagfilename, 'wb')
        dump(tagger,outfile,-1); outfile.close()
        return
    # Training UnigramTagger.
    uni_tag = ut(corpus)
    savetagger(corpusname+'_unigram.tagger',uni_tag)
    # Training BigramTagger.
    bi_tag = bt(corpus)
    savetagger(corpusname+'_bigram.tagger',bi_tag)
    print "Tagger trained with",corpusname,"using" +\
                "UnigramTagger and BigramTagger."
    return
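
For illustration, one possible way to drive these helpers, assuming a tagged corpus such as NLTK's Brown corpus (the 'brown' prefix is just an example):

from nltk.corpus import brown

# Train once; this writes brown_unigram.tagger and brown_bigram.tagger to disk.
traintag('brown', brown.tagged_sents())

# Later runs can skip training and simply load the pickled tagger.
uni_tagger = loadtagger('brown_unigram.tagger')
print(uni_tagger.tag('this is a test sentence'.split()))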