Python NLTK朴素贝叶斯分类器在正确特征选择中的应用_Python_Nltk

Python NLTK朴素贝叶斯分类器在正确特征选择中的应用

python

Python NLTK朴素贝叶斯分类器在正确特征选择中的应用,python,nltk,Python,Nltk,我一直在重复使用下面的代码，但是对于我的输出“信息量最大的功能”，我得到了错误的标签功能。你认为这是我自制语料库中的数据编码问题吗 import csv import nltk from nltk.classify.util import apply_features from nltk.corpus import stopwords import math import re import sys import os import codecs reload(sys) sys.setdefau

我一直在反复运行下面的代码，但在输出的“Most Informative Features”（信息量最大的特征）中，特征对应的标签是错误的。你认为这是我自建语料库中的数据编码问题吗？

import csv
import nltk
from nltk.classify.util import apply_features
from nltk.corpus import stopwords
import math
import re
import sys
import os
import codecs
# Python 2-only workaround: reloading sys makes sys.setdefaultencoding
# callable again so the process-wide default string encoding can be forced
# to UTF-8.
# NOTE(review): neither the reload() builtin nor sys.setdefaultencoding
# exists in Python 3 -- confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding('utf-8')

# Extra stop words supplied by the author.
# NOTE(review): nothing in the visible script reads this list, and the tweet
# tokens are lower-cased while 'He', 'She' and 'We' are capitalised here --
# confirm how (and whether) it is meant to be applied.
customstopwords = ['show', 'they', 'them','He','She','We','i','are','this','the','so','to','me','for','and','was','in','as','about']

#Loads the sentiment files
# Each file holds one tweet per line. The context managers close the handle
# as soon as the lines are read; previously both files were left open.
with open('Positivetweets50.txt', 'r') as p:
    postxt = p.readlines()

with open('Negativetweets50.txt', 'r') as n:
    negtxt = n.readlines()

#creates a list of sentiment labels with the same length as each tweet list.
# List repetition replaces the index loops that appended one label at a time.
neglist = ['negative'] * len(negtxt)
poslist = ['positive'] * len(postxt)

#creates a list of (tweet text, sentiment) tuples.
# list() makes the results concatenable even when zip() returns an iterator
# instead of a list.
postagged = list(zip(postxt, poslist))
negtagged = list(zip(negtxt, neglist))

#appends all the tagged tweets to a common list
taggedtweets = postagged + negtagged

# Debug dump of the full tagged corpus.
print(taggedtweets)

#creates a list of (lower-cased whitespace-separated tokens, sentiment) pairs.
# Tokens are produced by str.split() only, so punctuation and quote
# characters stay attached to the words.
tweets = [
    ([token.lower() for token in text.split()], label)
    for (text, label) in taggedtweets
]

#Pulls out all the words in a list of tagged tweets.
def getwords(tweets):
    """Flatten the word lists of all tagged tweets into a single list.

    tweets: iterable of (words, sentiment) pairs; the sentiment is ignored.

    Returns a new list holding every word of every tweet in input order,
    duplicates included.
    """
    return [token for (tokens, _label) in tweets for token in tokens]

#uses nltk library to order the list of tweets words pulled out by their frequency.
def getwordfeatures(listoftweets):
    """Return the distinct words of *listoftweets*, most frequent first.

    listoftweets: flat iterable of word tokens (as produced by getwords).

    Returns a list of unique words sorted by decreasing frequency; words
    with equal counts keep the order in which the FreqDist iterates them.
    """
    wordfreq = nltk.FreqDist(listoftweets)
    # Sort explicitly instead of trusting keys(): as far as I know only
    # NLTK 2 returned keys() in frequency order, while NLTK 3's FreqDist is
    # a Counter subclass whose keys() carries no frequency ordering.
    return sorted(wordfreq, key=wordfreq.get, reverse=True)

#calls the above functions to build the word list used as the feature vocabulary.
# NOTE(review): no stop-word filtering happens here -- neither customstopwords
# nor the imported NLTK stopwords corpus is applied. Confirm whether the
# vocabulary was meant to exclude them.
# The list is computed once and reused for both the debug print and wordlist.
wordlist = getwordfeatures(getwords(tweets))

print(wordlist)

def feature_extractor(doc):
    """Map a tokenised document to boolean word-presence features.

    doc: iterable of hashable tokens.

    Returns a dict with one 'contains(<word>)' key per entry of the
    module-level wordlist; the value is True when that word occurs in doc.
    """
    present = set(doc)  # set membership keeps each lookup O(1)
    return dict(
        ('contains(%s)' % term, term in present)
        for term in wordlist
    )

#creates the training set to classify on the basis of distribution of true and false in the input.
training_set = nltk.classify.util.apply_features(feature_extractor, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# show_most_informative_features() prints the table itself and returns None,
# so wrapping the call in print emitted an extra "None" line after the table.
classifier.show_most_informative_features(n=1000)

# NOTE: accuracy is computed on training_set, the same data the classifier
# was trained on, so it does not measure performance on unseen tweets.
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, training_set))

输出：

Most Informative Features
           contains(tom) = True           negati : positi =      1.0 : 1.0
        contains(thrown) = True           negati : positi =      1.0 : 1.0
     contains("""joined) = True           negati : positi =      1.0 : 1.0
         contains(tokyo) = True           negati : positi =      1.0 : 1.0
 contains(@christophery) = True           negati : positi =      1.0 : 1.0
         contains(won't) = True           negati : positi =      1.0 : 1.0
contains("""@edisonneil) = True           negati : positi =      1.0 : 1.0
     contains(husband's) = True           negati : positi =      1.0 : 1.0
        contains(come!!) = True           negati : positi =      1.0 : 1.0
       contains(hair!!!) = True           negati : positi =      1.0 : 1.0
    contains(accountant) = True           negati : positi =      1.0 : 1.0
       contains(giggles) = True           negati : positi =      1.0 : 1.0
        contains(bigger) = True           negati : positi =      1.0 : 1.0
         contains(that?) = True           negati : positi =      1.0 : 1.0
        contains(they'd) = True           negati : positi =      1.0 : 1.0
 contains("""@jerinelay) = True           negati : positi =      1.0 : 1.0
      contains(launched) = True           negati : positi =      1.0 : 1.0
          contains(nina) = True           negati : positi =      1.0 : 1.0
           contains(htc) = True           negati : positi =      1.0 : 1.0
         contains(hmmmm) = True           negati : positi =      1.0 : 1.0
   contains("""@chele76) = True           negati : positi =      1.0 : 1.0
        contains(buying) = True           negati : positi =      1.0 : 1.0
       contains(teaches) = True           negati : positi =      1.0 : 1.0
        contains(heaven) = True           negati : positi =      1.0 : 1.0
          contains(old!) = True           negati : positi =      1.0 : 1.0
      contains(flipping) = True           negati : positi =      1.0 : 1.0
           contains(cal) = True           negati : positi =      1.0 : 1.0
     contains(roosevelt) = True           negati : positi =      1.0 : 1.0
           contains(wat) = True           negati : positi =      1.0 : 1.0
         contains(tribe) = True           negati : positi =      1.0 : 1.0
           contains(be!) = True           negati : positi =      1.0 : 1.0
    contains("""amazing) = True           negati : positi =      1.0 : 1.0
        contains(stairs) = True           negati : positi =      1.0 : 1.0
      contains(podcasts) = True           negati : positi =      1.0 : 1.0
         contains(pound) = True           negati : positi =      1.0 : 1.0
   contains(tomorrow...) = True           negati : positi =      1.0 : 1.0
       contains(months!) = True           negati : positi =      1.0 : 1.0
          contains(wana) = True           negati : positi =      1.0 : 1.0
        contains(impact) = True           negati : positi =      1.0 : 1.0
        contains(texted) = True           negati : positi =      1.0 : 1.0
       contains(vampire) = True           negati : positi =      1.0 : 1.0
contains("""@dionrodrigues) = True           negati : positi =      1.0 : 1.0
          contains(kind) = True           negati : positi =      1.0 : 1.0
       contains(sheesh.) = True           negati : positi =      1.0 : 1.0
     contains(pictures.) = True           negati : positi =      1.0 : 1.0
        contains(breeze) = True           negati : positi =      1.0 : 1.0
    contains(@amrosario) = True           negati : positi =      1.0 : 1.0
        contains(wells.) = True           negati : positi =      1.0 : 1.0
          contains(gave) = True           negati : positi =      1.0 : 1.0
         contains(soul.) = True           negati : positi =      1.0 : 1.0
          contains(addy) = True           negati : positi =      1.0 : 1.0
       contains(soooooo) = True           negati : positi =      1.0 : 1.0
        contains("""@j") = True           negati : positi =      1.0 : 1.0
           contains(coz) = True           negati : positi =      1.0 : 1.0
         contains(quick) = True           negati : positi =      1.0 : 1.0
          contains(did.) = True           negati : positi =      1.0 : 1.0
        contains(humor.) = True           negati : positi =      1.0 : 1.0
       contains(@b_club) = True           negati : positi =      1.0 : 1.0
contains("""@julieunplugged) = True           negati : positi =      1.0 : 1.0
          contains(fire) = True           negati : positi =      1.0 : 1.0
       contains(@angusi) = True           negati : positi =      1.0 : 1.0
          contains(bff.) = True           negati : positi =      1.0 : 1.0
         contains(page.) = True           negati : positi =      1.0 : 1.0
       contains(took""") = True           negati : positi =      1.0 : 1.0
      contains(returned) = True           negati : positi =      1.0 : 1.0
        contains(hello!) = True           negati : positi =      1.0 : 1.0
    contains(friday!!!!) = True           negati : positi =      1.0 : 1.0
     contains(creepy""") = True           negati : positi =      1.0 : 1.0
   contains(farewell""") = True           negati : positi =      1.0 : 1.0
     contains(awsome""") = True           negati : positi =      1.0 : 1.0
        contains(late..) = True           negati : positi =      1.0 : 1.0
   contains(@calmbanana) = True           negati : positi =      1.0 : 1.0
          contains(huge) = True           negati : positi =      1.0 : 1.0
        contains(window) = True           negati : positi =      1.0 : 1.0
      contains(complete) = True           negati : positi =      1.0 : 1.0
     contains(question?) = True           negati : positi =      1.0 : 1.0
       contains(from""") = True           negati : positi =      1.0 : 1.0
       contains("""baby) = True           negati : positi =      1.0 : 1.0
        contains(right.) = True           negati : positi =      1.0 : 1.0
     contains(delicious) = True           negati : positi =      1.0 : 1.0
     contains(unreal""") = True           negati : positi =      1.0 : 1.0
         contains(voted) = True           negati : positi =      1.0 : 1.0
        contains(@bk_ii) = True           negati : positi =      1.0 : 1.0
contains(@coolcatteacher) = True           negati : positi =      1.0 : 1.0
    contains(assessment) = True           negati : positi =      1.0 : 1.0
     contains(malaysian) = True           negati : positi =      1.0 : 1.0
     contains(french""") = True           negati : positi =      1.0 : 1.0
     contains(definitly) = True           negati : positi =      1.0 : 1.0
    contains("""@tvorse) = True           negati : positi =      1.0 : 1.0
  contains(m&amp""""""") = True           negati : positi =      1.0 : 1.0
contains("""@lewisstanson) = True           negati : positi =      1.0 : 1.0
       contains(warm""") = True           negati : positi =      1.0 : 1.0
   contains(@chrishealy) = True           negati : positi =      1.0 : 1.0
        contains(@_dznr) = True           negati : positi =      1.0 : 1.0
  contains(@awesomekong) = True           negati : positi =      1.0 : 1.0
        contains(broken) = True           negati : positi =      1.0 : 1.0
          contains(get!) = True           negati : positi =      1.0 : 1.0
          contains(some) = True           negati : positi =      1.0 : 1.0
       contains(friends) = True           negati : positi =      1.0 : 1.0
       contains(ipod""") = True           negati : positi =      1.0 : 1.0
contains("""@jlsofficial) = True           negati : positi =      1.0 : 1.0
       contains(@dayngr) = True           negati : positi =      1.0 : 1.0
     contains("""headed) = True           negati : positi =      1.0 : 1.0
           contains(:-p) = True           negati : positi =      1.0 : 1.0
None
accuracy: 1.0

语料库：正面推文文件（Positivetweets50.txt）和负面推文文件（Negativetweets50.txt）（原文中的链接已缺失）

任何帮助都将不胜感激

试试看：

from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain

# Tiny hand-labelled corpus of (sentence, sentiment tag) pairs.
training_data = [
    ('I love this sandwich.', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('This is my best work.', 'pos'),
    ("What an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg'),
]

# Every distinct lower-cased token that occurs in the training sentences.
vocabulary = set(chain.from_iterable(
    word_tokenize(sentence.lower()) for sentence, _tag in training_data
))


def _featurize(sentence):
    """Return {word: bool} telling which vocabulary words occur in sentence."""
    # Tokenise once per sentence; the inline version re-ran word_tokenize
    # for every vocabulary word and scanned the token list each time.
    tokens = set(word_tokenize(sentence.lower()))
    return {word: (word in tokens) for word in vocabulary}


feature_set = [(_featurize(sentence), tag) for sentence, tag in training_data]

classifier = nbc.train(feature_set)

test_sentence = "This is the best band I've ever heard!"
featurized_test_sentence = _featurize(test_sentence)

print("test_sent: %s" % test_sentence)
print("tag: %s" % classifier.classify(featurized_test_sentence))

可能重复的