Improving a document classification method in Python

Tags: python, machine-learning, regression, libsvm, document-classification

I have a program that predicts whether a news article is related to a certain topic.

There are two main scripts:

1) bow_train.py - generates a word list and a model and stores them in two files (arab.model and wordList.pkl)

2) bow_predict.py - uses the word list and the model to classify unknown articles

The method used is logistic regression rather than a support vector machine, because logistic regression should perform better for this kind of classification.

I would like to improve the results. Is there another way to emphasize certain keywords? For example, for the "Arab Spring" topic I would supply a keyword list such as ["protest", "unrest", "revolution", ...], and documents containing these keywords would get a higher probability than documents without them.
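
To make the idea concrete, here is a rough sketch of the kind of weighting I have in mind (the boost_keywords helper and the factor of 5 are hypothetical, not part of the scripts below): the bag-of-words feature values for a hand-picked keyword list are scaled up before training and prediction so those terms weigh more heavily.

# Hypothetical helper: multiply the feature values of selected keywords
# by a constant factor so they influence the model more strongly.
def boost_keywords(features, wordList, keywords, factor=5.0):
    keywordIndices = set(i for i, w in enumerate(wordList) if w in keywords)
    return [f * factor if i in keywordIndices else f
            for i, f in enumerate(features)]

# Example usage with an assumed keyword list for the "Arab Spring" topic:
# arabSpringKeywords = set(['protest', 'unrest', 'revolution'])
# denseData.append(boost_keywords(gen_features(counter, wordList),
#                                 wordList, arabSpringKeywords))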

bow_predict.py

import re
import os
import sys
import pickle
import operator

from collections import Counter

from liblinearutil import *

from bow_util import *

# path to directory with articles that should be classified
rootdirAll = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\all\\'

# load the wordList and model from the training phase
wordListIn = open('wordList.pkl', 'rb')

m        = load_model('arab.model')
wordList = pickle.load(wordListIn)
wordListIn.close()

counterByFilepathAll = {}

# count and store term frequencies
for folder, subs, files in os.walk(rootdirAll):
    for filename in files:
        filepath       = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathAll[filepath] = count_words(wordsInArticle)

denseData = []

# generate features from term frequencies (bag-of-words)
for _, counter in counterByFilepathAll.iteritems():
    denseData.append(gen_features(counter, wordList))

# assume the output class is 1 (liblinear/libsvm always require an output class
# even for unknown data)
classList = [1 for _ in xrange(0, len(counterByFilepathAll))]

# predict using the model from training phase
y, x                  = classList, denseData
p_label, p_acc, p_val = predict(y, x, m)

# store probabilities by filepath
probByFilepath = {}
i = 0
for filepath, _ in counterByFilepathAll.iteritems():
    probByFilepath[filepath] = p_val[i]
    i += 1

# sort by probability
sortedByProb = sorted(probByFilepath.iteritems(),
                      key=operator.itemgetter(1),
                      reverse=True)

# write to output file         
probsOut = open('probsOut.txt', 'wb')
for t in sortedByProb:
    probsOut.write(' '.join(str(s) for s in t) + '\n')

probsOut.close()
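
One detail about the values written out above: without extra options, p_val holds liblinear's decision values rather than probabilities. For a logistic regression model, predict can be asked for probability estimates with the '-b 1' option; a minimal sketch using the variables already defined above:

# Ask liblinear for probability estimates instead of decision values
# (only meaningful for logistic regression solvers such as -s 0).
p_label, p_acc, p_val = predict(y, x, m, '-b 1')
# p_val[i] is then a list with one probability per class, ordered
# according to m.get_labels()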

bow_train.py

import re
import os
import sys
import copy
import pickle

from collections import defaultdict
from collections import Counter

from liblinearutil import *

from bow_util import *

# Initialize directories for articles

rootdirArab = sys.argv[1]
rootdirNoArab = sys.argv[2]

#rootdirArab   = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\arab\\'
#rootdirNoArab = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\no_arab\\'

wordSet                 = set()
counterByFilepathArab   = {}
counterByFilepathNoArab = {}

# generate set of all words in all articles
for rootdir in [rootdirArab, rootdirNoArab]:
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            filepath       = os.path.join(folder, filename)
            wordsInArticle = get_words_from_file(filepath)
            wordSet        = wordSet.union(wordsInArticle)

# store sorted set in list
wordList = sorted(wordSet)

# save sorted list to output file for prediction phase
wordListOut = open('wordList.pkl', 'wb')
pickle.dump(wordList, wordListOut)

# count and store term frequencies for all arab spring training articles
for folder, subs, files in os.walk(rootdirArab):
    for filename in files:
        filepath       = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathArab[filepath] = count_words(wordsInArticle)

# count and store term frequencies for all non arab spring training articles
for folder, subs, files in os.walk(rootdirNoArab):
    for filename in files:
        filepath       = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathNoArab[filepath] = count_words(wordsInArticle)

# generate features. the features for one article are a list of the frequencies
# of each term in wordList found in the article
denseData = []

for counter in counterByFilepathArab.values():
    denseData.append(gen_features(counter, wordList))

for counter in counterByFilepathNoArab.values():
    denseData.append(gen_features(counter, wordList))

# set output value to 1 for arab spring articles and -1 for non arab spring articles
classList = [1 for _ in xrange(0, len(counterByFilepathArab))] + \
            [-1 for _ in xrange(0, len(counterByFilepathNoArab))]

# train logistic regression model
y, x  = classList, denseData
prob  = problem(y, x)
# uncomment to obtain cross validation results
#param = parameter('-v 5')
m     = train(prob)#, param)

# store model in output file for prediction phase
save_model('arab.model', m)

# uncomment to check if training worked as expected
#p_label, p_acc, p_val = predict(y, x, m)
#ACC, MSE, SCC         = evaluations(y, p_label)

wordListOut.close()
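
One more note on the training step above: train(prob) with no parameter string falls back to liblinear's default solver, which (at least in the versions I have used) is an L2-loss SVM rather than logistic regression. If logistic regression is really the intended method, the solver has to be selected explicitly; a minimal sketch (-c 1 is simply liblinear's default cost, written out for clarity):

# Train an L2-regularized logistic regression model explicitly (-s 0).
# Put '-v 5' in the option string instead to get 5-fold cross-validation.
param = parameter('-s 0 -c 1')
m     = train(prob, param)
save_model('arab.model', m)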

I am not sure what your question really is, so it is hard to say what would work best for your specific problem. Why do you want to artificially boost certain keywords? If you really want to do that, you could try adding "fake" documents containing the terms you want to boost to the Arab Spring training set.
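
For completeness, a minimal sketch of that suggestion: write a few "fake" documents containing the boosted terms into the positive (Arab Spring) training directory before running bow_train.py. The keyword list, directory, and number of copies below are purely illustrative (the directory is an assumed subfolder of the commented-out rootdirArab path from bow_train.py):

import os

# Illustrative keyword list and a subdirectory of the positive training set.
keywords    = ['protest', 'unrest', 'revolution']
fakeDir     = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\arab\\fake\\'
numFakeDocs = 10  # more copies = heavier emphasis on these keywords

if not os.path.exists(fakeDir):
    os.makedirs(fakeDir)

# Each fake document simply contains the boosted keywords.
for i in range(numFakeDocs):
    with open(os.path.join(fakeDir, 'fake_%d.txt' % i), 'w') as f:
        f.write(' '.join(keywords))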