Improving a document classification method in Python

Tags: python, machine-learning, regression, libsvm, document-classification

I have a program that predicts whether a news article is related to a certain topic.

There are two main scripts:

1) bow_train.py - generates a word list and a model and stores them in two files (arab.model and wordList.pkl)

2) bow_predict.py - uses the word list and the model to classify unknown articles

The method used is logistic regression rather than a support vector machine, because logistic regression should perform better for this kind of classification.

I would like to improve the results. Is there another way to emphasize certain keywords? For example, for the "Arab Spring" topic I would supply a keyword list such as ["protest", "unrest", "revolution", ...], and documents containing these keywords would get a higher probability than documents without them.
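
To make the idea concrete, here is a rough sketch of the kind of weighting I have in mind (the boost_keywords helper and the factor of 5 are hypothetical, not part of the scripts below): the bag-of-words feature values for a hand-picked keyword list are scaled up before training and prediction so those terms weigh more heavily.

# Hypothetical helper: multiply the feature values of selected keywords
# by a constant factor so they influence the model more strongly.
def boost_keywords(features, wordList, keywords, factor=5.0):
    keywordIndices = set(i for i, w in enumerate(wordList) if w in keywords)
    return [f * factor if i in keywordIndices else f
            for i, f in enumerate(features)]

# Example usage with an assumed keyword list for the "Arab Spring" topic:
# arabSpringKeywords = set(['protest', 'unrest', 'revolution'])
# denseData.append(boost_keywords(gen_features(counter, wordList),
#                                 wordList, arabSpringKeywords))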

bow_predict.py

import re
import os
import sys
import pickle
import operator

from collections import Counter

from liblinearutil import *

from bow_util import *

# path to directory with articles that should be classified
rootdirAll = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\all\\'

# load the wordList and model from the training phase
wordListIn = open('wordList.pkl', 'rb')

m        = load_model('arab.model')
wordList = pickle.load(wordListIn)
wordListIn.close()

counterByFilepathAll = {}

# count and store term frequencies
for folder, subs, files in os.walk(rootdirAll):
    for filename in files:
        filepath       = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathAll[filepath] = count_words(wordsInArticle)

denseData = []

# generate features from term frequencies (bag-of-words)
for _, counter in counterByFilepathAll.iteritems():
    denseData.append(gen_features(counter, wordList))

# assume the output class is 1 (liblinear/libsvm always require an output class
# even for unknown data)
classList = [1 for _ in xrange(0, len(counterByFilepathAll))]

# predict using the model from training phase
y, x                  = classList, denseData
p_label, p_acc, p_val = predict(y, x, m)

# store probabilities by filepath
probByFilepath = {}
i = 0
for filepath, _ in counterByFilepathAll.iteritems():
    probByFilepath[filepath] = p_val[i]
    i += 1

# sort by probability
sortedByProb = sorted(probByFilepath.iteritems(),
                      key=operator.itemgetter(1),
                      reverse=True)

# write to output file         
probsOut = open('probsOut.txt', 'wb')
for t in sortedByProb:
    probsOut.write(' '.join(str(s) for s in t) + '\n')

probsOut.close()
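
One detail about the values written out above: without extra options, p_val holds liblinear's decision values rather than probabilities. For a logistic regression model, predict can be asked for probability estimates with the '-b 1' option; a minimal sketch using the variables already defined above:

# Ask liblinear for probability estimates instead of decision values
# (only meaningful for logistic regression solvers such as -s 0).
p_label, p_acc, p_val = predict(y, x, m, '-b 1')
# p_val[i] is then a list with one probability per class, ordered
# according to m.get_labels()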

bow_train.py

import re
import os
import sys
import copy
import pickle

from collections import defaultdict
from collections import Counter

from liblinearutil import *

from bow_util import *

# Initialize directories for articles

rootdirArab = sys.argv[1]
rootdirNoArab = sys.argv[2]

#rootdirArab   = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\arab\\'
#rootdirNoArab = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\no_arab\\'

wordSet                 = set()
counterByFilepathArab   = {}
counterByFilepathNoArab = {}

# generate set of all words in all articles
for rootdir in [rootdirArab, rootdirNoArab]:
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            filepath       = os.path.join(folder, filename)
            wordsInArticle = get_words_from_file(filepath)
            wordSet        = wordSet.union(wordsInArticle)

# store sorted set in list
wordList = sorted(wordSet)

# save sorted list to output file for prediction phase
wordListOut = open('wordList.pkl', 'wb')
pickle.dump(wordList, wordListOut)

# count and store term frequencies for all arab spring training articles
for folder, subs, files in os.walk(rootdirArab):
    for filename in files:
        filepath       = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathArab[filepath] = count_words(wordsInArticle)

# count and store term frequencies for all non arab spring training articles
for folder, subs, files in os.walk(rootdirNoArab):
    for filename in files:
        filepath       = os.path.join(folder, filename)
        wordsInArticle = get_words_from_file(filepath)
        counterByFilepathNoArab[filepath] = count_words(wordsInArticle)

# generate features. the features for one article are a list of the frequencies
# of each term in wordList found in the article
denseData = []

for counter in counterByFilepathArab.values():
    denseData.append(gen_features(counter, wordList))

for counter in counterByFilepathNoArab.values():
    denseData.append(gen_features(counter, wordList))

# set output value to 1 for arab spring articles and -1 for non arab spring articles
classList = [1 for _ in xrange(0, len(counterByFilepathArab))] + \
            [-1 for _ in xrange(0, len(counterByFilepathNoArab))]

# train logistic regression model
y, x  = classList, denseData
prob  = problem(y, x)
# uncomment to obtain cross validation results
#param = parameter('-v 5')
m     = train(prob)#, param)

# store model in output file for prediction phase
save_model('arab.model', m)

# uncomment to check if training worked as expected
#p_label, p_acc, p_val = predict(y, x, m)
#ACC, MSE, SCC         = evaluations(y, p_label)

wordListOut.close()
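
One more note on the training step above: train(prob) with no parameter string falls back to liblinear's default solver, which (at least in the versions I have used) is an L2-loss SVM rather than logistic regression. If logistic regression is really the intended method, the solver has to be selected explicitly; a minimal sketch (-c 1 is simply liblinear's default cost, written out for clarity):

# Train an L2-regularized logistic regression model explicitly (-s 0).
# Put '-v 5' in the option string instead to get 5-fold cross-validation.
param = parameter('-s 0 -c 1')
m     = train(prob, param)
save_model('arab.model', m)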

I am not sure what your question really is, so it is hard to say what would work best for your specific problem. Why do you want to artificially boost certain keywords? If you really want to do that, you could try adding "fake" documents containing the terms you want to boost to the Arab Spring training set.
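
For completeness, a minimal sketch of that suggestion: write a few "fake" documents containing the boosted terms into the positive (Arab Spring) training directory before running bow_train.py. The keyword list, directory, and number of copies below are purely illustrative (the directory is an assumed subfolder of the commented-out rootdirArab path from bow_train.py):

import os

# Illustrative keyword list and a subdirectory of the positive training set.
keywords    = ['protest', 'unrest', 'revolution']
fakeDir     = 'C:\\Users\\Jiyda\\Desktop\\bow_arab\\arab\\fake\\'
numFakeDocs = 10  # more copies = heavier emphasis on these keywords

if not os.path.exists(fakeDir):
    os.makedirs(fakeDir)

# Each fake document simply contains the boosted keywords.
for i in range(numFakeDocs):
    with open(os.path.join(fakeDir, 'fake_%d.txt' % i), 'w') as f:
        f.write(' '.join(keywords))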