Python AttributeError：'LdaModel' 对象没有属性 'minimum_phi_value'_Python_Tensorflow_Nlp_Gensim_Topic Modeling

Python AttributeError：'LdaModel' 对象没有属性 'minimum_phi_value'

python tensorflow nlp

Python AttributeError：'LdaModel' 对象没有属性 'minimum_phi_value',python,tensorflow,nlp,gensim,topic-modeling,Python,Tensorflow,Nlp,Gensim,Topic Modeling,因为我只是在尝试NLP，所以我正在进行讽刺检测，但同时我把这段代码放在了一起 Saracsmextractor.py # coding: utf-8 # Importing the library # In[2]: import io import sys import os import numpy as np import pandas as pd import nltk import gensim import csv, collections from textblob impor

我刚开始尝试 NLP，正在做讽刺检测，于是把下面这段代码拼在了一起：

Saracsmextractor.py

# coding: utf-8

# Importing the library

# In[2]:

import io
import sys
import os
import numpy as np
import pandas as pd
import nltk
import gensim
import csv, collections
from textblob import TextBlob
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import replace_emoji


# Define a class to load the SentimentWordnet and write methods to calculate the scores

# In[4]:

class load_senti_word_net(object):
    """Sentiment lookup built from a SentiWordNet tab-separated file.

    Each data row has six columns: pos, ID, PosScore, NegScore, synsetTerms,
    gloss.  ``synsetTerms`` can hold several space-separated terms (for
    example ``abducting#1 abducent#1``); every term is recorded separately
    under the key ``"<pos>/<term>"`` and all score pairs collected for a key
    are averaged.
    """

    # First two characters of a tag returned by nltk.pos_tag -> POS letter
    # used in the "<pos>/<term>" lookup keys.
    _POS_PREFIX_TO_TYPE = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}

    def __init__(self, path="SentiWordNet_3.0.0_20130122.txt"):
        """Read the lexicon and store the averaged scores in ``self.sent_scores``.

        Args:
            path: location of the tab-separated lexicon file.  Defaults to
                the file name the original code hard-coded.

        Raises:
            ValueError: if a non-comment, non-blank row does not have
                exactly six columns.
        """
        sent_scores = collections.defaultdict(list)
        with io.open(path) as fname:
            file_content = csv.reader(fname, delimiter='\t', quotechar='"')

            for line in file_content:
                # csv.reader yields [] for a blank line; guard before
                # indexing so such a line no longer raises IndexError.
                if not line or line[0].startswith('#'):
                    continue
                pos, ID, PosScore, NegScore, synsetTerms, gloss = line
                try:
                    pair = (float(PosScore), float(NegScore))
                except ValueError:
                    # Non-numeric score columns count as neutral.
                    pair = (0, 0)
                for terms in synsetTerms.split(" "):
                    # Drop the "#<n>" suffix and remove '-' / '_' from the term.
                    term = terms.split("#")[0].replace("-", "").replace("_", "")
                    sent_scores["%s/%s" % (pos, term)].append(pair)

        # Collapse every list of (pos, neg) pairs into one 2-element mean.
        for key, value in sent_scores.items():
            sent_scores[key] = np.mean(value, axis=0)

        self.sent_scores = sent_scores

    def score_word(self, word):
        """Tag a single word with nltk.pos_tag and return its score.

        Example of the tagger output: nltk.pos_tag(["Suraj"]) gives
        [('Suraj', 'NN')].
        """
        pos = nltk.pos_tag([word])[0][1]
        return self.score(word, pos)

    def score(self, word, pos):
        """Return the stored score array for ``word`` given its POS tag.

        Tags starting with NN, JJ, VB or RB are mapped to 'n', 'a', 'v' and
        'r'.  Any other tag, or a word with no stored entry, yields
        ``np.array([0.0, 0.0])``.
        """
        pos_type = self._POS_PREFIX_TO_TYPE.get(pos[0:2])
        if pos_type is None:
            return np.array([0.0, 0.0])

        # .get() instead of indexing: sent_scores is a defaultdict(list), so
        # indexing would insert an empty list for every unknown word.
        found = self.sent_scores.get(pos_type + '/' + word)
        if found is not None and len(found) > 1:
            return found
        return np.array([0.0, 0.0])

    def score_sentencce(self, sentence):
        """Sum the per-token scores of a token list.

        The tokens are tagged with nltk.pos_tag, e.g.
        nltk.pos_tag(word_tokenize("My name is Suraj")) gives
        [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Suraj', 'NNP')].
        The tagged tokens are printed, and the returned 2-element array is
        the sum (not the average) of the individual scores.
        """
        pos = nltk.pos_tag(sentence)
        print (pos)
        mean_score = np.array([0.0, 0.0])
        for word, tag in pos:
            mean_score += self.score(word, tag)

        return mean_score

    def pos_vector(self, sentence):
        """Count tags by prefix and return a length-4 array.

        Slots 0..3 hold the number of tokens whose tag starts with NN, JJ,
        VB and RB respectively; other tags are not counted.
        """
        slots = {'NN': 0, 'JJ': 1, 'VB': 2, 'RB': 3}
        vector = np.zeros(4)

        for _word, tag in nltk.pos_tag(sentence):
            slot = slots.get(tag[0:2])
            if slot is not None:
                vector[slot] += 1

        return vector



# Now let's extract the features
# 
# ###Stemming and Lemmatization

# In[5]:

# Module-level helpers shared by the feature extractors defined below.
porter = nltk.PorterStemmer()
# Constructing this object reads the SentiWordNet file (see the class's __init__).
sentiments = load_senti_word_net()


# In[7]:

def gram_features(features,sentence):
    """Flag every stemmed unigram and bigram of ``sentence`` in ``features``.

    The sentence is converted with str(), passed through
    replace_emoji.replace_reg, tokenized, lower-cased and stemmed.  For each
    token and each pair of adjacent tokens (joined by one space) the key
    ``contains(<gram>)`` is set to 1.0.  ``features`` is modified in place;
    nothing is returned.
    """
    cleaned = replace_emoji.replace_reg(str(sentence))
    stems = [porter.stem(word.lower()) for word in nltk.word_tokenize(cleaned)]

    # Adjacent token pairs rendered as "first second".
    pairs = [' '.join(pair) for pair in nltk.bigrams(stems)]

    for gram in stems + pairs:
        features['contains(%s)' % gram] = 1.0



# In[8]:

import string
def _add_segment_sentiment(features, tokens, score_keys, blob_prefix):
    """Write lexicon and TextBlob sentiment features for one token segment.

    Args:
        features: dict updated in place.
        tokens: list of stemmed tokens for this segment.
        score_keys: (positive key, negative key, difference key) used for
            the scores returned by ``sentiments.score_sentencce``.
        blob_prefix: text placed before "Blob Polarity" and
            "Blob Subjectivity" to form the TextBlob keys.
    """
    positive_key, negative_key, difference_key = score_keys
    totals = sentiments.score_sentencce(tokens)
    features[positive_key] = totals[0]
    features[negative_key] = totals[1]
    features[difference_key] = totals[0] - totals[1]

    try:
        # The original wrapped each token in a conditional whose two branches
        # both produced the token unchanged, so a plain join is equivalent.
        blob_sentiment = TextBlob(" ".join(tokens).strip()).sentiment
        polarity = blob_sentiment.polarity
        subjectivity = blob_sentiment.subjectivity
    except Exception:
        # Best effort: fall back to neutral values instead of aborting.
        polarity = 0
        subjectivity = 0
        print("do nothing")

    # Success and fallback now write the same keys; the original fallback
    # used "first Blob ..." / "second Blob ..." and so produced different
    # columns from the success path.
    features[blob_prefix + "Blob Polarity"] = polarity
    features[blob_prefix + "Blob Subjectivity"] = subjectivity


def sentiment_extract(features, sentence):
    """Add sentiment features for the whole sentence and for each half.

    The sentence is passed through replace_emoji.replace_reg, tokenized,
    lower-cased and stemmed.  Features are then written for the full token
    list, for the first ``len // 2`` tokens and for the remaining tokens.
    ``features`` is modified in place; nothing is returned.
    """
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = [porter.stem(i.lower()) for i in nltk.word_tokenize(sentence_rep)]
    middle = len(token) // 2

    _add_segment_sentiment(
        features, token,
        ("Positive Sentiment", "Negative Sentiment", "sentiment"),
        "")
    _add_segment_sentiment(
        features, token[:middle],
        ("positive Sentiment first half", "negative Sentiment first half",
         "first half sentiment"),
        "first half ")
    _add_segment_sentiment(
        features, token[middle:],
        ("positive Sentiment second half", "negative Sentiment second half",
         "second half sentiment"),
        "second half ")





# In[9]:

# Smoke test: run the sentiment extractor on a sample phrase with a fresh dict.
features = {}
sentiment_extract(features,"a long narrow opening")


# In[11]:

def pos_features(features,sentence):
    """Store the four part-of-speech counts of ``sentence`` in ``features``.

    The sentence is passed through replace_emoji.replace_reg, tokenized,
    lower-cased and stemmed; the vector returned by ``sentiments.pos_vector``
    is written under the keys ``POS_1`` .. ``POS_<n>``.  Prints "done" when
    finished.  ``features`` is modified in place; nothing is returned.
    """
    cleaned = replace_emoji.replace_reg(sentence)
    stems = [porter.stem(tok.lower()) for tok in nltk.word_tokenize(cleaned)]
    for slot, count in enumerate(sentiments.pos_vector(stems), start=1):
        features['POS_' + str(slot)] = count
    print ("done")



# In[12]:

# Smoke test: run the POS extractor on a sample phrase with a fresh dict.
features = {}
pos_features(features,"a long narrow opening")


# In[13]:

def capitalization(features,sentence):
    """Set ``features['Capitalization']`` to 1 when more than three
    characters of ``sentence`` are upper-case, otherwise to 0.

    The upper-case count is printed.  ``features`` is modified in place;
    nothing is returned.
    """
    count = sum(1 for ch in sentence if ch.isupper())
    features['Capitalization'] = 1 if count > 3 else 0
    print (count)


# In[14]:

# Smoke test: five upper-case letters, so 'Capitalization' is set to 1.
features = {}
capitalization(features,"A LoNg NArrow opening")


# In[15]:

import topic
# Build a topic-model wrapper from the project-local `topic` module.
# NOTE(review): presumably this trains or prepares a 200-topic model; confirm in topic.py.
topic_mod = topic.topic(nbtopic=200,alpha='symmetric')


# In[16]:

# Rebinds topic_mod, discarding the instance created above.
# NOTE(review): presumably loads a saved model and dictionary from these two
# files; confirm in topic.py.  os.path.join with a single argument returns
# that argument unchanged.
topic_mod = topic.topic(model=os.path.join('topics.tp'),dicttp=os.path.join('topics_dict.tp'))


# In[17]:

def topic_feature(features,sentence,topic_modeler):
    """Store a topic value for ``sentence`` under the key 'Topic :'.

    ``topic_modeler.transform(sentence)`` is expected to return a sequence
    whose entries carry the value at index 1.  Every entry is written to the
    same key, so only the last entry's value remains; the key is left unset
    when the sequence is empty.  ``features`` is modified in place.
    """
    for entry in topic_modeler.transform(sentence):
        # Same key on every pass: later entries overwrite earlier ones.
        features['Topic :'] = entry[1]



# In[18]:

# Smoke test of the topic extractor; this is the call shown failing with
# AttributeError in the traceback quoted later in this document.
topic_feature(features,"A LoNg NArrow opening",topic_mod)


# In[19]:

def get_features(sentence, topic_modeler):
    """Run every feature extractor on ``sentence`` and return the feature dict.

    Extractors run in a fixed order (n-grams, POS counts, sentiment,
    capitalization, topic), each adding keys to one shared dict.
    """
    features = {}
    for extract in (gram_features, pos_features, sentiment_extract, capitalization):
        extract(features, sentence)
    # The topic extractor is the only one that also needs the topic model.
    topic_feature(features, sentence, topic_modeler)
    return features


# In[20]:

# Load the tab-separated dataset; the first row is the header.
# The empty DataFrame assigned first is immediately replaced by read_csv.
df = pd.DataFrame()
df = pd.read_csv("dataset_csv.csv", header=0, sep='\t')
# Notebook preview of the first rows (no effect when run as a script).
df.head()


# In[17]:

import re

# One (feature dict, label) pair per dataset row.  The list is created here
# because nothing earlier in the script defines it.
featureset = []
# len(df) is the number of rows; df.size is rows * columns and would run
# past the last row index.
for i in range(0, len(df)):
    temp = str(df["tweets"][i])
    # Strip every run of non-ASCII characters before feature extraction.
    temp = re.sub(r'[^\x00-\x7F]+','',temp)
    featureset.append((get_features(temp,topic_mod), df["label"][i]))


# In[20]:

# Turn each feature dict into a one-row frame indexed by its position, then
# stack the rows into a single frame.
c = []
for i, pair in enumerate(featureset):
    c.append(pd.DataFrame(pair[0], index=[i]))

result = pd.concat(c)


# In[22]:

# Add a placeholder "label" column as the first column.
result.insert(loc=0,column="label",value='0')


# In[23]:

# Fill in the real label of every row.  A single .loc[row, column] call
# writes into `result` itself; the chained form result["label"].loc[i] = ...
# may assign to a temporary object and leave `result` unchanged.
for i in range(0, len(featureset)):
    result.loc[i, "label"] = featureset[i][1]



# In[25]:

# Persist the feature table so later cells can start from the CSV file.
result.to_csv('feature_dataset.csv')


# In[3]:

# Reload the saved feature table; the empty DataFrame assigned first is
# immediately replaced by read_csv.
df = pd.DataFrame()
df = pd.read_csv("feature_dataset.csv", header=0)
# Notebook preview of the first rows (no effect when run as a script).
df.head()


# In[4]:

# IPython-only call (inline plotting); get_ipython is not defined when this
# file is run with the plain Python interpreter.
get_ipython().magic('matplotlib inline')

# Neither plotting module is referenced again in the visible code.
import matplotlib as matplot 
import seaborn

# From here on `result` is the frame reloaded from feature_dataset.csv.
result = df


# In[5]:

# Feature matrix: every column except the label, the 'Unnamed: 0' column
# (presumably the index written by to_csv; verify) and the topic value.
X = result.drop(['label','Unnamed: 0','Topic :'],axis=1).values


# In[6]:

# Target labels.
Y = result['label']


# In[7]:

import pickle
import pefile
import sklearn.ensemble as ek
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression
import sklearn.linear_model as lm


# In[29]:

# Candidate classifiers, keyed by the display name used in the score report.
model = { "DecisionTree":tree.DecisionTreeClassifier(max_depth=10),
         "RandomForest":ek.RandomForestClassifier(n_estimators=50),
         "Adaboost":ek.AdaBoostClassifier(n_estimators=50),
         "GradientBoosting":ek.GradientBoostingClassifier(n_estimators=50),
         "GNB":GaussianNB(),
         # The entry is named "Logistic Regression", so build the classifier.
         # The original LinearRegression() is a regressor: its score is not
         # comparable with the accuracies of the other models and its
         # predictions are continuous values rather than class labels.
         "Logistic Regression":lm.LogisticRegression()
}


# In[8]:

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y ,test_size=0.2)


# In[9]:

# Replace missing values with column means.  Each split is filled with its
# own means, so the test split is not filled with the training means.
X_train = pd.DataFrame(X_train)
X_train = X_train.fillna(X_train.mean())

X_test = pd.DataFrame(X_test)
X_test = X_test.fillna(X_test.mean())


# In[38]:

# Fit every candidate model and record its score on the held-out split.
results_algo = {}
for algo, clf in model.items():
    clf.fit(X_train, y_train.astype(int))
    score = clf.score(X_test, y_test.astype(int))
    print ("%s : %s " %(algo, score))
    results_algo[algo] = score



# In[39]:

# Name of the model with the highest recorded score.
winner = max(results_algo, key=results_algo.get)


# In[40]:

# Re-evaluate the winning model on the held-out split.
clf = model[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
# Rates are taken per matrix row: the off-diagonal cell divided by the row sum.
# NOTE(review): assumes two classes with row 0 as the negative class; confirm.
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))


# In[41]:

# Per-class metrics summary for the same predictions.
from sklearn import metrics
print (metrics.classification_report(y_test, res))


# In[34]:

# Three alternative sample sentences; each assignment replaces the previous
# one, so when run top to bottom only the last value is used.
# NOTE(review): the cell numbers suggest the notebook cells were run in a
# different order; confirm which sample was intended.
test_data = "public meetings are awkard for me as I can insult people but I choose not to and that is something that I find difficult to live with"


# In[101]:

test_data="I purchased this product 4.47 billion years ago and when I opened it today, it was half empty."


# In[82]:

test_data="when people see me eating and ask me are you eating? No no I'm trying to choke myself to death #sarcastic"


# In[102]:

# Extract the features of the single sample sentence.
test_feature = []
test_feature.append((get_features(test_data,topic_mod)))


# In[104]:

# Notebook display of the extracted features (no effect when run as a script).
test_feature


# In[105]:

c = []

# The single sample gets row index 0.  The original used `i`, a loop
# variable left over from an earlier cell.
c.append(pd.DataFrame(test_feature[0],index=[0]))

test_result = pd.concat(c)
# Align the sample with the training feature columns (same names, same
# order).  Columns the sample lacks are filled with 0, and columns unknown to
# the training table, including 'Topic :', are dropped.  Without this the
# sample has a different number of features from the matrix the model was
# fitted on.
train_columns = result.drop(['label','Unnamed: 0','Topic :'],axis=1).columns
test_result = test_result.reindex(columns=train_columns, fill_value=0).values


# In[106]:

res= clf.predict(test_result)

但它给了我以下错误：

C:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
[('a', 'DT'), ('long', 'JJ'), ('narrow', 'JJ'), ('open', 'JJ')]
[('a', 'DT'), ('long', 'JJ')]
[('narrow', 'JJ'), ('open', 'JJ')]
done
5
Traceback (most recent call last):
  File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 276, in <module>
    topic_feature(features,"A LoNg NArrow opening",topic_mod)
  File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 268, in topic_feature
    topics = topic_modeler.transform(sentence)    
  File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\topic.py", line 42, in transform
    return self.lda[corpus_sentence]     
  File "C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\ldamodel.py", line 1160, in __getitem__
    return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'

在这里可以找到整个代码。

minimum_phi_value

是 LdaModel
的一个属性，它在创建实例时被设置，但由于某种原因没有被序列化（这很奇怪，可能是一个 bug）。
要解决这个具体问题，您可以添加
self.lda.minimum_phi_value = 0.01

……在 self.lda
加载之后；或者尽可能避免保存/恢复模型（即每次都重新训练）。
不过我建议您在序列化前后检查 self.lda
的字段，确认它们是否一致。
评论：能否贴出 topic.py

的源代码？
评论：@Maxim 非常感谢，这部分代码是我写的，我会检查后回复您。
评论：LDA 中 per_word_topics 的默认值是多少？

self.lda.minimum_phi_value = 0.01