Python Scikit learn/pandas-使用机器学习预测用户输入的文本(以xlsx表示)

Python Scikit learn/pandas-使用机器学习预测用户输入的文本(以xlsx表示),python,pandas,scikit-learn,Python,Pandas,Scikit Learn,我有Xlsx文件,预定义的文本只有一列。用户将输入一个或多个单词,输出将是包含一个或多个单词的文本 import numpy as np import pandas as pd import time import re from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer from sklearn.metrics.pairwise import linear_

我有Xlsx文件,预定义的文本只有一列。用户将输入一个或多个单词,输出将是包含一个或多个单词的文本

import numpy as np
import pandas as pd
import time
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
import pickle


def load_df(path):
    df = pd.read_excel(path)
    print(df.shape)
    return df


def splitDataFrameList(df, target_column, separator):
    def splitListToRows(row, row_accumulator, target_column, separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)

    new_rows = []
    df.apply(splitListToRows, axis=1, args=(new_rows, target_column, separator))
    new_df = pd.DataFrame(new_rows)
    return new_df


class Autocompleter:
    def __init__(self):
        pass

    def import_json(self, json_filename):
        print("load Excel file...")
        df = load_df(json_filename)
        return df

    def process_data(self, new_df):

        # print("select representative threads...")
        # new_df = new_df[new_df.IsFromCustomer == False]

        print("split sentenses on punctuation...")
        for sep in ['. ', ', ', '? ', '! ', '; ']:
            new_df = splitDataFrameList(new_df, 'UserSays', sep)

        print("UserSays Cleaning using simple regex...")
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: " ".join(x.split()))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.strip("."))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: " ".join(x.split()))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.strip("-"))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: " ".join(x.split()))
        # new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.lower())
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace(' i ', ' I '))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace(' ?', '?'))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace(' !', '!'))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace(' .', '.'))
        # new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace('OK', 'Ok'))
        # new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x[0].upper() + x[1:])
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x + "?" if re.search(r'^(Wh|How).+([^?])$', x) else x)
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.capitalize())

        print("calculate nb words of sentenses...")
        new_df['nb_words'] = new_df['UserSays'].apply(lambda x: len(str(x).split(' ')))
        new_df = new_df[new_df['nb_words'] > 2]

        print("count occurence of sentenses...")
        new_df['Counts'] = new_df.groupby(['UserSays'])['UserSays'].transform('count')

        print("remove duplicates (keep last)...")
        new_df = new_df.drop_duplicates(subset=['UserSays'], keep='last')

        new_df = new_df.reset_index(drop=True)
        print(new_df.shape)

        return new_df

    def calc_matrice(self, df):
        # define tfidf parameter in order to count/vectorize the description vector and then normalize it.
        model_tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0)
        tfidf_matrice = model_tf.fit_transform(df['UserSays'])
        pickle.dump(model_tf, open("model.pkl", 'wb'))
        pickle.dump(tfidf_matrice, open("train.pkl", 'wb'))
        print("tfidf_matrice ", tfidf_matrice.shape)
        return model_tf, tfidf_matrice

    def generate_completions(self, prefix_string, data, model_tf, tfidf_matrice):

        prefix_string = str(prefix_string)
        new_df = data.reset_index(drop=True)
        weights = new_df['Counts'].apply(lambda x: 1 + np.log1p(x)).values

        # tranform the string using the tfidf model
        tfidf_matrice_spelling = model_tf.transform([prefix_string])
        # calculate cosine_matrix
        cosine_similarite = cosine_similarity(tfidf_matrice, tfidf_matrice_spelling)

        # sort by order of similarity from 1 to 0:
        similarity_scores = list(enumerate(cosine_similarite))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        similarity_scores = similarity_scores[0:10]

        similarity_scores = [i for i in similarity_scores]
        similarity_indices = [i[0] for i in similarity_scores]

        # add weight to the potential results that had high frequency in orig data
        for i in range(len(similarity_scores)):
            similarity_scores[i][1][0] = similarity_scores[i][1][0] * weights[similarity_indices][i]

        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        similarity_scores = similarity_scores[0:]
        similarity_indices_w = [i[0] for i in similarity_scores]
        final_result = new_df.loc[similarity_indices_w]['UserSays'].tolist()
        return final_result
在输入中,如果我不输入任何内容,它将为我提供此输出

['How to access outlook on open network?', 'Email access outside ril network', 'Log in outlook away from office']
这是不可取的 如果只有一个文本匹配,它将给出以下输出

input - sccm
['What is sccm', 'How to access outlook on open network?', 'Email access outside ril network']
我希望输出的方式,如果输入的字或字不存在于xlsx文件中,那么输出不应该返回我任何东西。
我认为,您的代码返回的值的相似性分数为0。您可以更改
generate_completions
函数中的行,以仅保留相似度得分大于零的值:

similarity_scores = [i for i in similarity_scores if i[1] > 0]

看起来您希望在返回输出之前对相似性分数应用某种阈值。现在返回的值具有余弦相似性=0@kev8484是的,更正更改代码'similarity_scores=[i for i in similarity_scores if i>0]后,我收到以下错误。TypeError:'>'在'tuple'和'int'的实例之间不受支持。'i在某一点之前获得所需的输出。但对于少数输入,它会给出空白输出。但是谢谢你的建议