用 Python 从零实现 TF-IDF,并按前 n 个 IDF 值返回前 n 个特征
我目前正在学习 TF-IDF 向量化器,并尝试从零实现 TF-IDF 以便更好地理解。我创建了下面的类来实现 TF-IDF 向量化器。该类主要有两个方法:fit() 方法从文档语料库中找出唯一的单词,并计算这些单词对应的 IDF 值;transform() 方法为语料库中的文档生成 TF-IDF 向量。
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np
class TFIDFVectoriser_:
    """A from-scratch TF-IDF vectoriser that mimics sklearn's TfidfVectorizer.

    Usage:
        vec = TFIDFVectoriser_()
        vocab, idf = vec.fit(corpus)                  # corpus: list of str
        matrix = vec.transform(corpus, vocab, idf)    # L2-normalised csr_matrix
    """

    def __init__(self):
        """Initialise empty vocabulary state."""
        self.__uniqueSet = set()    # unique words collected from the document corpus
        self.__uniqueVocab = {}     # word -> column number mapping learned by fit()

    def fit(self, dataset):
        """Learn the vocabulary and IDF values from a document corpus.

        Parameters
        ----------
        dataset : list of str
            Each element is one whitespace-separated document.

        Returns
        -------
        tuple(dict, dict) or None
            ``(vocab, idf)`` where ``vocab`` maps word -> column number and
            ``idf`` maps word -> IDF value. Prints an error and returns
            ``None`` when ``dataset`` is not a list.
        """
        if not isinstance(dataset, list):
            print("Input Error: This function only accepts list as input. Pass a list of words to continue...")
            return None
        for document in dataset:
            for word in document.split(" "):
                # Words shorter than 2 characters are treated as noise/stopwords.
                if len(word) < 2:
                    continue
                self.__uniqueSet.add(word)
        # Sort once so the word -> column mapping is deterministic.
        # (Keep self.__uniqueSet a set; the original clobbered it with a list.)
        ordered_words = sorted(self.__uniqueSet)
        self.__uniqueVocab = {word: col for col, word in enumerate(ordered_words)}
        idf_values = self.__IDF(dataset, self.__uniqueVocab)
        return self.__uniqueVocab, idf_values

    def __IDF(self, dataset, vocab):
        """Return smoothed IDF values for every vocabulary word.

        Uses sklearn's smoothed formula: idf(w) = 1 + ln((1 + N) / (1 + df(w)))
        where N is the corpus size and df(w) the number of documents containing w.
        """
        idf_values = {}
        total_docs = len(dataset)
        for word in vocab:
            # Number of documents that contain the word at least once.
            doc_count = sum(1 for document in dataset if word in document.split())
            idf_values[word] = 1 + math.log((1 + total_docs) / (1 + doc_count))
        return idf_values

    def transform(self, dataset, vocab, idfValues):
        """Build the L2-normalised TF-IDF matrix for the documents in ``dataset``.

        Parameters
        ----------
        dataset : list of str
            Documents to vectorise.
        vocab : dict
            word -> column number mapping (as returned by fit()).
        idfValues : dict
            word -> IDF value mapping (as returned by fit()).

        Returns
        -------
        scipy.sparse.csr_matrix of shape (len(dataset), len(vocab)), each row
        L2-normalised, or ``None`` (after printing an error) for non-list input.
        """
        if not isinstance(dataset, list):
            print("Input Error: This function only accepts list as input. Pass a list of words to continue...")
            return None
        # Use local buffers: the original accumulated entries on self, so a
        # second transform() call mixed in coordinates from the previous call
        # (crashing csr_matrix for smaller datasets, corrupting it otherwise).
        rows, columns, values = [], [], []
        for doc_idx, document in enumerate(dataset):
            word_freq = Counter(document.split(" "))            # word -> raw count
            doc_len = len(document.split())                     # document length for tf
            for word, freq in word_freq.items():
                if len(word) < 2:                               # skip noise/stopwords
                    continue
                col_idx = vocab.get(word, -1)                   # -1: word not in vocabulary
                if col_idx != -1:
                    rows.append(doc_idx)
                    columns.append(col_idx)
                    values.append((freq / doc_len) * idfValues[word])  # tf * idf
        matrix = csr_matrix((values, (rows, columns)),
                            shape=(len(dataset), len(vocab)))
        # L2-normalise each row (equivalent to sklearn.preprocessing.normalize
        # with norm='l2'); all-zero rows are left untouched.
        row_norms = np.sqrt(np.asarray(matrix.multiply(matrix).sum(axis=1))).ravel()
        row_norms[row_norms == 0.0] = 1.0
        return matrix.multiply(1.0 / row_norms[:, np.newaxis]).tocsr()
我可以得到前 50 个特征(对应前 50 个 IDF 值),其中也包括重复的 IDF 值。
我想知道我的方法是否正确,以及是否有更好的方法从 TF-IDF 向量化器中获得前 n 个特征。
# Sort (word, idf) pairs by IDF value, highest first, and keep the top 50.
sortedIDF = sorted(__idfValues.items(), key=lambda item: item[1], reverse=True)
__top50idf = dict(sortedIDF[:50])  # dict preserves the sorted insertion order
# Kept for backward compatibility with code expecting a set of the top words.
__top50Set = set(__top50idf)
# Build the column mapping from the *ordered* dict keys, not from the set:
# iterating a set scrambles the order, which made the original column
# numbering (and hence the resulting matrix) non-deterministic across runs.
__top50Vocab = {word: col for col, word in enumerate(__top50idf)}
['angelina', 'angela', 'angel', 'anatomist', 'amust', 'amusing', 'amazingly', 'amazed', 'amaze', 'amateurish', 'alongside', 'allowing', 'allow', 'allison', 'alike', 'alert', 'akin', 'akasha', 'aired', 'aimless', 'agreed', 'agree', 'ages', 'aged', 'afternoon', 'affleck', 'affected', 'aesthetically', 'adventure', 'adrift', 'admitted', 'admiration', 'admins', 'added', 'add', 'adams', 'actions', 'ackerman', 'achille', 'accurately', 'accurate', 'accolades', 'acclaimed', 'accessible', 'accents', 'academy', 'abstruse', 'abroad', 'abandoned', 'aailiyah']
50
[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]