用 Python 从零实现 TF-IDF,并按前 n 个 IDF 值返回前 n 个特征
我目前正在学习 TF-IDF 向量化器,并尝试从零实现 TF-IDF 以便更好地理解。我创建了下面的类来实现 TF-IDF 向量化器。该类主要有两个方法:fit() 方法从文档语料库中找出唯一的单词,并计算这些单词对应的 IDF 值;transform() 方法为语料库中的文档生成 TF-IDF 向量。
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np
class TFIDFVectoriser_:
    """A from-scratch TF-IDF vectoriser that mimics sklearn's TfidfVectorizer.

    Usage:
        vec = TFIDFVectoriser_()
        vocab, idf = vec.fit(corpus)                  # corpus: list of str
        matrix = vec.transform(corpus, vocab, idf)    # L2-normalised csr_matrix
    """

    def __init__(self):
        """Initialise empty vocabulary state."""
        self.__uniqueSet = set()    # unique words collected from the document corpus
        self.__uniqueVocab = {}     # word -> column number mapping learned by fit()

    def fit(self, dataset):
        """Learn the vocabulary and IDF values from a document corpus.

        Parameters
        ----------
        dataset : list of str
            Each element is one whitespace-separated document.

        Returns
        -------
        tuple(dict, dict) or None
            ``(vocab, idf)`` where ``vocab`` maps word -> column number and
            ``idf`` maps word -> IDF value. Prints an error and returns
            ``None`` when ``dataset`` is not a list.
        """
        if not isinstance(dataset, list):
            print("Input Error: This function only accepts list as input. Pass a list of words to continue...")
            return None
        for document in dataset:
            for word in document.split(" "):
                # Words shorter than 2 characters are treated as noise/stopwords.
                if len(word) < 2:
                    continue
                self.__uniqueSet.add(word)
        # Sort once so the word -> column mapping is deterministic.
        # (Keep self.__uniqueSet a set; the original clobbered it with a list.)
        ordered_words = sorted(self.__uniqueSet)
        self.__uniqueVocab = {word: col for col, word in enumerate(ordered_words)}
        idf_values = self.__IDF(dataset, self.__uniqueVocab)
        return self.__uniqueVocab, idf_values

    def __IDF(self, dataset, vocab):
        """Return smoothed IDF values for every vocabulary word.

        Uses sklearn's smoothed formula: idf(w) = 1 + ln((1 + N) / (1 + df(w)))
        where N is the corpus size and df(w) the number of documents containing w.
        """
        idf_values = {}
        total_docs = len(dataset)
        for word in vocab:
            # Number of documents that contain the word at least once.
            doc_count = sum(1 for document in dataset if word in document.split())
            idf_values[word] = 1 + math.log((1 + total_docs) / (1 + doc_count))
        return idf_values

    def transform(self, dataset, vocab, idfValues):
        """Build the L2-normalised TF-IDF matrix for the documents in ``dataset``.

        Parameters
        ----------
        dataset : list of str
            Documents to vectorise.
        vocab : dict
            word -> column number mapping (as returned by fit()).
        idfValues : dict
            word -> IDF value mapping (as returned by fit()).

        Returns
        -------
        scipy.sparse.csr_matrix of shape (len(dataset), len(vocab)), each row
        L2-normalised, or ``None`` (after printing an error) for non-list input.
        """
        if not isinstance(dataset, list):
            print("Input Error: This function only accepts list as input. Pass a list of words to continue...")
            return None
        # Use local buffers: the original accumulated entries on self, so a
        # second transform() call mixed in coordinates from the previous call
        # (crashing csr_matrix for smaller datasets, corrupting it otherwise).
        rows, columns, values = [], [], []
        for doc_idx, document in enumerate(dataset):
            word_freq = Counter(document.split(" "))            # word -> raw count
            doc_len = len(document.split())                     # document length for tf
            for word, freq in word_freq.items():
                if len(word) < 2:                               # skip noise/stopwords
                    continue
                col_idx = vocab.get(word, -1)                   # -1: word not in vocabulary
                if col_idx != -1:
                    rows.append(doc_idx)
                    columns.append(col_idx)
                    values.append((freq / doc_len) * idfValues[word])  # tf * idf
        matrix = csr_matrix((values, (rows, columns)),
                            shape=(len(dataset), len(vocab)))
        # L2-normalise each row (equivalent to sklearn.preprocessing.normalize
        # with norm='l2'); all-zero rows are left untouched.
        row_norms = np.sqrt(np.asarray(matrix.multiply(matrix).sum(axis=1))).ravel()
        row_norms[row_norms == 0.0] = 1.0
        return matrix.multiply(1.0 / row_norms[:, np.newaxis]).tocsr()
我可以得到前 50 个特征(对应前 50 个 IDF 值),其中也包括重复的 IDF 值。
我想知道我的方法是否正确,以及是否有更好的方法从 TF-IDF 向量化器中获得前 n 个特征。
# Sort (word, idf) pairs by IDF value, highest first, and keep the top 50.
sortedIDF = sorted(__idfValues.items(), key=lambda item: item[1], reverse=True)
__top50idf = dict(sortedIDF[:50])  # dict preserves the sorted insertion order
# Kept for backward compatibility with code expecting a set of the top words.
__top50Set = set(__top50idf)
# Build the column mapping from the *ordered* dict keys, not from the set:
# iterating a set scrambles the order, which made the original column
# numbering (and hence the resulting matrix) non-deterministic across runs.
__top50Vocab = {word: col for col, word in enumerate(__top50idf)}
['angelina', 'angela', 'angel', 'anatomist', 'amust', 'amusing', 'amazingly', 'amazed', 'amaze', 'amateurish', 'alongside', 'allowing', 'allow', 'allison', 'alike', 'alert', 'akin', 'akasha', 'aired', 'aimless', 'agreed', 'agree', 'ages', 'aged', 'afternoon', 'affleck', 'affected', 'aesthetically', 'adventure', 'adrift', 'admitted', 'admiration', 'admins', 'added', 'add', 'adams', 'actions', 'ackerman', 'achille', 'accurately', 'accurate', 'accolades', 'acclaimed', 'accessible', 'accents', 'academy', 'abstruse', 'abroad', 'abandoned', 'aailiyah']
50
[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]