分层聚类NLP中每簇N个最频繁关键词的提取
我想从凝聚层次聚类（Agglomerative Hierarchical Clustering）的结果中，提取每个聚类里 N 个最频繁的关键词。代码如下：
def agglomerative_clustering(tfidf_matrix, n_clusters=95):
    """Cluster documents by their TF-IDF vectors and return cluster labels.

    Args:
        tfidf_matrix: dense 2-D array of shape (n_documents, n_features);
            ward linkage does not accept sparse input, so callers must
            densify first.
        n_clusters: number of clusters to form (default 95, the original
            hard-coded value).

    Returns:
        numpy array of shape (n_documents,) giving one cluster id per row.
    """
    # NOTE(review): `affinity` was renamed `metric` in scikit-learn 1.2 and
    # removed in 1.4 — switch the keyword name when upgrading sklearn.
    cluster = AgglomerativeClustering(
        n_clusters=n_clusters, affinity='euclidean', linkage='ward'
    )
    cluster.fit_predict(tfidf_matrix)
    print(cluster.n_clusters_)
    labels = cluster.labels_
    # Debug output; fixed the "lables" typo in the original message.
    print("labels is " + str(labels.shape))
    print("test" + str(labels))
    return labels
def tfidf(data):
    """Fit a TF-IDF vectorizer on the documents.

    Args:
        data: iterable of document strings.

    Returns:
        (vectors, feature_names): the sparse TF-IDF matrix of shape
        (n_documents, n_terms) and the vocabulary terms corresponding to
        its columns.
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor but keep a fallback for older installations.
    try:
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = vectorizer.get_feature_names()
    # The dense DataFrame the original built here (`dense`/`denselist`/`df`)
    # was never used and can exhaust memory on a large corpus — removed.
    return vectors, feature_names
# Vectorize the corpus and cluster it; ward linkage needs a dense matrix.
vectors, terms = tfidf(cleaned_documents)
labels = agglomerative_clustering(vectors.toarray())

# NOTE(review): `lib` is assumed to be a DataFrame defined earlier — confirm.
lib['cleaned_documents'] = pd.Series(cleaned_documents)
lib['clusterAgglomerative'] = pd.Series(labels)

# BUG FIX: the original referenced an undefined name `vectorized_data`;
# the TF-IDF matrix computed above is `vectors`. Columns are named with
# the vocabulary terms so per-term sums below are readable.
X = pd.DataFrame(vectors.toarray(), index=lib['cleaned_documents'], columns=terms)
X['Cluster'] = labels  # cluster id of each document

# Sum TF-IDF weights of every term within each cluster.
word_frequencies_by_cluster = X.groupby('Cluster').sum()

# Sorted term weights for one cluster (here cluster 2);
# append .head(n) to keep only the top-n keywords.
print("Top terms per cluster:")
print(word_frequencies_by_cluster.loc[2, :].sort_values(ascending=False))
我希望得到每个聚类中 N 个最常见的关键词。我尝试了下面这个方案，但它似乎效率不高：
# Documents indexed by their cluster label (second positional argument of
# pd.DataFrame is the index).
df_lib = pd.DataFrame(lib['cleaned_documents'], lib['clusterAgglomerative'])
print(df_lib)


def _join_unique(column):
    # Deduplicate the documents of one cluster and join them into a string.
    return ", ".join(set(column))


# groupby on the index level name, then collapse each group's documents.
keywords_by_cluster = (
    df_lib.groupby("clusterAgglomerative")["cleaned_documents"].agg(_join_unique)
)
print("keywords per cluster")
print(keywords_by_cluster)