Python 3.x pairwise similarity: excluding same-set document pairs from an np.array of observations


I have a pairwise similarity matrix of type
, containing 60 observations (documents), which looks like this:

[1.         0.0285112  0.1086421  0.51545511 0.02654882 0.03421325
 0.06870461 0.07113071 0.10700325 0.04108417 0.04043452 0.0530722
 0.05553519 0.06539258 0.02926745 0.01993073 0.20549184 0.06444957
 0.02306557 0.01456114 0.03625259 0.0626876  0.05634662 0.09667151
 0.28826583 0.32641213 0.0221891  0.03414899 0.00572818 0.03421211
 0.04580534 0.0340349  0.00458075 0.44820483 0.01758159 0.03908213
 0.03068077 0.02275432 0.04928581 0.00685353 0.06051659 0.05815797
 0.05871991 0.03184051 0.04556673 0.0778122  0.04121536 0.03094321
 0.00686861 0.02125814 0.01668948 0.03714428 0.02346818 0.02186168
 0.01369949 0.06438152 0.02852305 0.04970634 0.0768884  0.02776949] 
However, the documents come from two different datasets (30 documents each), and I am not interested in the similarity pairs between documents of the same set. So I only want to keep the similarity values between documents from different sets.

This effectively means that for the 60 observations (documents) I would get 60 arrays like the one above, but each containing only 30 values. Given the list
labels
holding the document filenames, is there a way to exclude the same-set document pairs from the 60 arrays? Or is there another way to achieve this?

So far, my code looks like this:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
from sklearn.cluster import KMeans
import sklearn

#Create list of documents to work with
path = "C:\\Users\\path\\dataset"
text_files = os.listdir(path)
#print (text_files)

#check we are only working with text files
#join each filename with the directory path, since os.listdir returns bare names
documents = [open(os.path.join(path, f), encoding="utf-8").read() for f in text_files if f.endswith('.txt')]

#Convert a collection of raw documents to a matrix of TF-IDF features.
tfidf_vectorizer = TfidfVectorizer(max_df=29)

sparse_matrix = tfidf_vectorizer.fit_transform(documents)

#create a list of labels to use later in the plot
labels = []
for f in text_files:
    if f.endswith('.txt'):
        labels.append(f)

#similarity
pairwise_similarity = sparse_matrix * sparse_matrix.T
sm = pairwise_similarity.toarray()

#write similarity to file
with open('C:\\Users\\path\\results\\results.txt', 'w') as f:
  
    for item in sm:
        for label in labels:
            f.write(label)
            f.write('\n')            
        f.write("%s\n" % item)
        f.write('\n')
Note that the final
#write similarity to file
block contains a semantic error, which is also part of the problem.
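Assuming the first 30 entries of labels belong to one dataset and the last 30 to the other (the filenames below are made up for illustration), one possible sketch drops the same-set pairs by slicing the cross-set block out of the 60×60 matrix, and pairs each remaining value with its two filenames, which also avoids the label/row mismatch in the #write similarity to file loop:

```python
import numpy as np

# Toy stand-in for the real 60x60 cosine-similarity matrix
# (hypothetical data; in the question this would be pairwise_similarity.toarray()).
rng = np.random.default_rng(0)
sm = rng.random((60, 60))

# Hypothetical filenames: the first 30 from dataset 1, the last 30 from dataset 2.
labels = ["set1_doc%02d.txt" % i for i in range(30)] + \
         ["set2_doc%02d.txt" % i for i in range(30)]

n = 30  # documents per dataset
# Keep only the cross-set block: rows = dataset-1 docs, columns = dataset-2 docs.
# This discards the two same-set 30x30 blocks (top-left and bottom-right).
cross = sm[:n, n:]

# One output line per cross-set pair: "<set-1 file>\t<set-2 file>\t<similarity>",
# so every value is written next to the two filenames it belongs to.
lines = [f"{labels[i]}\t{labels[n + j]}\t{cross[i, j]:.8f}"
         for i in range(n) for j in range(n)]
```

If the two datasets are interleaved in the file listing rather than neatly ordered, a boolean membership mask can be used instead of a plain slice, e.g. cross = sm[np.ix_(mask, ~mask)].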