Python:从 scipy.sparse.coo_matrix 中采样 n 个零元素
如何(有效地)从scipy.sparse.coo_矩阵中采样零值Python 从稀疏.coo_矩阵中采样n个零,python,numpy,scipy,sparse-matrix,Python,Numpy,Scipy,Sparse Matrix,如何(有效地)从scipy.sparse.coo_矩阵中采样零值 >>> import numpy as np >>> from scipy.sparse import coo_matrix >>> # create sparse array >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> X_sparse = coo_matrix(X) >
>>> import numpy as np
>>> from scipy.sparse import coo_matrix
>>> # create sparse array
>>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
>>> X_sparse = coo_matrix(X)
>>> # randomly sample 0's from X_sparse, retrieving as [(row, col), (row_col)]
>>> def sample_zeros(sp_arr, n, replacement=False):
>>> # ???
>>> return negs
>>> zero_indices = sample_zeros(X_sparse, n=3, replacement=False)
>>> print(zero_indices)
[(0, 1), (2, 0), (2, 1)]
效率在这里很重要,因为我将在一个为神经网络提供输入数据的迭代器中完成这项工作。
我认为没有一种高效的方法可以利用稀疏矩阵结构来做到这一点:
In [197]: >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
...: >>> X_sparse = sparse.coo_matrix(X)
In [198]: X_sparse
Out[198]:
<3x2 sparse matrix of type '<class 'numpy.float64'>'
with 3 stored elements in COOrdinate format>
In [199]: print(X_sparse)
(0, 0) 1.0
(1, 0) 2.0
(1, 1) 1.0
我们可以求稀疏矩阵的所有0:
In [209]: X_sparse==0
/usr/local/lib/python3.6/dist-packages/scipy/sparse/compressed.py:214: SparseEfficiencyWarning: Comparing a sparse matrix with 0 using == is inefficient, try using != instead.
", try using != instead.", SparseEfficiencyWarning)
Out[209]:
<3x2 sparse matrix of type '<class 'numpy.bool_'>'
with 3 stored elements in Compressed Sparse Row format>
In [210]: print(_)
(0, 1) True
(2, 0) True
(2, 1) True
[209]中的:X_sparse==0
/usr/local/lib/python3.6/dist packages/scipy/sparse/compressed.py:214:SparseEfficiencyWarning:使用==将稀疏矩阵与0进行比较效率低下,请尝试使用!=相反
“,请尝试改用!=。”,SparSeeEfficiencyWarning)
出[209]:
在[210]中:打印
(0,1)正确
(2,0)正确
(2,1)正确
我认为没有一种有效的方法可以利用稀疏矩阵结构:
In [197]: >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
...: >>> X_sparse = sparse.coo_matrix(X)
In [198]: X_sparse
Out[198]:
<3x2 sparse matrix of type '<class 'numpy.float64'>'
with 3 stored elements in COOrdinate format>
In [199]: print(X_sparse)
(0, 0) 1.0
(1, 0) 2.0
(1, 1) 1.0
我们可以求稀疏矩阵的所有0:
In [209]: X_sparse==0
/usr/local/lib/python3.6/dist-packages/scipy/sparse/compressed.py:214: SparseEfficiencyWarning: Comparing a sparse matrix with 0 using == is inefficient, try using != instead.
", try using != instead.", SparseEfficiencyWarning)
Out[209]:
<3x2 sparse matrix of type '<class 'numpy.bool_'>'
with 3 stored elements in Compressed Sparse Row format>
In [210]: print(_)
(0, 1) True
(2, 0) True
(2, 1) True
[209]中的:X_sparse==0
/usr/local/lib/python3.6/dist packages/scipy/sparse/compressed.py:214:SparseEfficiencyWarning:使用==将稀疏矩阵与0进行比较效率低下,请尝试使用!=相反
“,请尝试改用!=。”,SparSeeEfficiencyWarning)
出[209]:
在[210]中:打印
(0,1)正确
(2,0)正确
(2,1)正确
由于您知道 X 的形状,因此可以使用 np.random.choice 在 X 中生成随机的 (row, col) 位置:
h, w = X.shape
rows = np.random.choice(h, size=n)
cols = np.random.choice(w, size=n)
主要困难在于如何检查某个 (row, col) 是否是 X 中的非零位置。
有一种方法可以做到这一点:先创建一个新的稀疏矩阵 X,它在原 X 非零的位置上等于 1。
接下来,创建一个新的稀疏矩阵 Y,它在上面生成的随机位置上具有非零值。然后做减法:
Y = Y - X.multiply(Y)
这样,在 X 非零的位置上,稀疏矩阵 Y 将为零。
因此,如果我们已经在 Y 中生成了足够多的非零值,就可以把它们的 (row, col) 位置作为 sample_negs 的返回值:
import unittest
import sys
import numpy as np
import scipy.sparse as sparse
def sample_negs(X, n=3, replace=False):
    """Randomly sample ``n`` positions at which the matrix ``X`` is zero.

    Parameters
    ----------
    X : scipy sparse matrix
        Two-dimensional matrix to sample from. Any scipy sparse format is
        accepted. Duplicate coordinates and explicitly stored zeros in a
        COO matrix are handled: a position counts as nonzero only if the
        summed value stored there is nonzero.
    n : int
        Number of positions to return.
    replace : bool
        If False, the returned positions are all distinct. If True, the
        same position may be returned more than once.

    Returns
    -------
    list of (row, col) tuples of Python ints, in random order.

    Raises
    ------
    ValueError
        If ``n`` is negative, if ``replace`` is False and ``X`` has fewer
        than ``n`` zeros, or if ``replace`` is True and ``X`` has no zeros.

    Notes
    -----
    Randomness comes from the global ``np.random`` state.
    """
    if n == 0:
        return []

    h, w = X.shape
    # ``X != 0`` yields a canonical boolean pattern: duplicate entries are
    # summed and explicit zeros are dropped, so ``nnz`` is the true number of
    # nonzero cells (``X.size`` would only count stored entries).
    nonzero = sparse.coo_matrix(X != 0)
    m = h * w - nonzero.nnz  # number of zero cells available for sampling

    if (n < 0) or (not replace and m < n) or (replace and m == 0):
        raise ValueError("{n} samples from {m} locations do not exist"
                         .format(n=n, m=m))

    if n / m > 0.5:
        # A large share of the zeros is requested, so rejection sampling
        # would be wasteful: enumerate every zero position and draw directly
        # from that list (hpaulj's idea,
        # https://stackoverflow.com/a/53577267/190597).
        is_zero = np.ones((h, w), dtype=bool)
        is_zero[nonzero.row, nonzero.col] = False
        rows, cols = np.nonzero(is_zero)
        idx = np.random.choice(rows.size, size=n, replace=replace)
    else:
        # Rejection sampling: draw random cells into Y, then cancel those
        # that coincide with a nonzero cell of X.
        indicator = nonzero.astype(np.float64).tocsr()  # 1.0 where X != 0
        Y = sparse.coo_matrix((h, w))
        n_valid = 0
        while n_valid < n:
            needed = n - n_valid
            Y_data = np.concatenate([Y.data, np.ones(needed)])
            Y_row = np.concatenate([Y.row, np.random.choice(h, size=needed)])
            Y_col = np.concatenate([Y.col, np.random.choice(w, size=needed)])
            Y = sparse.coo_matrix((Y_data, (Y_row, Y_col)), shape=(h, w))
            # Remove values in Y where X is nonzero. The subtraction also
            # consolidates duplicate (row, col) draws, leaving in Y.data the
            # number of times each surviving cell was drawn.
            Y = sparse.coo_matrix(Y - indicator.multiply(Y))
            # With replacement every accepted draw counts; without
            # replacement only distinct cells count.
            n_valid = int(Y.data.sum()) if replace else Y.nnz

        if replace:
            counts = Y.data.astype(int)
            rows = np.repeat(Y.row, counts)
            cols = np.repeat(Y.col, counts)
        else:
            rows, cols = Y.row, Y.col
        # Y is ordered by position, so shuffle before returning.
        idx = np.random.choice(rows.size, size=n, replace=False)

    return list(zip(rows[idx].tolist(), cols[idx].tolist()))
class Test(unittest.TestCase):
    """Property checks for ``sample_negs`` on a random sparse COO matrix."""

    def setUp(self):
        """Build a 5%-dense random matrix and the set of its zero positions."""
        import warnings
        self.ncols, self.nrows = 100, 100
        # scipy.sparse.random takes (rows, cols) in that order.
        self.X = sparse.random(self.nrows, self.ncols, density=0.05, format='coo')
        # Comparing a sparse matrix with 0 emits SparseEfficiencyWarning;
        # silence it only for this statement instead of process-wide.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=sparse.SparseEfficiencyWarning)
            Y = sparse.coo_matrix(self.X == 0)
        self.expected = set(zip(Y.row, Y.col))

    def test_n_too_large(self):
        """Requests that cannot be satisfied raise ValueError."""
        # More samples than the matrix has cells, without replacement.
        too_many = self.nrows * self.ncols + 1
        self.assertRaises(ValueError, sample_negs, self.X, n=too_many, replace=False)
        # A matrix with no zeros at all, even with replacement.
        X_dense = sparse.coo_matrix(np.ones((4, 2)))
        self.assertRaises(ValueError, sample_negs, X_dense, n=1, replace=True)

    def test_no_replacement(self):
        """Without replacement: right count, and only zero positions."""
        for m in range(100):
            negative_list = sample_negs(self.X, n=m, replace=False)
            negative_set = set(negative_list)
            self.assertEqual(len(negative_list), m)
            # For sets, ``<=`` is the subset test.
            self.assertLessEqual(negative_set, self.expected)

    def test_no_repeats_when_replace_is_false(self):
        """Without replacement no position is returned twice."""
        negative_list = sample_negs(self.X, n=10, replace=False)
        self.assertEqual(len(negative_list), len(set(negative_list)))

    def test_dense_replacement(self):
        """With replacement, sample counts around the number of zeros work."""
        N = self.ncols * self.nrows
        m = N - self.X.size  # number of zero cells in X
        for i in [-1, 0, 1]:
            negative_list = sample_negs(self.X, n=m + i, replace=True)
            negative_set = set(negative_list)
            self.assertEqual(len(negative_list), m + i)
            self.assertLessEqual(negative_set, self.expected)

    def test_sparse_replacement(self):
        """With replacement, small sample counts return only zero positions."""
        for m in range(100):
            negative_list = sample_negs(self.X, n=m, replace=True)
            negative_set = set(negative_list)
            self.assertEqual(len(negative_list), m)
            self.assertLessEqual(negative_set, self.expected)
if __name__ == '__main__':
    # Run the test suite in verbose mode: splice the flag in right after the
    # program name, keeping any arguments the user passed.
    sys.argv[1:1] = ['--verbose']
    unittest.main(argv=sys.argv)
导入单元测试
导入系统
将numpy作为np导入
将scipy.sparse导入为稀疏
def样本(X,n=3,replace=False):
N=np.产品(X.形状)
m=N-X.size
如果n==0:
结果=[]
elif(n<0)或(不替换且m0.5:
#Y(在下面的else子句中)将非常密集,因此没有意义
#尝试使用稀疏技术。让我们用hpaulj的想法
# (https://stackoverflow.com/a/53577267/190597)相反。
进口警告
警告。过滤器警告(“忽略”,类别=稀疏。稀疏有效警告)
Y=稀疏.coo_矩阵(X==0)
rows=Y.row
cols=Y.col
idx=np.random.choice(len(行),size=n,replace=replace)
结果=列表(zip(行[idx],列[idx]))
其他:
X_行,X_列=X.row,X.col
X_数据=np.one(X.size)
X=sparse.coo_矩阵((X_数据,(X_行,X_列)),shape=X.shape)
h、 w=X.形状
Y=稀疏.coo_矩阵(X.shape)
Y_尺寸=0
当Y_尺寸