Python rity_文件（r'）） #打印有关相似性矩阵文件的一些摘要统计信息打印“示例数：%d”%（W.shape[0]）打印“Sim矩阵：nnz=%d，密度=%.2f%，每个示例的邻域平均数：%.2f”%（W.nnz，100*（float（W.nnz）/（W_Python_Numpy_Multiprocessing

Python rity_文件（r'）） #打印有关相似性矩阵文件的一些摘要统计信息打印“示例数：%d”%（W.shape[0]）打印“Sim矩阵：nnz=%d，密度=%.2f%，每个示例的邻域平均数：%.2f”%（W.nnz，100*（float（W.nnz）/（W

python numpy

Python rity_文件（r'）） #打印有关相似性矩阵文件的一些摘要统计信息打印“示例数：%d”%（W.shape[0]）打印“Sim矩阵：nnz=%d，密度=%.2f%，每个示例的邻域平均数：%.2f”%（W.nnz，100*（float（W.nnz）/（W,python,numpy,multiprocessing,Python,Numpy,Multiprocessing,rity_文件（r'）） #打印有关相似性矩阵文件的一些摘要统计信息打印“示例数：%d”%（W.shape[0]）打印“Sim矩阵：nnz=%d，密度=%.2f%，每个示例的邻域平均数：%.2f”%（W.nnz，100*（float（W.nnz）/（W.shape[0]**2）），float（W.nnz）/W.shape[0]） ########################################

rity_文件（r'）） #打印有关相似性矩阵文件的一些摘要统计信息打印“示例数：%d”%（W.shape[0]）打印“Sim矩阵：nnz=%d，密度=%.2f%，每个示例的邻域平均数：%.2f”%（W.nnz，100*（float（W.nnz）/（W.shape[0]**2）），float（W.nnz）/W.shape[0]） ######################################## #加载标签 ######################################## def转换器标签（标签）：唯一标签=np.唯一（标签） label_dict={} idx=0 对于唯一_标签中的标签： label\u dict[label]=idx idx+=1 返回标签 LABELS=np.load（LABELS\u文件）打印“唯一标签数：%d”%（np.unique（标签）.shape）标签=转换标签（标签）新标签=np.array（[label\u dict[label]用于标签中的标签]）数据集大小=标签。形状[0] 标签=新标签 W=W+alpha*sp.identity（数据集大小） ######################################## #定义已标记和未标记的IDX ######################################## def make_test_set（）： idx=np.random.rand（数据集大小） l=（idx<标签百分比） u=（idx>=标签百分比）返回l，u 五十、 U=制作测试集（） def createRDistribution（标签）： rows=np.array（范围（0，数据集大小），dtype=int） label\u idx=np.where（~label\u bool）行=np.delete（行，标签\u idx） cols=np.delete（标签、标签和idx） VAL=np.ones（（rows.shape[0]，1））.ravel（） sparseR=sp.csc_矩阵（（vals，（rows，cols）），shape=（数据集大小，最大（标签）+1））返回稀疏器 ######################################## #对数据进行分发 ######################################## R=createRDistribution（L，标签）#标签分布稀疏 classNoLabel=np，其中（R.sum（0）==0） #print classNoLabel#需要计算出labeld集合中有多少类没有表示 Q0=np.0（（数据集大小，最大（标签）+1），dtype=np.double） Q0+=1.0/Q0.形状[1] Q1=np.0（（数据集大小，最大（标签）+1），dtype=np.double） P0=np.0（（数据集大小，最大（标签）+1），数据类型=np.double） P1=np.0（（数据集大小，最大（标签）+1），dtype=np.double） def gamma（W#u sum，i）：#W#u sum是sim矩阵W的所有列的总和返回v+mu*W_和[i] def更新_p（startIdx、endIdx、betaMatrix、out_q）： p=np.零（（endIdx startIdx，max（标签）+1）） W_colsum=W.sum（1）对于x范围内的i（startIdx，endIdx）：#对于i 0到numExamples-1 dist=np.zero（最大（标签）+1，dtype=np.float64） g=伽马（W_colsum，i） dist=np.exp（betaMatrix[i，：]/g） dist/=np.和（dist） p[i-startIdx，：]=dist 输出（p） def update_q（startIdx、endIdx、sumMatrix、out_q）： q=np.零（（endIdx startIdx，max（标签）+1）） W_rowsum=W.sum（0）对于X范围内的i（startIdx，endIdx）： dist=np.zero（最大（标签）+1，dtype=np.float64）如果L[i]为0，则标记为1 dist=（R[i，：]*标记为+mu*sumMatrix[i，：]）/（标记为+mu*W_rowsum[0，i]） dist/=np.和（dist） q[i-startIdx，：]=dist 输出（q） def 
update_pq_包装（curIter、chunksize、mat、update_step='p'）： startIdx=xrange（0，数据集大小，块大小）普雷维特=1-库里特过程=[] out_q=队列（）对于范围内的i（ncores）：#现在，启动每个组装矩阵的子进程

def update_p(startIdx, endIdx, betaMatrix, out_q):
  """Compute rows startIdx..endIdx-1 of the P update and put them on out_q.

  Row i of the result is exp(betaMatrix[i,:] / gamma(W_colsum, i)),
  normalised so that it sums to one.

  startIdx, endIdx -- half-open range of example rows handled by this call
  betaMatrix       -- matrix indexed [example, class]
  out_q            -- object with a put() method; receives a single array of
                      shape (endIdx-startIdx, max(LABELS)+1)

  Reads the module-level W and LABELS and calls the module-level gamma().
  """
  # Hoisted out of the loop: scanning LABELS once per row made the loop
  # quadratic in the number of examples for no benefit.
  n_classes = np.max(LABELS) + 1
  p = np.zeros((endIdx-startIdx, n_classes))
  W_colsum = W.sum(1)
  for i in xrange(startIdx, endIdx):
    scaled = betaMatrix[i,:] / gamma(W_colsum, i)
    # Subtract the row maximum before exponentiating. The normalised result
    # is mathematically unchanged, but exp() can no longer underflow every
    # entry to zero (which would turn the row into 0/0 = NaN) or overflow.
    dist = np.exp(scaled - np.max(scaled))
    dist /= np.sum(dist)
    p[i-startIdx,:] = dist
  out_q.put(p)

def update_q(startIdx, endIdx, sumMatrix, out_q):
  """Compute rows startIdx..endIdx-1 of the Q update and put them on out_q.

  For each example i the row is
    (R[i,:]*labeled + mu*sumMatrix[i,:]) / (labeled + mu*W_rowsum[0,i])
  where labeled is 1 if L[i] is truthy and 0 otherwise; the row is then
  normalised to sum to one.

  startIdx, endIdx -- half-open range of example rows handled by this call
  sumMatrix        -- matrix indexed [example, class]
  out_q            -- object with a put() method; receives a single array of
                      shape (endIdx-startIdx, max(LABELS)+1)

  Reads the module-level W, R, L, LABELS and mu.
  """
  # Hoisted out of the loop: the original rebuilt a throw-away zero vector
  # (and rescanned LABELS to size it) on every iteration.
  n_classes = np.max(LABELS) + 1
  q = np.zeros((endIdx-startIdx, n_classes))
  W_rowsum = W.sum(0)
  for i in xrange(startIdx, endIdx):
    labeled = 1 if L[i] else 0
    dist = (R[i,:]*labeled + mu*sumMatrix[i,:]) / (labeled + mu*W_rowsum[0,i])
    dist /= np.sum(dist)
    q[i-startIdx,:] = dist
  out_q.put(q)

def update_pq_wrapper(curIter, chunksize, mat, update_step='p'):
  """Run one P or Q update in parallel and store it in the global buffer.

  Rows 0..dataset_size-1 are split into contiguous chunks of `chunksize`
  rows. One child process is started per chunk, never more than `ncores`;
  the last process also takes whatever rows remain. Each child runs
  update_p or update_q on its chunk and the returned rows are copied into
  the destination matrix.

  curIter     -- truthy writes into P1/Q1, falsy writes into P0/Q0
  chunksize   -- number of rows per worker; must be at least 1
  mat         -- matrix passed unchanged to update_p / update_q
  update_step -- 'p' runs update_p, any other value runs update_q

  Raises ValueError if chunksize is smaller than 1.
  """
  if chunksize < 1:
    raise ValueError('chunksize must be a positive integer')
  if update_step == 'p':
    worker = update_p
    distMat = P1 if curIter else P0
  else:
    worker = update_q
    distMat = Q1 if curIter else Q0
  # Do not start more workers than there are chunks; indexing past the last
  # chunk start used to raise IndexError for small data sets.
  n_chunks = (dataset_size + chunksize - 1) // chunksize
  n_workers = min(ncores, n_chunks)
  starts = [k * chunksize for k in range(n_workers)]
  ends = starts[1:] + [dataset_size]
  jobs = []
  for start, end in zip(starts, ends):
    # One queue per worker. With a single shared queue the results arrive in
    # completion order, so a chunk could be written to another chunk's rows.
    result_q = Queue()
    proc = Process(target=worker, args=(start, end, mat, result_q))
    proc.start()
    jobs.append((start, end, proc, result_q))
  # Read every result before joining: a child that still has data buffered
  # for its queue does not exit until that data has been consumed.
  for start, end, proc, result_q in jobs:
    distMat[start:end,:] = result_q.get()
  for start, end, proc, result_q in jobs:
    proc.join()

#!/usr/bin/python
import sys, commands, string, cPickle
import numpy as np
import scipy.sparse as sp
import scipy.stats as stats
import scipy.linalg as la
from math import ceil
from time import clock
from multiprocessing import Process, Queue
from Queue import Empty

np.random.seed(42)
if not len(sys.argv) == 9:
  print 'ERROR: Usage: python alternating_minimization.py <binary data or sim matrix> <labels_file> <iterations> <num cores> <label percent> <v> <mu> <alpha>'
  sys.exit()

########################################                                                                                                                                                                                                      
# Main Parameters                                                                                                                                                                                                                             
########################################                                                                                                                                                                                                      
similarity_file = sys.argv[1] #output of simgraph_construction.py                                                                                                                                                                             
labels_file = sys.argv[2]
niterations = int(sys.argv[3])
ncores = int(sys.argv[4])

########################################                                                                                                                                                                                                      
# meta parameters                                                                                                                                                                                                                             
########################################                                                                                                                                                                                                      
label_percent = float(sys.argv[5])
v = float(sys.argv[6])
mu = float(sys.argv[7])
alpha = float(sys.argv[8])

########################################                                                                                                                                                                                                      
# load the data file (output of simgraph_construction.py) which is already in numpy format                                                                                                                                                    
########################################                                                                                                                                                                                                      
W = cPickle.load(open(similarity_file, 'r'))
#print some summary statistics about the similarity matrix file                                                                                                                                                                               
print "Number of examples: %d"%(W.shape[0])
print "Sim Matrix: nnz = %d, density = %.2f percent, average # of neighbors per example: %.2f"%(W.nnz, 100*(float(W.nnz)/(W.shape[0]**2)), float(W.nnz)/W.shape[0])

########################################                                                                                                                                                                                                      
# load the labels                                                                                                                                                                                                                             
########################################                                                                                                                                                                                                      
def convertLabels(labels):
  """Map every distinct label to a consecutive integer index.

  The distinct values are taken in the sorted order returned by np.unique,
  so the smallest label maps to 0, the next one to 1, and so on.

  labels -- array-like of label values
  Returns a dict {label value: index}.
  """
  distinct = np.unique(labels)
  return dict((value, position) for position, value in enumerate(distinct))

LABELS = np.load(labels_file)
print "Number of unique labels: %d"%(np.unique(LABELS).shape)
label_dict = convertLabels(LABELS)
NEW_LABELS = np.array([label_dict[label] for label in LABELS])
dataset_size = LABELS.shape[0]
LABELS = NEW_LABELS
W = W + alpha*sp.identity(dataset_size)

########################################                                                                                                                                                                                                      
# define the labeled and unlabeled idxs                                                                                                                                                                                                       
########################################                                                                                                                                                                                                      
def make_test_set():
  """Randomly split the examples into a labeled and an unlabeled set.

  One random number is drawn per example with np.random.rand; an example is
  labeled when its draw is below the module-level label_percent and
  unlabeled otherwise.

  Returns (labeled, unlabeled): two boolean arrays of length dataset_size.
  """
  draws = np.random.rand(dataset_size)
  labeled_mask = draws < label_percent
  unlabeled_mask = draws >= label_percent
  return labeled_mask, unlabeled_mask

# L flags the examples treated as labeled, U the remaining ones.
L,U = make_test_set()
def createRDistribution(label_bool, labels):
  """Build the sparse one-hot label matrix of the labeled examples.

  label_bool -- boolean array, True where the example counts as labeled
  labels     -- class index of every example

  Returns a csc_matrix of shape (dataset_size, max(labels)+1) holding 1.0 at
  (i, labels[i]) for every i with label_bool[i] True; the rows of all other
  examples stay empty.
  """
  # Positions of the examples whose label must be left out.
  hidden = np.where(~label_bool)
  every_row = np.arange(0, dataset_size, dtype=int)
  kept_rows = np.delete(every_row, hidden)
  kept_cols = np.delete(labels, hidden)
  ones = np.ones(kept_rows.shape[0])
  n_classes = max(labels) + 1
  return sp.csc_matrix((ones, (kept_rows, kept_cols)), shape=(dataset_size, n_classes))

########################################                                                                                                                                                                                                      
# make the distributions for the data                                                                                                                                                                                                         
########################################                                                                                                                                                                                                      
# Sparse one-hot label rows for the labeled examples, empty rows otherwise.
R = createRDistribution(L, LABELS)
# Columns of R that sum to zero, i.e. classes without any labeled example.
classNoLabel = np.where(R.sum(0) == 0)
#print classNoLabel #need to figure out how many classes are unrepresented in the labeled set
# Q0 starts out uniform over the classes; the other three buffers start at
# zero. The 0/1 pairs are the two alternating buffers used by the iterations.
Q0 = np.ones((dataset_size, max(LABELS)+1), dtype=np.double)
Q0 /= Q0.shape[1]
Q1 = np.zeros(Q0.shape, dtype=np.double)
P0 = np.zeros(Q0.shape, dtype=np.double)
P1 = np.zeros(Q0.shape, dtype=np.double)

def gamma(W_sum,i):
  """Return v + mu * W_sum[i], using the module-level v and mu.

  W_sum -- sum across all columns of the similarity matrix W, indexed by
           example
  i     -- index of the example
  """
  weight_of_example = W_sum[i]
  return v + mu * weight_of_example

def update_p(startIdx, endIdx, betaMatrix, out_q):
  """Compute rows startIdx..endIdx-1 of the P update and put them on out_q.

  Row i of the result is exp(betaMatrix[i,:] / gamma(W_colsum, i)),
  normalised so that it sums to one.

  startIdx, endIdx -- half-open range of example rows handled by this call
  betaMatrix       -- matrix indexed [example, class]
  out_q            -- object with a put() method; receives a single array of
                      shape (endIdx-startIdx, max(LABELS)+1)

  Reads the module-level W and LABELS and calls the module-level gamma().
  """
  # Hoisted out of the loop: scanning LABELS once per row made the loop
  # quadratic in the number of examples for no benefit.
  n_classes = np.max(LABELS) + 1
  p = np.zeros((endIdx-startIdx, n_classes))
  W_colsum = W.sum(1)
  for i in xrange(startIdx, endIdx):
    scaled = betaMatrix[i,:] / gamma(W_colsum, i)
    # Subtract the row maximum before exponentiating. The normalised result
    # is mathematically unchanged, but exp() can no longer underflow every
    # entry to zero (which would turn the row into 0/0 = NaN) or overflow.
    dist = np.exp(scaled - np.max(scaled))
    dist /= np.sum(dist)
    p[i-startIdx,:] = dist
  out_q.put(p)

def update_q(startIdx, endIdx, sumMatrix, out_q):
  """Compute rows startIdx..endIdx-1 of the Q update and put them on out_q.

  For each example i the row is
    (R[i,:]*labeled + mu*sumMatrix[i,:]) / (labeled + mu*W_rowsum[0,i])
  where labeled is 1 if L[i] is truthy and 0 otherwise; the row is then
  normalised to sum to one.

  startIdx, endIdx -- half-open range of example rows handled by this call
  sumMatrix        -- matrix indexed [example, class]
  out_q            -- object with a put() method; receives a single array of
                      shape (endIdx-startIdx, max(LABELS)+1)

  Reads the module-level W, R, L, LABELS and mu.
  """
  # Hoisted out of the loop: the original rebuilt a throw-away zero vector
  # (and rescanned LABELS to size it) on every iteration.
  n_classes = np.max(LABELS) + 1
  q = np.zeros((endIdx-startIdx, n_classes))
  W_rowsum = W.sum(0)
  for i in xrange(startIdx, endIdx):
    labeled = 1 if L[i] else 0
    dist = (R[i,:]*labeled + mu*sumMatrix[i,:]) / (labeled + mu*W_rowsum[0,i])
    dist /= np.sum(dist)
    q[i-startIdx,:] = dist
  out_q.put(q)

def update_pq_wrapper(curIter, chunksize, mat, update_step='p'):
  """Run one P or Q update in parallel and store it in the global buffer.

  Rows 0..dataset_size-1 are split into contiguous chunks of `chunksize`
  rows. One child process is started per chunk, never more than `ncores`;
  the last process also takes whatever rows remain. Each child runs
  update_p or update_q on its chunk and the returned rows are copied into
  the destination matrix.

  curIter     -- truthy writes into P1/Q1, falsy writes into P0/Q0
  chunksize   -- number of rows per worker; must be at least 1
  mat         -- matrix passed unchanged to update_p / update_q
  update_step -- 'p' runs update_p, any other value runs update_q

  Raises ValueError if chunksize is smaller than 1.
  """
  if chunksize < 1:
    raise ValueError('chunksize must be a positive integer')
  if update_step == 'p':
    worker = update_p
    distMat = P1 if curIter else P0
  else:
    worker = update_q
    distMat = Q1 if curIter else Q0
  # Do not start more workers than there are chunks; indexing past the last
  # chunk start used to raise IndexError for small data sets.
  n_chunks = (dataset_size + chunksize - 1) // chunksize
  n_workers = min(ncores, n_chunks)
  starts = [k * chunksize for k in range(n_workers)]
  ends = starts[1:] + [dataset_size]
  jobs = []
  for start, end in zip(starts, ends):
    # One queue per worker. With a single shared queue the results arrive in
    # completion order, so a chunk could be written to another chunk's rows.
    result_q = Queue()
    proc = Process(target=worker, args=(start, end, mat, result_q))
    proc.start()
    jobs.append((start, end, proc, result_q))
  # Read every result before joining: a child that still has data buffered
  # for its queue does not exit until that data has been consumed.
  for start, end, proc, result_q in jobs:
    distMat[start:end,:] = result_q.get()
  for start, end, proc, result_q in jobs:
    proc.join()

def compute_tvdist(P,Q):
  """Average per-example gap between the distributions in P and Q.

  For each of the first dataset_size rows the largest absolute element-wise
  difference between the row of P and the row of Q is taken; the function
  returns the mean of those values.

  NOTE(review): this is the largest single-class gap per row, not half the
  L1 distance that "total variation" usually denotes -- confirm the intent.
  """
  row_gaps = (max(np.absolute(P[row,:] - Q[row,:])) for row in range(0, dataset_size))
  return sum(row_gaps)/dataset_size

def main(argv):
  accuracyArr = []
  tvdistArr = []
  print >> sys.stderr, 'Starting %d iterations...' % niterations
  chunksize = int(ceil(dataset_size/float(ncores)))
  for n in xrange(1,niterations+1):
    print >> sys.stderr, 'Iteration %d' % n
    idx = n % 2
    q_prev = Q1 if not idx else Q0
    p_cur = P1 if idx else P0
    #print q_prev                                                                                                                                                                                                                         
    start_time = clock()
    mat = -v + mu*(W*(np.log(q_prev)-1))
    end_time = clock()
    #print mat                                                                                                                                                                                                                            
    print "Time taken to compute Beta Matrix: %.2f seconds"%(end_time-start_time)
    start_time=clock()
    update_pq_wrapper(idx, chunksize, mat, 'p')
    end_time=clock()
    print "Time taken to update P matrix: %.2f seconds"%(end_time-start_time)
    if not n == niterations:
      start_time = clock()
      mat = W.T*p_cur
      end_time = clock()
      print "Time taken to compute Sum Matrix: %.2f seconds"%(end_time-start_time)
      start_time = clock()
      update_pq_wrapper(idx, chunksize, mat, 'q')
      end_time = clock()
      print "Time taken to update Q matrix: %.2f seconds"%(end_time-start_time)
    ## Evaluation ##                                                                                                                                                                                                                      
    evalMat = P1 if idx else P0                                                                                                                                                   
    predLabel = np.argmax(evalMat, axis=1) #gives the index (column)                                                                                                                                                                      
    accuracy = float(np.sum(predLabel[np.where(U)] == LABELS[np.where(U)]) )/ LABELS[np.where(U)].shape[0]
    print "Accuracy: %.2f"%(accuracy*100)
    accuracyArr.append(accuracy)
    totalVar = []
    if n != niterations:
        tv_dist = compute_tvdist(P1, Q1) if idx else compute_tvdist(P0, Q0)
    else:
        tv_dist = compute_tvdist(P1, Q0) if idx else compute_tvdist(P0, Q1)
    print "Average Total Variation Distance is %.3f"%(tv_dist)
    tvdistArr.append(tv_dist)
  print "Summary of final probability density matrix: "
  print evalMat
  print '\t'.join([str(round(acc,4)) for acc in accuracyArr])