Python TypeError: AdvancedBooleanSubtensor.grad illegally returned an integer-valued variable. (Input index 0, dtype int64)

Tags: python, theano

I ran into this problem in Theano. The main issue is Theano's scan() operator, which causes AdvancedBooleanSubtensor.grad to illegally return an integer-valued variable. Any help would be appreciated. (If disconnected_inputs is left at its default of 'raise', a theano.gradient.DisconnectedInputError is raised instead.) My Theano version is 1.0.1.
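The AdvancedBooleanSubtensor nodes presumably come from the boolean-mask indexing inside the scan step function (indicesAll[includeIds], indicesSub[includeIds], unmaskedIds[includeIds]). A minimal illustration of where such a node appears (my own sketch, not part of the extracted code):

import theano
from theano import tensor

idx = tensor.ivector('idx')  # int32 index vector
keep = idx >= 0              # boolean mask over idx
kept = idx[keep]             # boolean indexing of an integer tensor creates an
                             # AdvancedBooleanSubtensor node whose input is int-typed

Here is the code I extracted: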
import numpy as np
from theano import tensor
import theano
from collections import OrderedDict
theano.config.floatX = "float32"
profile=False
def ortho_weight(ndim):
    W = np.random.randn(ndim, ndim)
    u, s, v = np.linalg.svd(W) # SVD of the ndim*ndim matrix W: W = U S V^H
    return u.astype(theano.config.floatX)
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin) # return a nin*nin orthogonal matrix from the SVD
    else:
        W = scale * np.random.randn(nin, nout) # random nin*nout matrix with all entries scaled
    return W.astype(theano.config.floatX)
def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.items():
        tparams[kk] = theano.shared(params[kk], name=kk) # shared variables
    return tparams
def itemlist(tparams):
    return [vv for kk, vv in tparams.items()]
# params
params = OrderedDict() # Create an ordered dictionary
context_mask = np.array([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # nsteps_src = 7 , n_samples = 5
mask = np.array([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # nsteps_trg = 7 , n_samples = 5
D = 1
numPositions = 2*D+1 # 3
nsteps_src = context_mask.shape[0]
nsteps_trg = mask.shape[0]
n_samples = context_mask.shape[1]
n_words = nsteps_trg*n_samples
dim_word = 6
params['Wemb_dec'] = norm_weight(n_words, dim_word) # 35*6
emb = theano.shared(params['Wemb_dec']) # shared variable
emb = emb.reshape([nsteps_trg, n_samples, dim_word])
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) # set the first timestep to 0, shift the rest
emb = emb_shifted
cc_=0.01*np.random.randn(nsteps_src,n_samples,4).astype(theano.config.floatX)
dim = cc_.shape[2] # dim = 4
context_mask = theano.shared(context_mask) # shared variable
context_mask1 = context_mask
mask = theano.shared(mask)
cc_ = theano.shared(cc_)
# these weight matrices and bias terms are used for the feedforward output networks, respectively
params['ff_state_W'] = norm_weight(dim, dim, scale=0.01, ortho=True) #
params['ff_state_b'] = np.zeros((dim,)).astype(theano.config.floatX)
params['ff_logit_lstm_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_lstm_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_prev_W'] = norm_weight(dim_word, dim_word, scale=0.01, ortho=False)
params['ff_logit_prev_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_ctx_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_ctx_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_W'] = norm_weight(dim_word, n_words, scale=0.01, ortho=True)
params['ff_logit_b'] = np.zeros((n_words,)).astype(theano.config.floatX)
# These parameters will be used for Gated Recurrent Unit(GRU) in scan().
params['W'] = np.concatenate([norm_weight(dim_word, dim), norm_weight(dim_word, dim)], axis=1) # dim_word*2dim
params['b'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['U'] = np.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1) # two orthogonal (SVD) matrices (dim*dim) concatenated along columns
params['Wx'] = norm_weight(dim_word, dim) # dim_word*dim
params['Ux'] = ortho_weight(dim) # orthogonal (SVD) matrix (dim*dim)
params['bx'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_nl'] = np.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1) # two orthogonal (SVD) matrices (dim*dim) concatenated along columns
params['b_nl'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['Ux_nl'] = ortho_weight(dim) # orthogonal (SVD) matrix (dim*dim)
params['bx_nl'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['Wc'] = norm_weight(dim, dim*2) # dim*2dim
params['Wcx'] = norm_weight(dim, dim) # orthogonal matrix (dim*dim), since nout == nin and ortho=True
params['W_comb_att'] = norm_weight(dim, dim) # orthogonal matrix (dim*dim)
params['Wc_att'] = norm_weight(dim) # orthogonal matrix (dim*dim)
params['b_att'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_att'] = norm_weight(dim, 1) # dim*1
params['c_tt'] = np.zeros((1,)).astype(theano.config.floatX)
tparams = init_tparams(params)
def _slice(_x, n, dim): # tensor slicing helper: get [:, n*dim:(n+1)*dim] of a tensor
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]
ctx_mean = (cc_ * context_mask[:, :, None]).sum(0) / context_mask.sum(0)[:, None]
init_state = tensor.tanh(tensor.dot(ctx_mean,tparams['ff_state_W'])+tparams['ff_state_b']) # Initialization of h_ in _step_slice of scan()
state_belowx = tensor.dot(emb, tparams['Wx']) +tparams['bx']
# state_belowx = emb*tparams['Wx']+tparams['bx']
state_below_ = tensor.dot(emb, tparams['W']) + tparams['b']
# state_below_ = emb*tparams['W']+tparams['b']
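# _step_slice performs one decoder timestep inside scan(): a first GRU update
# (h1), a local-attention step that predicts a window of source positions and
# gathers a context vector ctx_, and a second, context-conditioned GRU update (h2)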
def _step_slice(m, m_, x_, xx_, h_, ctx_, alpha_, cc_, context_mask1,
                Wc_att, b_att, U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
                U_nl, Ux_nl, b_nl, bx_nl):
    preact1 = tensor.dot(h_, U)
    preact1 += x_
    preact1 = tensor.nnet.sigmoid(preact1)
    r1 = _slice(preact1, 0, dim)
    u1 = _slice(preact1, 1, dim)
    preactx1 = tensor.dot(h_, Ux)
    preactx1 *= r1
    preactx1 += xx_
    h1 = tensor.tanh(preactx1)
    h1 = u1 * h_ + (1. - u1) * h1
    h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ # h1 is the decoder hidden state, batchsize*dim
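    # --- local attention window: for each unmasked sample, predict a source
    # position pstate_ and gather a (2*D+1)-wide window of source annotations
    # around it; all of the index tensors built below are integer-valued ---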
    srclen = (context_mask1.sum(0, keepdims=True) - 1).T # source sentence lengths of this batch
    pctx__ = tensor.tanh(tensor.dot(h1, W_comb_att))
    pctx__ = tensor.dot(pctx__, U_att) + c_tt # batchsize*1
    pstate_ = srclen * tensor.nnet.sigmoid(pctx__) + 1 # +1: eos is considered
    srcPositions = tensor.floor(pstate_) # batchsize*1
    srcPositions = tensor.cast(srcPositions, dtype='int32') # positions are indices, so cast to int, e.g. 3.6 --> 3
    unmaskedId = tensor.flatnonzero(m) # indices of samples unmasked at this timestep, length (batchsize-x)
    srcPositions = srcPositions[unmaskedId, :] # (batchsize-x)*1
    srclen = srclen[unmaskedId, :] # (batchsize-x)*1
    startAttnIds = srcPositions - D
    indicesAll = startAttnIds.repeat(numPositions, axis=1)
    indicesAll += tensor.mgrid[0:unmaskedId.shape[0], 0:numPositions][1] # (batchsize-x)*numPositions
    indicesAll = indicesAll.T.flatten() # 1*(numPositions*(batchsize-x))
    # drop window positions (centered on pstate_ with radius D) that fall below 0
    # or beyond the maximum sentence length
    includeIds = (indicesAll <= tensor.tile(srclen, [numPositions, 1]).flatten()) & (indicesAll >= 0)
    indicesAll = indicesAll[includeIds] # dimensionality reduced: 1*((numPositions*(batchsize-x))-y)
    indicesSub = tensor.arange(0, numPositions).repeat(unmaskedId.shape[0]) # repeated numPositions times, 1*(numPositions*(batchsize-x))
    indicesSub = indicesSub[includeIds] # 1*((numPositions*(batchsize-x))-y)
    unmaskedIds = tensor.tile(unmaskedId, numPositions) # tiled numPositions times, 1*(numPositions*(batchsize-x))
    unmaskedIds = unmaskedIds[includeIds] # 1*((numPositions*(batchsize-x))-y)
    srcVecsSub = tensor.zeros([numPositions * n_samples, dim]) # 15*4
    linearIdSub = indicesSub * n_samples + unmaskedIds # 1*((numPositions*(batchsize-x))-y)
    linearIdAll = indicesAll * n_samples + unmaskedIds # 1*((numPositions*(batchsize-x))-y)
    cc_ = tensor.reshape(cc_, [nsteps_src * n_samples, dim]) # 35*4
    srcVecsSub = tensor.set_subtensor(srcVecsSub[linearIdSub, :],
                                      cc_[linearIdAll, :]) # (numPositions*n_samples)*dim
    srcVecsSub = srcVecsSub.reshape([numPositions, n_samples, dim])
    e_ij = tensor.dot(srcVecsSub, Wc_att) + b_att # equivalent to W_a*hs+b_a, numPositions*batchSize*dim
    e_ij = (h1 * e_ij).sum(2) # equivalent to h_t' * (W_a*hs+b_a), numPositions*batchSize
    scaleX = (indicesAll - tensor.tile(pstate_[unmaskedId], (numPositions, 1)).flatten()) / (D / 2)
    # unmaskedIds.shape[0]/n_samples, i.e. ((numPositions*(batchsize-x))-y)/batchsize
    distWeights = tensor.zeros([numPositions, n_samples])
    distWeights = tensor.set_subtensor(distWeights[indicesSub, unmaskedIds],
                                       scaleX) # numPositions*batchSize
    alpha = e_ij * tensor.exp(-0.5 * tensor.square(distWeights))
    alpha = alpha - alpha.max(0) # subtract the column max for numerical stability
    alpha = tensor.exp(alpha) # numPositions*batchSize
    context_mask_ = tensor.zeros([numPositions, n_samples])
    context_mask_ = tensor.set_subtensor(context_mask_[indicesSub, unmaskedIds], 1.)
    if context_mask_: # context_mask_ is x_mask (nsteps_src*batchSize) truncated to numPositions*batchSize
        alpha = alpha * context_mask_
    alpha_sum = alpha.sum(0, keepdims=True) # ∑eij
    alpha_sum = tensor.switch(alpha_sum, alpha_sum, 1.) # if alpha_sum == 0 -> 1, avoid division by zero
    alpha = alpha / alpha_sum # (numPositions*batchSize), eij/∑eij
    # current context: (numPositions*batchSize*dim).sum(0) --> batchSize*dim
    ctx_ = (srcVecsSub * alpha[:, :, None]).sum(0)
    preact2 = tensor.dot(h1, U_nl) + b_nl
    preact2 += tensor.dot(ctx_, Wc)
    preact2 = tensor.nnet.sigmoid(preact2)
    r2 = _slice(preact2, 0, dim)
    u2 = _slice(preact2, 1, dim)
    preactx2 = tensor.dot(h1, Ux_nl) + bx_nl
    preactx2 *= r2
    preactx2 += tensor.dot(ctx_, Wcx)
    h2 = tensor.tanh(preactx2)
    h2 = u2 * h1 + (1. - u2) * h2
    h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
    return h2, ctx_, alpha.T
seqs = [context_mask, mask, state_below_, state_belowx]
shared_vars = [tparams['Wc_att'],
tparams['b_att'],
tparams['U'],
tparams['Wc'],
tparams['W_comb_att'],
tparams['U_att'],
tparams['c_tt'],
tparams['Ux'],
tparams['Wcx'],
tparams['U_nl'],
tparams['Ux_nl'],
tparams['b_nl'],
tparams['bx_nl']]
rval, updates = theano.scan(_step_slice,
sequences=seqs,
outputs_info=[init_state,
tensor.alloc(0., n_samples,
cc_.shape[2]),
tensor.alloc(0., n_samples,
cc_.shape[0])],
non_sequences=[cc_,context_mask1]+shared_vars,
name='layers',
n_steps=nsteps_trg,
profile=profile,
strict=True)
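# rval stacks the per-timestep outputs of _step_slice: hidden states, context
# vectors, and attention weights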
proj_h = rval[0]
ctxs = rval[1]
# 3 feedforward neural networks,used to indicate the contribution of proj_h, emb, ctxs to cost
logit_lstm = tensor.dot(proj_h,tparams['ff_logit_lstm_W'])+\
tparams['ff_logit_lstm_b']
logit_prev = tensor.dot(emb,tparams['ff_logit_prev_W'])+\
tparams['ff_logit_prev_b']
logit_ctx =tensor.dot(ctxs,tparams['ff_logit_ctx_W'])+\
tparams['ff_logit_ctx_b']
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
# a feedforward neural network layer + softmax
logit = tensor.dot(logit,tparams['ff_logit_W'])+tparams['ff_logit_b']
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1],
logit_shp[2]]))
# cost
cost = -tensor.log(probs.flatten())
cost = cost.reshape([mask.shape[0], mask.shape[1]])
cost = (cost * mask).sum(0)
# make mean(), tensor->scalar
cost = cost.mean()
print('Computing gradient...', end=" ")
grads = tensor.grad(cost, wrt=itemlist(tparams),disconnected_inputs = 'ignore')
print('Done')
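For what it's worth, one direction I would try (a sketch under the assumption that no gradient needs to flow through the integer window indices, not a verified fix): the float expression scaleX depends on the integer tensor indicesAll, which is produced by boolean-mask indexing, so tensor.grad traverses AdvancedBooleanSubtensor and asks it for a gradient with respect to its int64 input. Explicitly disconnecting the index tensor with theano.gradient.disconnected_grad should stop that traversal while leaving the differentiable path through pstate_ intact:

# hypothetical change inside _step_slice: treat the integer window indices as
# constants for the purposes of differentiation
from theano import gradient

indicesAll_ng = gradient.disconnected_grad(indicesAll)  # no grad through indices
scaleX = (indicesAll_ng - tensor.tile(pstate_[unmaskedId],
                                      (numPositions, 1)).flatten()) / (D / 2)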