Python TypeError: AdvancedBooleanSubtensor.grad illegally returned an integer-valued variable. (Input index 0, dtype int64)

Tags: python, theano

I ran into this problem in Theano. The main issue is Theano's scan() operator, which causes AdvancedBooleanSubtensor.grad to illegally return an integer-valued variable. Any help would be appreciated. (If disconnected_inputs is left at its default of 'raise', a theano.gradient.DisconnectedInputError is raised instead.) My Theano version is 1.0.1.
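The AdvancedBooleanSubtensor nodes presumably come from the boolean-mask indexing inside the scan step function (indicesAll[includeIds], indicesSub[includeIds], unmaskedIds[includeIds]). A minimal illustration of where such a node appears (my own sketch, not part of the extracted code):

import theano
from theano import tensor

idx = tensor.ivector('idx')  # int32 index vector
keep = idx >= 0              # boolean mask over idx
kept = idx[keep]             # boolean indexing of an integer tensor creates an
                             # AdvancedBooleanSubtensor node whose input is int-typed

Here is the code I extracted: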
import numpy as np
from theano import tensor
import theano
from collections import OrderedDict
theano.config.floatX = "float32"
profile=False
def ortho_weight(ndim):
    W = np.random.randn(ndim, ndim)
    u, s, v = np.linalg.svd(W) # SVD of the ndim*ndim matrix W: W = U S V^H
    return u.astype(theano.config.floatX)
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin) # return a nin*nin orthogonal matrix from the SVD
    else:
        W = scale * np.random.randn(nin, nout) # random nin*nout matrix with all entries scaled
    return W.astype(theano.config.floatX)
def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.items():
        tparams[kk] = theano.shared(params[kk], name=kk) # shared variables
    return tparams
def itemlist(tparams):
    return [vv for kk, vv in tparams.items()]
# params
params = OrderedDict() # Create an ordered dictionary
context_mask = np.array([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # nsteps_src = 7 , n_samples = 5
mask = np.array([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # nsteps_trg = 7 , n_samples = 5
D = 1
numPositions = 2*D+1 # 3
nsteps_src = context_mask.shape[0]
nsteps_trg = mask.shape[0]
n_samples = context_mask.shape[1]
n_words = nsteps_trg*n_samples
dim_word = 6
params['Wemb_dec'] = norm_weight(n_words, dim_word) # 35*6
emb = theano.shared(params['Wemb_dec']) # shared variable
emb = emb.reshape([nsteps_trg, n_samples, dim_word])
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) # set the first timestep to 0, shift the rest
emb = emb_shifted
cc_=0.01*np.random.randn(nsteps_src,n_samples,4).astype(theano.config.floatX)
dim = cc_.shape[2] # dim = 4
context_mask = theano.shared(context_mask) # shared variable
context_mask1 = context_mask
mask = theano.shared(mask)
cc_ = theano.shared(cc_)
# these weight matrices and bias terms are used for the feedforward output networks, respectively
params['ff_state_W'] = norm_weight(dim, dim, scale=0.01, ortho=True) #
params['ff_state_b'] = np.zeros((dim,)).astype(theano.config.floatX)
params['ff_logit_lstm_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_lstm_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_prev_W'] = norm_weight(dim_word, dim_word, scale=0.01, ortho=False)
params['ff_logit_prev_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_ctx_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_ctx_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_W'] = norm_weight(dim_word, n_words, scale=0.01, ortho=True)
params['ff_logit_b'] = np.zeros((n_words,)).astype(theano.config.floatX)
# These parameters will be used for Gated Recurrent Unit(GRU) in scan().
params['W'] = np.concatenate([norm_weight(dim_word, dim), norm_weight(dim_word, dim)], axis=1) # dim_word*2dim
params['b'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['U'] = np.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1) # two orthogonal (SVD) matrices (dim*dim) concatenated along columns
params['Wx'] = norm_weight(dim_word, dim) # dim_word*dim
params['Ux'] = ortho_weight(dim) # orthogonal (SVD) matrix (dim*dim)
params['bx'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_nl'] = np.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1) # two orthogonal (SVD) matrices (dim*dim) concatenated along columns
params['b_nl'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['Ux_nl'] = ortho_weight(dim) # orthogonal (SVD) matrix (dim*dim)
params['bx_nl'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['Wc'] = norm_weight(dim, dim*2) # dim*2dim
params['Wcx'] = norm_weight(dim, dim) # orthogonal matrix (dim*dim), since nout == nin and ortho=True
params['W_comb_att'] = norm_weight(dim, dim) # orthogonal matrix (dim*dim)
params['Wc_att'] = norm_weight(dim) # orthogonal matrix (dim*dim)
params['b_att'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_att'] = norm_weight(dim, 1) # dim*1
params['c_tt'] = np.zeros((1,)).astype(theano.config.floatX)
tparams = init_tparams(params)
def _slice(_x, n, dim): # tensor slicing helper: get [:, n*dim:(n+1)*dim] of a tensor
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]
ctx_mean = (cc_ * context_mask[:, :, None]).sum(0) / context_mask.sum(0)[:, None]
init_state = tensor.tanh(tensor.dot(ctx_mean,tparams['ff_state_W'])+tparams['ff_state_b']) # Initialization of h_ in _step_slice of scan()
state_belowx = tensor.dot(emb, tparams['Wx']) +tparams['bx']
# state_belowx = emb*tparams['Wx']+tparams['bx']
state_below_ = tensor.dot(emb, tparams['W']) + tparams['b']
# state_below_ = emb*tparams['W']+tparams['b']
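# _step_slice performs one decoder timestep inside scan(): a first GRU update
# (h1), a local-attention step that predicts a window of source positions and
# gathers a context vector ctx_, and a second, context-conditioned GRU update (h2)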
def _step_slice(m, m_, x_, xx_, h_, ctx_, alpha_, cc_, context_mask1,
                Wc_att, b_att, U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
                U_nl, Ux_nl, b_nl, bx_nl):
    preact1 = tensor.dot(h_, U)
    preact1 += x_
    preact1 = tensor.nnet.sigmoid(preact1)
    r1 = _slice(preact1, 0, dim)
    u1 = _slice(preact1, 1, dim)
    preactx1 = tensor.dot(h_, Ux)
    preactx1 *= r1
    preactx1 += xx_
    h1 = tensor.tanh(preactx1)
    h1 = u1 * h_ + (1. - u1) * h1
    h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ # h1 is the decoder hidden state, batchsize*dim
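    # --- local attention window: for each unmasked sample, predict a source
    # position pstate_ and gather a (2*D+1)-wide window of source annotations
    # around it; all of the index tensors built below are integer-valued ---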
    srclen = (context_mask1.sum(0, keepdims=True) - 1).T # source sentence lengths of this batch
    pctx__ = tensor.tanh(tensor.dot(h1, W_comb_att))
    pctx__ = tensor.dot(pctx__, U_att) + c_tt # batchsize*1
    pstate_ = srclen * tensor.nnet.sigmoid(pctx__) + 1 # +1: eos is considered
    srcPositions = tensor.floor(pstate_) # batchsize*1
    srcPositions = tensor.cast(srcPositions, dtype='int32') # positions are indices, so cast to int, e.g. 3.6 --> 3
    unmaskedId = tensor.flatnonzero(m) # indices of samples unmasked at this timestep, length (batchsize-x)
    srcPositions = srcPositions[unmaskedId, :] # (batchsize-x)*1
    srclen = srclen[unmaskedId, :] # (batchsize-x)*1
    startAttnIds = srcPositions - D
    indicesAll = startAttnIds.repeat(numPositions, axis=1)
    indicesAll += tensor.mgrid[0:unmaskedId.shape[0], 0:numPositions][1] # (batchsize-x)*numPositions
    indicesAll = indicesAll.T.flatten() # 1*(numPositions*(batchsize-x))
    # drop window positions (centered on pstate_ with radius D) that fall below 0
    # or beyond the maximum sentence length
    includeIds = (indicesAll <= tensor.tile(srclen, [numPositions, 1]).flatten()) & (indicesAll >= 0)
    indicesAll = indicesAll[includeIds] # dimensionality reduced: 1*((numPositions*(batchsize-x))-y)
    indicesSub = tensor.arange(0, numPositions).repeat(unmaskedId.shape[0]) # repeated numPositions times, 1*(numPositions*(batchsize-x))
    indicesSub = indicesSub[includeIds] # 1*((numPositions*(batchsize-x))-y)
    unmaskedIds = tensor.tile(unmaskedId, numPositions) # tiled numPositions times, 1*(numPositions*(batchsize-x))
    unmaskedIds = unmaskedIds[includeIds] # 1*((numPositions*(batchsize-x))-y)
    srcVecsSub = tensor.zeros([numPositions * n_samples, dim]) # 15*4
    linearIdSub = indicesSub * n_samples + unmaskedIds # 1*((numPositions*(batchsize-x))-y)
    linearIdAll = indicesAll * n_samples + unmaskedIds # 1*((numPositions*(batchsize-x))-y)
    cc_ = tensor.reshape(cc_, [nsteps_src * n_samples, dim]) # 35*4
    srcVecsSub = tensor.set_subtensor(srcVecsSub[linearIdSub, :],
                                      cc_[linearIdAll, :]) # (numPositions*n_samples)*dim
    srcVecsSub = srcVecsSub.reshape([numPositions, n_samples, dim])
    e_ij = tensor.dot(srcVecsSub, Wc_att) + b_att # equivalent to W_a*hs+b_a, numPositions*batchSize*dim
    e_ij = (h1 * e_ij).sum(2) # equivalent to h_t' * (W_a*hs+b_a), numPositions*batchSize
    scaleX = (indicesAll - tensor.tile(pstate_[unmaskedId], (numPositions, 1)).flatten()) / (D / 2)
    # unmaskedIds.shape[0]/n_samples, i.e. ((numPositions*(batchsize-x))-y)/batchsize
    distWeights = tensor.zeros([numPositions, n_samples])
    distWeights = tensor.set_subtensor(distWeights[indicesSub, unmaskedIds],
                                       scaleX) # numPositions*batchSize
    alpha = e_ij * tensor.exp(-0.5 * tensor.square(distWeights))
    alpha = alpha - alpha.max(0) # subtract the column max for numerical stability
    alpha = tensor.exp(alpha) # numPositions*batchSize
    context_mask_ = tensor.zeros([numPositions, n_samples])
    context_mask_ = tensor.set_subtensor(context_mask_[indicesSub, unmaskedIds], 1.)
    if context_mask_: # context_mask_ is x_mask (nsteps_src*batchSize) truncated to numPositions*batchSize
        alpha = alpha * context_mask_
    alpha_sum = alpha.sum(0, keepdims=True) # ∑eij
    alpha_sum = tensor.switch(alpha_sum, alpha_sum, 1.) # if alpha_sum == 0 -> 1, avoid division by zero
    alpha = alpha / alpha_sum # (numPositions*batchSize), eij/∑eij
    # current context: (numPositions*batchSize*dim).sum(0) --> batchSize*dim
    ctx_ = (srcVecsSub * alpha[:, :, None]).sum(0)
    preact2 = tensor.dot(h1, U_nl) + b_nl
    preact2 += tensor.dot(ctx_, Wc)
    preact2 = tensor.nnet.sigmoid(preact2)
    r2 = _slice(preact2, 0, dim)
    u2 = _slice(preact2, 1, dim)
    preactx2 = tensor.dot(h1, Ux_nl) + bx_nl
    preactx2 *= r2
    preactx2 += tensor.dot(ctx_, Wcx)
    h2 = tensor.tanh(preactx2)
    h2 = u2 * h1 + (1. - u2) * h2
    h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
    return h2, ctx_, alpha.T
seqs = [context_mask, mask, state_below_, state_belowx]
shared_vars = [tparams['Wc_att'],
tparams['b_att'],
tparams['U'],
tparams['Wc'],
tparams['W_comb_att'],
tparams['U_att'],
tparams['c_tt'],
tparams['Ux'],
tparams['Wcx'],
tparams['U_nl'],
tparams['Ux_nl'],
tparams['b_nl'],
tparams['bx_nl']]
rval, updates = theano.scan(_step_slice,
sequences=seqs,
outputs_info=[init_state,
tensor.alloc(0., n_samples,
cc_.shape[2]),
tensor.alloc(0., n_samples,
cc_.shape[0])],
non_sequences=[cc_,context_mask1]+shared_vars,
name='layers',
n_steps=nsteps_trg,
profile=profile,
strict=True)
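# rval stacks the per-timestep outputs of _step_slice: hidden states, context
# vectors, and attention weights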
proj_h = rval[0]
ctxs = rval[1]
# 3 feedforward neural networks,used to indicate the contribution of proj_h, emb, ctxs to cost
logit_lstm = tensor.dot(proj_h,tparams['ff_logit_lstm_W'])+\
tparams['ff_logit_lstm_b']
logit_prev = tensor.dot(emb,tparams['ff_logit_prev_W'])+\
tparams['ff_logit_prev_b']
logit_ctx =tensor.dot(ctxs,tparams['ff_logit_ctx_W'])+\
tparams['ff_logit_ctx_b']
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
# a feedforward neural network layer + softmax
logit = tensor.dot(logit,tparams['ff_logit_W'])+tparams['ff_logit_b']
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1],
logit_shp[2]]))
# cost
cost = -tensor.log(probs.flatten())
cost = cost.reshape([mask.shape[0], mask.shape[1]])
cost = (cost * mask).sum(0)
# make mean(), tensor->scalar
cost = cost.mean()
print('Computing gradient...', end=" ")
grads = tensor.grad(cost, wrt=itemlist(tparams),disconnected_inputs = 'ignore')
print('Done')
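For what it's worth, one direction I would try (a sketch under the assumption that no gradient needs to flow through the integer window indices, not a verified fix): the float expression scaleX depends on the integer tensor indicesAll, which is produced by boolean-mask indexing, so tensor.grad traverses AdvancedBooleanSubtensor and asks it for a gradient with respect to its int64 input. Explicitly disconnecting the index tensor with theano.gradient.disconnected_grad should stop that traversal while leaving the differentiable path through pstate_ intact:

# hypothetical change inside _step_slice: treat the integer window indices as
# constants for the purposes of differentiation
from theano import gradient

indicesAll_ng = gradient.disconnected_grad(indicesAll)  # no grad through indices
scaleX = (indicesAll_ng - tensor.tile(pstate_[unmaskedId],
                                      (numPositions, 1)).flatten()) / (D / 2)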