Tensorflow 如果我不想分批训练，而我的状态是一个向量，我的张量形状应该是什么？_Tensorflow

Tensorflow 如果我不想分批训练，而我的状态是一个向量，我的张量形状应该是什么？

tensorflow

Tensorflow 如果我不想分批训练，而我的状态是一个向量，我的张量形状应该是什么？,tensorflow,Tensorflow,我试图用tensorflow来解决强化学习问题。我创建了一个自己的gym环境。状态是一个一维数组（大小224），有170个动作可供选择（0…169）。我不想分批训练。我想做的是用tensorflow运行RL问题的最简单版本。我的主要问题，我猜是出在维度（形状）上。我假设TF允许我将状态输入为1D张量。但是当我想计算W*input=action时，我得到了一个错误。维度错误的信息让人很难判断正确的形状是什么。此外，web上的示例侧重于基于图像的批量训练。一般来说，我是从一个教程开始的，但其中状态的编码方式不同，这又使我很难理解

我试图用tensorflow来解决强化学习问题。我创建了一个自己的

gym（OpenAI Gym）

环境。状态是一个一维数组（大小224），有170个动作可供选择（0…169）。我不想分批训练。我想做的是用tensorflow运行RL问题的最简单版本。

我的主要问题，我猜是出在维度（形状）上。我假设TF允许我将状态输入为1D张量。但是当我想计算W*input=action时，我得到了一个错误。维度错误的信息让人很难判断正确的形状是什么。此外，web上的示例侧重于基于图像的批量训练。

一般来说，我是从一个教程开始的，但其中状态的编码方式不同，这又使我很难理解（特别是因为我对python不是很熟悉）。

import gym
import numpy as np
import random
import tensorflow as tf
env = gym.make('MyOwnEnv-v0')
n_state = 224
n_action = 170
sess = tf.InteractiveSession()
# 实现网络本身
inputs1 = tf.placeholder(shape=[1,n_state],dtype=tf.float32)
W = tf.Variable(tf.random_uniform([n_state,n_action],0,0.01))
Qout = tf.transpose(tf.matmul(inputs1,W))
predict = tf.reshape(tf.argmax(Qout,1), [n_action,1])
# 下面，我们通过计算目标值和预测Q值之间的平方差之和来获得损失。
nextQ = tf.placeholder(shape=[n_action,1],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)
# 训练网络
init = tf.global_variables_initializer()
print("input:  ", inputs1.get_shape()
      , "\nW:      ", W.get_shape()
      , "\nQout:   ", Qout.get_shape()
      , "\npredict:", predict.get_shape()
      , "\nnextQ:  ", nextQ.get_shape()
      , "\nloss:   ", loss.get_shape())
# 设置学习参数
y = .99
e = 0.1
num_episodes = 2000
# 创建包含每集总奖励和步数的列表
jList = []
rList = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        # 重置环境并获得第一个新的观察结果
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        # Q网络
        while j < 99:
            j+=1
            # 从Q-网络中贪婪地选择一个动作（随机动作的概率为e）
            a,allQ = sess.run([predict,Qout],feed_dict={inputs1:s})
            if np.random.rand(1) < e:

也许需要用 tf.expand_dims。请把错误信息粘贴到这里。

import gym
import numpy as np
import random
import tensorflow as tf


# Single-layer Q-network trained online (one transition at a time, no
# mini-batches) with the TensorFlow 1.x graph API on a custom Gym environment.
#
# Shape convention used throughout: a single state is fed as a one-row matrix
# [1, n_state], so Q-values come out as one row [1, n_action]. The leading
# dimension of 1 is what lets tf.matmul work with a lone state vector.
env = gym.make('MyOwnEnv-v0')
n_state  = 224
n_action = 170

# NOTE: no tf.InteractiveSession() is created here. The training loop below
# opens its own tf.Session in a `with` block; an extra interactive session
# would never be closed and would only be shadowed by that `sess`.

# Implementing the network itself
inputs1 = tf.placeholder(shape=[1, n_state], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([n_state, n_action], 0, 0.01))
# [1, n_state] x [n_state, n_action] -> [1, n_action]: one Q-value per action.
Qout = tf.matmul(inputs1, W)
# argmax over the action axis (axis 1) -> shape [1], the greedy action index.
predict = tf.argmax(Qout, 1)

# Below we obtain the loss by taking the sum of squares difference between
# the target and prediction Q values. nextQ has the same shape as Qout.
nextQ = tf.placeholder(shape=[1, n_action], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)


# Training the network
init = tf.global_variables_initializer()

print("input:  ", inputs1.get_shape()
      , "\nW:      ", W.get_shape()
      , "\nQout:   ", Qout.get_shape()
      , "\npredict:", predict.get_shape()
      , "\nnextQ:  ", nextQ.get_shape()
      , "\nloss:   ", loss.get_shape())


# Set learning parameters
y = .99   # discount factor applied to the bootstrapped max Q' value
e = 0.1   # probability of taking a random (exploratory) action
num_episodes = 2000
# Create lists to contain total rewards and steps per episode
jList = []
rList = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        # Reset environment and get first new observation. The environment
        # returns a flat vector of length n_state; the placeholder expects a
        # single row, so reshape to [1, n_state] before feeding it.
        s = np.reshape(env.reset(), [1, n_state])
        rAll = 0
        d = False
        j = 0
        # The Q-Network
        while j < 99:
            j += 1
            # Choose an action greedily (with e chance of random action)
            # from the Q-network.
            a, allQ = sess.run([predict, Qout], feed_dict={inputs1: s})
            # predict has shape [1]; unwrap it to a plain int so it can be
            # passed to env.step and used as an index.
            a = int(a[0])
            if np.random.rand(1) < e:
                a = env.action_space.sample()
            # Get new state and reward from environment
            s1, r, d, _ = env.step(a)
            s1 = np.reshape(s1, [1, n_state])
            # Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout, feed_dict={inputs1: s1})
            # Obtain maxQ' and set our target value for chosen action. Only
            # the entry of the action actually taken is changed, so the other
            # actions contribute zero error to the loss.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, a] = r + y*maxQ1
            # Train our network using target and predicted Q values
            _, W1 = sess.run([updateModel, W], feed_dict={inputs1: s, nextQ: targetQ})
            rAll += r
            s = s1
            if d == True:
                # Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break
        jList.append(j)
        rList.append(rAll)
print('Percent of succesful episodes: ' + str(sum(rList)/num_episodes) + '%')