Proximal Policy Optimization TensorFlow Pendulum problem (Python 3.x)


I have implemented Proximal Policy Optimization (PPO) in TensorFlow on the Pendulum-v0 environment.

  • Please tell me if there is anything wrong with the code below and why my agent is not behaving as expected (the PPO objective it is meant to implement is sketched in the note after the code):
    import gym  
    import numpy as np  
    import tensorflow as tf
    
    
    class Memory(object):  
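        # On-policy rollout buffer: stores one trajectory segment until learn() consumes and clears it.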
        
        def __init__(self):
            self.ep_obs, self.ep_act, self.ep_rwd, self.ep_neglogp = [], [], [], []
    
        def store_transition(self, obs0, act, rwd, neglogp):
            self.ep_obs.append(obs0)
            self.ep_act.append(act)
            self.ep_rwd.append(rwd)
            self.ep_neglogp.append(neglogp)
    
        def convert_to_array(self):
            array_obs = np.vstack(self.ep_obs)
            array_act = np.vstack(self.ep_act)
            array_rwd = np.array(self.ep_rwd)
            array_neglogp = np.array(self.ep_neglogp).squeeze(axis=1)
            return array_obs, array_act, array_rwd, array_neglogp
    
        def reset(self):
            self.ep_obs, self.ep_act, self.ep_rwd, self.ep_neglogp = [], [], [], []
    
    
    class ActorNetwork(object):
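        # Gaussian policy: one hidden layer feeding mu (scaled by 2 to match Pendulum's torque range) and sigma via softplus.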
        
        def __init__(self, act_dim, name):
            self.act_dim = act_dim
            self.name = name
    
        def step(self, obs, reuse):
            with tf.variable_scope(self.name, reuse=reuse):
                h1 = tf.layers.dense(obs, 100, activation=tf.nn.relu)
                mu = 2 * tf.layers.dense(h1, self.act_dim, activation=tf.nn.tanh)
                sigma = tf.layers.dense(h1, self.act_dim, activation=tf.nn.softplus)
                pd = tf.distributions.Normal(loc=mu, scale=sigma)
            return pd
    
        def choose_action(self, obs, reuse=False):
            pd = self.step(obs, reuse)
            action = tf.squeeze(pd.sample(1), axis=0)
            action = tf.clip_by_value(action, -2, 2)
            return action
    
        def get_neglogp(self, obs, act, reuse=False):
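            # Note: pd.log_prob(act) is the log-probability, not its negative; it is used consistently on both sides of the PPO ratio.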
            pd = self.step(obs, reuse)
            logp = pd.log_prob(act)
            return logp
    
    
    class ValueNetwork(object):
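        # Critic: one hidden layer followed by a scalar state-value output V(s).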
        
        def __init__(self, name):
            self.name = name
    
        def step(self, obs, reuse):
            with tf.variable_scope(self.name, reuse=reuse):
                h1 = tf.layers.dense(inputs=obs, units=100, activation=tf.nn.relu)
                value = tf.layers.dense(inputs=h1, units=1)
                return value
    
        def get_value(self, obs, reuse=False):
            value = self.step(obs, reuse)
            return value
    
    
    class PPO(object):
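        # Builds the graph for the clipped-surrogate actor loss and the squared-error critic loss, and owns the TF session.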
        
        def __init__(self, act_dim, obs_dim, lr_actor, lr_value, gamma, clip_range):
            self.act_dim = act_dim
            self.obs_dim = obs_dim
            self.lr_actor = lr_actor
            self.lr_value = lr_value
            self.gamma = gamma
            self.clip_range = clip_range
    
            self.OBS = tf.placeholder(tf.float32, [None, self.obs_dim], name="observation")
            self.ACT = tf.placeholder(tf.float32, [None, self.act_dim], name="action")
            self.Q_VAL = tf.placeholder(tf.float32, [None, 1], name="q_value")
            self.ADV = tf.placeholder(tf.float32, [None, 1], name="advantage")
            self.NEGLOGP = tf.placeholder(tf.float32, [None, 1], name="old_neglogp")
    
            actor = ActorNetwork(self.act_dim, 'actor')
            value = ValueNetwork('critic')
            self.memory = Memory()
    
            self.action = actor.choose_action(self.OBS)
            self.neglogp = actor.get_neglogp(self.OBS, self.ACT, reuse=True)
            # Probability ratio pi_new(a|s) / pi_old(a|s); both tensors hold log-probabilities despite the "neglogp" names.
            ratio = tf.math.exp(self.neglogp - self.NEGLOGP)
            clip_ratio = tf.clip_by_value(ratio, 1. - self.clip_range, 1. + self.clip_range)
            # Clipped surrogate objective (negated, since the optimizer minimizes).
            actor_loss = -tf.reduce_mean(tf.minimum(ratio * self.ADV, clip_ratio * self.ADV))
            self.actor_train_op = tf.train.AdamOptimizer(self.lr_actor).minimize(actor_loss)
    
            self.value = value.get_value(self.OBS)
            self.advantage = self.Q_VAL - self.value
            value_loss = tf.reduce_mean(tf.square(self.advantage))
            self.value_train_op = tf.train.AdamOptimizer(self.lr_value).minimize(value_loss)
    
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
    
        def step(self, obs):
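            # Sample an action, its log-prob under the current policy, and the critic's value for a single observation.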
            if obs.ndim < 2: obs = obs[np.newaxis, :]
            action = self.sess.run(self.action, feed_dict={self.OBS: obs})
            action = np.squeeze(action, 1).clip(-2, 2)
    
            neglogp = self.sess.run(self.neglogp, feed_dict={self.OBS: obs, self.ACT: action[np.newaxis, :]})
    
            value = self.sess.run(self.value, feed_dict={self.OBS: obs})
            value = np.squeeze(value, 1).squeeze(0)
            return action, neglogp, value
    
        def learn(self, last_value, done):
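            # Normalize rewards, compute TD targets, then run several update epochs for the critic and the actor on the stored segment.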
            obs, act, rwd, neglogp = self.memory.convert_to_array()
            rwd = (rwd - rwd.mean()) / (rwd.std() + 1e-5)
            q_value = self.compute_q_value(last_value, obs, rwd)
    
            adv = self.sess.run(self.advantage, {self.OBS: obs, self.Q_VAL: q_value})
    
            # Several epochs of gradient steps on the same batch: critic first, then actor.
            for _ in range(10):
                self.sess.run(self.value_train_op,
                              {self.OBS: obs, self.Q_VAL: q_value})
            for _ in range(10):
                self.sess.run(self.actor_train_op,
                              {self.OBS: obs, self.ACT: act, self.ADV: adv, self.NEGLOGP: neglogp})
    
            self.memory.reset()
    
        def compute_q_value(self, last_value, obs, rwd):
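            # One-step TD targets: q[t] = rwd[t] + gamma * V(obs[t+1]); the last entry stays zero and last_value is not used here.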
            q_value = np.zeros_like(rwd)
            value = self.sess.run(self.value, feed_dict={self.OBS: obs})
            for t in reversed(range(0, len(rwd)-1)):
                q_value[t] = value[t+1] * self.gamma + rwd[t]
            return q_value[:, np.newaxis]
    
    
    env = gym.make('Pendulum-v0')
    env.seed(1)
    env = env.unwrapped
    
    agent = PPO(act_dim=env.action_space.shape[0], obs_dim=env.observation_space.shape[0],
                lr_actor=0.0004, lr_value=0.0003, gamma=0.9, clip_range=0.2)
    
    nepisode = 1000
    nstep = 200
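    # Roll out 200-step episodes, updating the agent every 32 steps and again at the end of each episode.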
    
    for i_episode in range(nepisode):
        obs0 = env.reset()
        ep_rwd = 0
    
        for t in range(nstep):
            act, neglogp, _ = agent.step(obs0)
            obs1, rwd, done, _ = env.step(act)
    
            agent.memory.store_transition(obs0, act, rwd, neglogp)
    
            obs0 = obs1
            ep_rwd += rwd
    
            if (t + 1) % 32 == 0 or t == nstep - 1:
                _, _, last_value = agent.step(obs1)
                agent.learn(last_value, done)
    
        print('Ep: %i' % i_episode, "|Ep_r: %i" % ep_rwd)
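
For reference, the objective this code is meant to implement is the PPO clipped surrogate, L = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)] with r_t = pi_new(a_t|s_t) / pi_old(a_t|s_t). Below is a minimal NumPy sketch of that computation on toy values, just to make the intended math explicit; the array names and numbers here are illustrative and not taken from the code above.

    import numpy as np

    clip_range = 0.2                             # same epsilon as the agent above
    logp_new = np.array([-1.10, -0.95, -1.40])   # log-probs under the current policy (toy values)
    logp_old = np.array([-1.00, -1.00, -1.00])   # log-probs recorded when the actions were sampled
    adv      = np.array([ 0.50, -0.30,  1.20])   # advantage estimates (toy values)

    ratio = np.exp(logp_new - logp_old)                           # pi_new / pi_old
    clipped = np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range)  # clip(r, 1 - eps, 1 + eps)
    actor_loss = -np.mean(np.minimum(ratio * adv, clipped * adv)) # negated clipped surrogate
    print(actor_loss)

In the graph above the same quantity is built from self.neglogp, self.NEGLOGP and self.ADV.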