
Python DQN Lunar Lander training problem: agent fails to learn and does not converge

Tags: python, python-3.x, tensorflow, reinforcement-learning, dqn

Hi, I have been trying to train a DQN agent to solve the LunarLander discrete problem, but it fails to learn: even after 1000 episodes the average return hovers around -130. I am also using a target network. Can anyone tell me what I am doing wrong? My code is below. (Also, the code does not run on the GPU on Colab or Kaggle, which is why I have to wait so long to see the effect of any change.) Please help me fix this.
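
For reference, the per-sample target that the train() method below is meant to compute is the standard one-step DQN target. A minimal sketch of that update, with illustrative names that are not part of the original code:

import numpy as np

def dqn_target(reward, done, next_state_qs, discount=0.99):
    # Terminal transitions keep only the immediate reward; otherwise
    # bootstrap from the target network's best Q value for the next state.
    return reward if done else reward + discount * np.max(next_state_qs)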

import gym
import numpy as np 
import tensorflow as tf
from tensorflow.keras.layers import Dense , Conv2D , MaxPooling2D , Activation , Dropout, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from collections import deque
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt 
import random
import time


env = gym.make("LunarLander-v2")

EPISODES = 3000
DISCOUNT = 0.99
LEARNING_RATE = 0.001
REPLAY_MEMORY_SIZE = 100_000
epsilon = 1
EPSILON_DECAY = 0.996
MIN_MEMORY_SIZE = 50_000
BATCH_SIZE = 128
UPDATE_TARGET_EVERY = 10
SHOW_EVERY = 50
MIN_EPSILON = 0.001
SAVE_EVERY = 100
CHCKPNT = 100
AGGREGATE_STATS_EVERY = 50
MIN_REWARD = -200
MODEL_NAME = "LunarModule"
# MEMORY_FRACTION = 0.2
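# With EPSILON_DECAY = 0.996 and MIN_EPSILON = 0.001, epsilon only reaches its
# floor after roughly log(0.001) / log(0.996) ≈ 1,700 episodes of decay.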



class Lander:

    def __init__(self):

        self.model = self.create_model()

        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen = REPLAY_MEMORY_SIZE )

        self.target_update_counter = 0

    
    def create_model(self):

        model = Sequential()

        model.add(Dense(128,activation="relu" , input_shape = (8,)))
        model.add(Dense(128, activation = "relu"))
        model.add(Dense(env.action_space.n , activation = "linear"))

        model.compile(loss = tf.keras.losses.Huber() , optimizer = Adam(learning_rate = LEARNING_RATE) , metrics = ["accuracy"])
        return model

    def update_replay_memory(self,transition):
        self.replay_memory.append(transition)


    def train(self,terminal_state):

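        # Do not start learning until the replay buffer holds at least
        # MIN_MEMORY_SIZE transitions.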
        if len(self.replay_memory) < MIN_MEMORY_SIZE:
            return 

        minibatch = random.sample(self.replay_memory, BATCH_SIZE)

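        # Batch-predict Q values: the online network for the sampled current
        # states, the target network for the corresponding next states.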
        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states)

        new_current_states = np.array([transition[3] for transition in minibatch])
        new_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        for index , (current_state,action, reward, new_current_state,done ) in enumerate(minibatch):

            if not done:
                max_future_q = np.max(new_qs_list[index])
                new_q = reward + DISCOUNT*max_future_q

            else:
                new_q = reward

            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            X.append(current_state)
            y.append(current_qs)

        self.model.fit(np.array(X), np.array(y),batch_size = BATCH_SIZE, verbose = 0 ,shuffle=False )

        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1
        
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            
            self.target_update_counter = 0

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self,state):
        return self.model.predict(np.array(state).reshape(-1, *state.shape))[0]
    
    def save_model(self,ep):
        self.model.save(f'./saved_model/agent_{ep}')
        
    def save_checkpoint(self,ckpt):
        self.model.save_weights(f"""./checkpoints/my_checkpoint_{ckpt}""")


agent = Lander()
ep_rewards = [-200]

for episode in tqdm(range(1,EPISODES+1) ):

    episode_reward = 0
    
    current_state = env.reset()

    done = False

    step = 0
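    # Roll out one episode, training the online network after every environment step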
    while not done:
        if episode % SHOW_EVERY == 0:
            render = True
        else:
            render = False
        if np.random.random() > epsilon:

            action = np.argmax(agent.get_qs(current_state))

        else:
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done, _ = env.step(action)
#         if render:
#             env.render()
        episode_reward += reward

        agent.update_replay_memory((current_state,action,reward,new_state,done))
        agent.train(done)

        current_state = new_state
        step += 1
    ep_rewards.append(episode_reward)

    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        print(f"episode:{episode}, average reward:{average_reward}, min reward:{min_reward},max reward: {max_reward}")
        # Save model, but only when min reward is greater or equal a set value
        if min_reward >= MIN_REWARD:
            agent.model.save(f'/content/drive/MyDrive/Models{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')


    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)
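
A minimal sketch, assuming the agent and env objects defined above, of a quick greedy (epsilon = 0) check to spot-check progress without waiting for the full training run:

for _ in range(5):
    state = env.reset()
    done, total_reward = False, 0.0
    while not done:
        # Always act greedily with respect to the current online network
        action = int(np.argmax(agent.get_qs(state)))
        state, reward, done, _ = env.step(action)
        total_reward += reward
    print(f"greedy episode reward: {total_reward:.1f}")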
