Python DQN月球着陆器问题培训,无法学习。未能收敛
嗨,我一直在尝试训练一个DQN代理来解决月球着陆器离散问题,但它无法学习,即使在1000集之后,平均回报也徘徊在-130左右。我也使用了目标网络。请任何人告诉我我做错了什么。。下面是我的代码。(而且代码没有在colab和kaggle的gpu上运行。这就是为什么我必须等待太久才能看到更改的效果…)请帮助我解决问题Python DQN月球着陆器问题培训,无法学习。未能收敛,python,python-3.x,tensorflow,reinforcement-learning,dqn,Python,Python 3.x,Tensorflow,Reinforcement Learning,Dqn,嗨,我一直在尝试训练一个DQN代理来解决月球着陆器离散问题,但它无法学习,即使在1000集之后,平均回报也徘徊在-130左右。我也使用了目标网络。请任何人告诉我我做错了什么。。下面是我的代码。(而且代码没有在colab和kaggle的gpu上运行。这就是为什么我必须等待太久才能看到更改的效果…)请帮助我解决问题 import gym import numpy as np import tensorflow as tf from tensorflow.keras.layers import De
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense , Conv2D , MaxPooling2D , Activation , Dropout, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from collections import deque
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random
import time
env = gym.make("LunarLander-v2")
EPISODES = 3000
DISCOUNT = 0.99
LEARNING_RATE = 0.001
REPLAY_MEMORY_SIZE = 100_000
epsilon = 1
EPSILON_DECAY = 0.996
MIN_MEMORY_SIZE = 50_000
BATCH_SIZE = 128
UPDATE_TARGET_EVERY = 10
SHOW_EVERY = 50
MIN_EPSILON = 0.001
SAVE_EVERY = 100
CHCKPNT = 100
AGGREGATE_STATS_EVERY = 50
MIN_REWARD = -200
MODEL_NAME = "LunarModule"
# MEMORY_FRACTION = 0.2
class Lander:
def __init__(self,):
self.model = self.create_model()
self.target_model = self.create_model()
self.target_model.set_weights(self.model.get_weights())
self.replay_memory = deque(maxlen = REPLAY_MEMORY_SIZE )
self.target_update_counter = 0
def create_model(self):
model = Sequential()
model.add(Dense(128,activation="relu" , input_shape = (8,)))
model.add(Dense(128, activation = "relu"))
model.add(Dense(env.action_space.n , activation = "linear"))
model.compile(loss = tf.keras.losses.Huber() , optimizer = Adam(lr = LEARNING_RATE) , metrics = ["accuracy"])
return model
def update_replay_memory(self,transition):
self.replay_memory.append(transition)
def train(self,terminal_state):
if len(self.replay_memory) < MIN_MEMORY_SIZE:
return
minibatch = random.sample(self.replay_memory, BATCH_SIZE)
current_states = np.array([transition[0] for transition in minibatch])
current_qs_list = self.model.predict(current_states)
new_current_states = np.array([transition[3] for transition in minibatch])
new_qs_list = self.target_model.predict(new_current_states)
X = []
y = []
for index , (current_state,action, reward, new_current_state,done ) in enumerate(minibatch):
if not done:
max_future_q = np.max(new_qs_list[index])
new_q = reward + DISCOUNT*max_future_q
else:
new_q = reward
current_qs = current_qs_list[index]
current_qs[action] = new_q
X.append(current_state)
y.append(current_qs)
self.model.fit(np.array(X), np.array(y),batch_size = BATCH_SIZE, verbose = 0 ,shuffle=False )
# Update target network counter every episode
if terminal_state:
self.target_update_counter += 1
if self.target_update_counter > UPDATE_TARGET_EVERY:
self.target_model.set_weights(self.model.get_weights())
self.target_update_counter = 0
# Queries main network for Q values given current observation space (environment state)
def get_qs(self,state):
self.model.predict(np.array(state).reshape(-1, *state.shape))[0]
def save_model(self,ep):
self.model.save(f'./saved_model/agent_{ep}')
def save_checkpoint(self,ckpt):
self.model.save_weights(f"""./checkpoints/my_checkpoint_{ckpt}""")
agent = Lander()
ep_rewards = [-200]
for episode in tqdm(range(1,EPISODES+1) ):
episode_reward = 0
current_state = env.reset()
done = False
step = 0
while not done:
if episode % SHOW_EVERY == 0:
render = True
else:
render = False
if np.random.random() > epsilon:
action = np.argmax(agent.get_qs(current_state))
else:
action = np.random.randint(0, env.action_space.n)
new_state, reward, done, _ = env.step(action)
# if render:
# env.render()
episode_reward += reward
agent.update_replay_memory((current_state,action,reward,new_state,done))
agent.train(done)
current_state = new_state
step += 1
ep_rewards.append(episode_reward)
if not episode % AGGREGATE_STATS_EVERY or episode == 1:
average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
print(f"episode:{episode}, average reward:{average_reward}, min reward:{min_reward},max reward: {max_reward}")
# Save model, but only when min reward is greater or equal a set value
if min_reward >= MIN_REWARD:
agent.model.save(f'/content/drive/MyDrive/Models{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
if epsilon > MIN_EPSILON:
epsilon *= EPSILON_DECAY
epsilon = max(MIN_EPSILON, epsilon)
导入健身房
将numpy作为np导入
导入tensorflow作为tf
从tensorflow.keras.layers导入稠密、Conv2D、MaxPoolig2D、激活、退出、展平
从tensorflow.keras.models导入顺序
从tensorflow.keras.optimizers导入Adam
从集合导入deque
从tqdm.notebook导入tqdm
将matplotlib.pyplot作为plt导入
随机输入
导入时间
env=健身房品牌(“LunarLander-v2”)
剧集=3000
折扣=0.99
学习率=0.001
重播\u内存\u大小=100\u 000
ε=1
εu衰减=0.996
最小内存大小=50\u 000
批量大小=128
每10次更新一次目标
每显示一次=50
最小ε=0.001
每保存一次=100
CHCKPNT=100
聚合统计数据每=50
最低奖励=-200
MODEL_NAME=“LunarModule”
#记忆分数=0.2
类着陆器:
定义初始值(self,):
self.model=self.create_model()
self.target\u model=self.create\u model()
self.target\u model.set\u权重(self.model.get\u权重())
self.replay\u memory=deque(maxlen=replay\u memory\u SIZE)
self.target\u update\u计数器=0
def创建_模型(自):
模型=顺序()
添加(密集型(128,activation=“relu”,input_shape=(8,))
添加(密集(128,activation=“relu”))
model.add(密集(env.action\u space.n,activation=“linear”))
model.compile(loss=tf.keras.loss.Huber(),optimizer=Adam(lr=LEARNING_RATE),metrics=[“accurity”])
回归模型
def更新_回放_内存(自我、转换):
self.replay\u memory.append(转换)
def序列(自身、终端状态):
如果len(self.replay\u memory)update\u target\u EVERY:
self.target\u model.set\u权重(self.model.get\u权重())
self.target\u update\u计数器=0
#在给定当前观测空间(环境状态)的情况下,查询主网络的Q值
def get_qs(自身、状态):
self.model.predict(np.array(state.reformate(-1,*state.shape))[0]
def保存_型号(自我、ep):
self.model.save(f./saved_model/agent_{ep})
def保存检查点(自身、ckpt):
self.model.save_权重(f”“”/checkpoints/my_checkpoint{ckpt}”“)
代理=着陆器()
ep_奖励=[-200]
对于TQM中的情节(范围(1,情节+1)):
第二集奖励=0
当前状态=环境重置()
完成=错误
步长=0
虽然没有这样做:
如果事件%SHOW_EVERY==0:
render=True
其他:
render=False
如果np.random.random()>ε:
action=np.argmax(agent.get_qs(当前状态))
其他:
action=np.random.randint(0,env.action\u space.n)
新状态,奖励,完成,环境步骤(操作)
#如果渲染:
#env.render()
第二集奖励+=奖励
代理。更新\u回放\u内存((当前\u状态、操作、奖励、新\u状态、完成))
代理列车(完成)
当前状态=新状态
步骤+=1
ep_奖励。附加(插曲_奖励)
如果不是每集%AGGREGATE\u STATS\u或每集==1:
平均奖励=总和(ep奖励[-AGGREGATE\u STATS\u EVERY:])/len(ep奖励[-AGGREGATE\u STATS\u EVERY:]))
min_奖励=min(ep_奖励[-聚合_统计值每周一次])
最大奖励=最大(ep奖励[-聚合统计数据每周一次])
打印(f“情节:{情节},平均奖励:{平均奖励},最小奖励:{最小奖励},最大奖励:{最大奖励}”)
#保存模型,但仅当最小奖励大于或等于设定值时
如果最小奖励>=最小奖励:
agent.model.save(f'/content/drive/MyDrive/Models{model_NAME}{max_奖励:{7.2f}max_奖励:{average_奖励:{7.2f}avg_奖励:{min_奖励:{7.2f}min_奖励:{int(time.time())}.model')
如果ε>最小ε:
ε*=ε衰变
ε=最大值(最小ε,ε)