Python 我无法理解(例如)deepMinds breakout机器人在游戏时是如何学习的
我想在openAIs健身房解决cartpole问题。我的方法是深度Q学习 如果我首先生成一组训练数据,我就能得到一个好分数。然后就这些数据训练我的网络。然后我再次运行我的代理,这次是100%利用率 不同之处在于我的模型没有逐步学习。我必须首先生成训练数据,训练,然后它学习,所以它看起来像:Python 我无法理解(例如)deepMinds breakout机器人在游戏时是如何学习的,python,tensorflow,deep-learning,openai-gym,Python,Tensorflow,Deep Learning,Openai Gym,我想在openAIs健身房解决cartpole问题。我的方法是深度Q学习 如果我首先生成一组训练数据,我就能得到一个好分数。然后就这些数据训练我的网络。然后我再次运行我的代理,这次是100%利用率 不同之处在于我的模型没有逐步学习。我必须首先生成训练数据,训练,然后它学习,所以它看起来像: 生成训练数据->训练->播放 我想做的是: play->train->play better等 按照这种方法,使用完全相同的神经网络和超参数,我的代理没有改进 我的猜测是,我遗漏了一些非常基本的东西。所以我要
生成训练数据->训练->播放
我想做的是:
play->train->play better
等
按照这种方法,使用完全相同的神经网络和超参数,我的代理没有改进
我的猜测是,我遗漏了一些非常基本的东西。所以我要找的是我的逻辑哪里有错
如果你想试试我的全部代码
import gym
from gym import spaces
import numpy as np
import random
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Activation, Dense
# Deep Q-learning Model
class Model:
def __init__(self, environment):
self.env = gym.make(environment)
self.amount_obs, self.amount_actions = self.environment()
# Amount of episodes and steps per episode
self.episodes = 1000
self.gamma = 0.95 # Discount future rate
self.alpha = 0.001 # Learning rate
# Epsilon = explore/exploite factor
self.epsilon = 1.0 # 100% explore initally
self.epsilon_min = 0.01
self.epsilon_decay = 0.995
self.batch_size = 50
self.model = self.create_net()
def create_net(self):
# Neural-net
model = Sequential()
model.add(Dense(24, input_dim=self.amount_obs, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(self.amount_actions, activation='linear'))
model.compile(loss='mse', optimizer=Adam(lr=self.alpha))
return model
def environment(self):
observation_space = len(self.env.observation_space.sample())
# Space.n gets number of actions
action_space = self.env.action_space.n
return observation_space, action_space
def act(self, state, always_exploit=False):
# Explore or exploite
if random.random() <= self.epsilon and not always_exploit:
# Random action (explore)
return random.randrange(self.amount_actions)
# Predict an action (exploite)
possible_actions = self.model.predict(state)
return np.argmax(possible_actions[0])
def train(self, episode):
for steps in episode:
state, action, reward, next_state, done = steps
if done:
target = reward
else:
target = reward + self.gamma * \
np.amax(self.model.predict(next_state)[0])
target_state = self.model.predict(state)
target_state[0][action] = target
self.model.fit(state, target_state, epochs=2, verbose=0)
def play(self):
for i in range(self.episodes):
state = self.env.reset()
state = np.reshape(state, [1, self.amount_obs])
episode = []
episode_reward = 0
done = False
while not done:
#self.env.render()
# Decide action
action = self.act(state)
next_state, reward, done, _ = self.env.step(action)
next_state = np.reshape(next_state,
[1, self.amount_obs])
# save the previous state, action, reward, and done
episode.append(
(state, action, reward, next_state, done))
state = next_state
episode_reward = episode_reward + reward
print("Episode {} - totals {}".format(i, episode_reward))
# Train the episode
self.train(episode)
# Decrease epsilon
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
# initialize gym environment and the agent
model = Model('CartPole-v0')
#model = Model('MountainCar-v0')
#model = Model('Acrobot-v1')
model.play()
导入健身房
从健身房导入空间
将numpy作为np导入
随机输入
从keras.optimizers导入Adam
从keras.models导入顺序
从keras.layers导入激活,密集
#深度Q学习模型
类别模型:
定义初始化(自身、环境):
self.env=gym.make(环境)
self.amount\u obs,self.amount\u actions=self.environment()
#每集的集数和步数
self.com=1000
self.gamma=0.95#贴现未来利率
self.alpha=0.001#学习率
#ε=探索/开发因子
self.epsilon=1.0#100%初始探索
self.epsilon\u min=0.01
自ε衰变=0.995
self.batch_size=50
self.model=self.create_net()
def创建_网络(自):
#神经网络
模型=顺序()
model.add(密集型(24,input_dim=self.amount_obs,activation='relu'))
model.add(密集(24,activation='relu'))
model.add(密集(24,activation='relu'))
model.add(密集(self.amount\u actions,activation='linear'))
compile(loss='mse',optimizer=Adam(lr=self.alpha))
回归模型
def环境(自我):
observation\u space=len(self.env.observation\u space.sample())
#Space.n获取操作数
action\u space=self.env.action\u space.n
返回观察空间,动作空间
def act(自我、状态、始终利用=错误):
#探索或开发
如果random.random()self.epsilon\u min:
self.epsilon*=self.epsilon\u衰变
#初始化gym环境和代理
模型=模型(“CartPole-v0”)
#模型=模型('MountainCar-v0')
#模型=模型(“Acrobot-v1”)
model.play()
I在每集之后根据前几集随机选择的数据进行训练I在每集之后根据前几集随机选择的数据进行训练