Python: I can't understand how (for example) DeepMind's Breakout bot learns while it plays

Tags: python, tensorflow, deep-learning, openai-gym

I want to solve the CartPole problem in OpenAI's Gym. My approach is deep Q-learning.

If I first generate a set of training data, train my network on that data, and then run my agent again with 100% exploitation, I get a good score.

The problem is that my model does not learn incrementally. I first have to generate the training data, then train, and only then does it learn, so it looks like:

generate training data -> train -> play

What I want to do is:

play->train->play better

With that approach, using exactly the same neural network and hyperparameters, my agent does not improve.

My guess is that I am missing something very basic, so what I'm looking for is where my logic goes wrong.

Here is my full code if you want to try it:

import gym
from gym import spaces
import numpy as np
import random
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Activation, Dense

# Deep Q-learning Model
class Model:
    def __init__(self, environment):
        self.env = gym.make(environment)
        self.amount_obs, self.amount_actions = self.environment()

        # Number of episodes to run
        self.episodes = 1000

        self.gamma = 0.95  # Discount factor for future rewards
        self.alpha = 0.001 # Learning rate

        # Epsilon = explore/exploit factor
        self.epsilon       = 1.0    # 100% explore initially
        self.epsilon_min   = 0.01
        self.epsilon_decay = 0.995

        self.batch_size = 50
        self.model      = self.create_net()

    def create_net(self):
        # Neural-net
        model = Sequential()
        model.add(Dense(24, input_dim=self.amount_obs, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.amount_actions, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.alpha))
        return model

    def environment(self):
        observation_space = len(self.env.observation_space.sample())
        # Space.n gets number of actions
        action_space = self.env.action_space.n
        return observation_space, action_space

    def act(self, state, always_exploit=False):
        # Explore or exploit
        if random.random() <= self.epsilon and not always_exploit:
            # Random action (explore)
            return random.randrange(self.amount_actions)
        # Predict an action (exploit)
        possible_actions = self.model.predict(state)
        return np.argmax(possible_actions[0])

    def train(self, episode):
        for steps in episode:
            state, action, reward, next_state, done = steps

            if done:
                target = reward
            else:
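                # Bellman target: reward + gamma * max over Q-values of the next state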
                target = reward + self.gamma * \
                       np.amax(self.model.predict(next_state)[0])

            target_state = self.model.predict(state)
            target_state[0][action] = target
            self.model.fit(state, target_state, epochs=2, verbose=0)

    def play(self):
        for i in range(self.episodes):

            state = self.env.reset()
            state = np.reshape(state, [1, self.amount_obs])
            episode = []
            episode_reward = 0
            done = False

            while not done:

                #self.env.render()

                # Decide action
                action = self.act(state)

                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state,
                             [1, self.amount_obs])

                # Save the transition (state, action, reward, next_state, done)
                episode.append(
                        (state, action, reward, next_state, done))

                state = next_state

                episode_reward = episode_reward + reward

            print("Episode {} - totals {}".format(i, episode_reward))
            # Train the episode
            self.train(episode)

        # Decrease epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# initialize gym environment and the agent
model = Model('CartPole-v0')
#model = Model('MountainCar-v0')
#model = Model('Acrobot-v1')
model.play()

I train after each episode on randomly selected data from the previous episodes.
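A minimal sketch of that idea, reusing the question's own Q-learning update (the ReplayBuffer class and the replay function are illustrative names, not part of the original code): every transition goes into one shared buffer, and after each episode the network is trained on a random sample drawn from all past episodes rather than only on the episode that just finished.

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    def __init__(self, capacity=2000):
        # Keep only the most recent `capacity` transitions
        self.memory = deque(maxlen=capacity)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Random minibatch drawn from all episodes seen so far
        return random.sample(self.memory, min(batch_size, len(self.memory)))

def replay(model, buffer, batch_size=50, gamma=0.95):
    # Same target as in train(), but applied to a random minibatch of
    # old transitions instead of the whole most recent episode
    for state, action, reward, next_state, done in buffer.sample(batch_size):
        target = reward
        if not done:
            target = reward + gamma * np.amax(model.predict(next_state)[0])
        target_state = model.predict(state)
        target_state[0][action] = target
        model.fit(state, target_state, epochs=1, verbose=0)

Plugged into the play() loop above, this would mean calling buffer.remember(state, action, reward, next_state, done) at every step and replay(self.model, buffer, self.batch_size, self.gamma) once per episode, in place of self.train(episode).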