Python: PyTorch NN for the CartPole problem minimizes the score

I am trying to solve the CartPole problem from OpenAI Gym by training a simple 2-layer NN in PyTorch. The method used is DQN, but the results converge on a maximum score of around 8 or 9 and do not improve with time or training. Instead, the score gets lower as training goes on. How can this be improved, or what is wrong in the code that makes it behave this way? Here is the code used:

import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from collections import namedtuple
import numpy as np

class network(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        # the network takes the 4 values of the CartPole state observation as input,
        # the hidden layer has 256 units and the network has 2 outputs (the q value of going left or right)
        # in this network the index of the output references the action.
        self.l1 = nn.Linear(4, 256)
        self.l2 = nn.Linear(256, 2)

    def forward(self, x):
        # forward function defines how the model will run
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return (x)

class replay_memory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def save(self, transition):
        # saves all transitions for the environment in a tensor
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        # generates a random sample from the memory
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class agent():
    def __init__(self, env, model):
        self.epsilon = 1  # exploration rate
        self.epsilon_min = 0.001 # smallest exploration value
        self.epsilon_decay = 0.995 # rate at which exploration occurs
        self.learning_rate = 0.001


    def act(self, state, model):
        # define actions, random or optimal based on exploration rate DOES NOT ACCOUNT FOR THE DECAY
        if random.uniform(0, 1) <= self.epsilon:
            action = torch.LongTensor([[random.randrange(2)]])
            action_np = (action.numpy())[0][0]
        else:
            action = model(Variable(torch.FloatTensor([state])).type(torch.FloatTensor)).max(1)[1].view(1,1)
            action_np = (action.numpy())[0][0]

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        else:
            self.epsilon = self.epsilon_min

        return action, action_np

    def trained_act(self, episodes, network, env):
        for e in range(episodes):
            state = env.reset()
            for t in range(200):
                action, action_np = agent.act(state, network)
                next_state, reward, done, info = env.step(action_np)
                env.render()
                if done:
                    break
            print(t)

            env.close()

def learn(batch_size, gamma, memory, optimizer):
    BATCH_SIZE = batch_size
    if len(memory) < BATCH_SIZE:
        return
    # random transition batch is taken from experience replay memory.
    transitions = memory.sample(BATCH_SIZE)
    batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)

    batch_state = Variable(torch.cat(batch_state))
    batch_action = Variable(torch.cat(batch_action))
    batch_reward = Variable(torch.cat(batch_reward))
    batch_next_state = Variable(torch.cat(batch_next_state))

    current_q_values = network.forward(batch_state).gather(1, batch_action.unsqueeze(-1))

    max_next_q_values = network.forward(batch_next_state).detach().max(1)[0]
    expected_q_values = batch_reward + (gamma * max_next_q_values)

    # loss is measured from error between current and newly expected Q values
    loss = F.smooth_l1_loss(expected_q_values, current_q_values)

    # backpropagation of loss to NN
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss



env = gym.make('CartPole-v0')
env.reset()
network = network()
agent = agent(env, network)
batch_size = 50
episode = 500
T = 200
gamma = 0.95
memory = replay_memory(100)
optimizer = optim.SGD(network.parameters(), 0.001)
l = []
s = []

for e in range (episode):
    state = env.reset()
    for t in range (T):
        action, action_np = agent.act(state, network)
        next_state, reward, done, info = env.step(action_np)
        if done:
            reward = -2

    transition = torch.FloatTensor([state]),torch.LongTensor([action]), torch.FloatTensor([reward]),torch.FloatTensor([next_state])

    memory.save(transition)
    state = next_state

    loss = learn(batch_size, gamma, memory, optimizer)
    l.append(loss)
    if done:
        print('Loss = {}, Episode = {}, finished after {} steps'.format(loss, e, t))
        s.append(t)
        break

I would rewrite your training algorithm as:

for e in range (episode):
    state = env.reset()
    done = False
    t = 0
    while not done:
        action, action_np = agent.act(state, network)
        next_state, reward, done, info = env.step(action_np)


        transition = torch.FloatTensor([state]),torch.LongTensor([action]), torch.FloatTensor([reward]),torch.FloatTensor([next_state])

        memory.save(transition)
        state = next_state

        loss = learn(batch_size, gamma, memory, optimizer)
        l.append(loss)
        if t < T:
            t += 1
        else:
            done = True
        if done:
            print('Loss = {}, Episode = {}, finished after {} steps'.format(loss, e, t))
            s.append(t)
            break
Your replay memory is very small. In the DQN paper they use a size of 1*10**6.

You have stated the problem clearly in the question, but it would be good if you also clarified it in the post itself. @Enthus3d The question is: how can this be improved, or what is wrong in the code that makes it do this? Yes, that is a good way of putting it; it would be even better if you added it to your post. @PySeeker I just tried the code again with an increased replay memory and there was no improvement - thanks for the idea though! While this may be an improvement to the code, it does not lead to a higher score at the end of training.
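To make that suggestion concrete, here is a minimal sketch of what a larger buffer could look like as a drop-in replacement for the replay_memory class in the question. The deque-backed implementation and the 100000 capacity are illustrative choices and not part of the original code; the paper's 10**6 would be used the same way.

import random
from collections import deque

class replay_memory():
    # same interface as the replay_memory class above, but backed by a
    # deque whose maxlen drops the oldest transition automatically
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def save(self, transition):
        self.memory.append(transition)

    def sample(self, batch_size):
        # copy into a list before sampling so we do not rely on deque indexing
        return random.sample(list(self.memory), batch_size)

    def __len__(self):
        return len(self.memory)

# a far larger buffer than the capacity of 100 used in the question;
# the DQN paper uses 10**6, which may be more than CartPole needs
memory = replay_memory(100000)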
。你在问题中已经清楚地说明了问题,但如果你也能澄清你的问题也会很好。@Enthus3d问题是如何改进这一点/让它这样做的代码中有什么错误是的,这是一个很好的表达方式。如果你把它添加到你的帖子中,它会变得更好。@PySeeker我刚刚用增加的重播内存再次尝试了该代码,结果没有任何改进-不过谢谢你的想法!虽然这可能是对代码的改进,但不会导致培训结束时的分数提高。