Python: score minimising with a PyTorch NN on the CartPole problem
I am trying to solve the CartPole problem from OpenAI Gym by training a simple two-layer NN in PyTorch. The method used is DQN, yet the result converges on a maximum score of around 8 or 9 and does not improve over time or with training; instead, the score gets lower the longer it trains. How can this be improved, and what in the code is making it behave this way? Below is the code used:
import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from collections import namedtuple
import numpy as np
class network(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        # network takes the 4-dimensional CartPole state as input, the hidden layer has
        # 256 units and the network has 2 outputs (the q value of going left or right)
        # in this network the index of the output references the action.
        self.l1 = nn.Linear(4, 256)
        self.l2 = nn.Linear(256, 2)

    def forward(self, x):
        # forward function defines how the model will run
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return x
class replay_memory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def save(self, transition):
        # saves all transitions for the environment
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        # generates a random sample from the memory
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
class agent():
    def __init__(self, env, model):
        self.epsilon = 1  # exploration rate
        self.epsilon_min = 0.001  # smallest exploration value
        self.epsilon_decay = 0.995  # rate at which exploration occurs
        self.learning_rate = 0.001

    def act(self, state, model):
        # define actions, random or optimal based on exploration rate DOES NOT ACCOUNT FOR THE DECAY
        if random.uniform(0, 1) <= self.epsilon:
            action = torch.LongTensor([[random.randrange(2)]])
            action_np = (action.numpy())[0][0]
        else:
            action = model(Variable(torch.FloatTensor([state])).type(torch.FloatTensor)).max(1)[1].view(1, 1)
            action_np = (action.numpy())[0][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        else:
            self.epsilon = self.epsilon_min
        return action, action_np

    def trained_act(self, episodes, network, env):
        for e in range(episodes):
            state = env.reset()
            for t in range(200):
                action, action_np = agent.act(state, network)
                next_state, reward, done, info = env.step(action_np)
                env.render()
                if done:
                    break
            print(t)
        env.close()
def learn(batch_size, gamma, memory, optimizer):
    BATCH_SIZE = batch_size
    if len(memory) < BATCH_SIZE:
        return
    # random transition batch is taken from experience replay memory
    transitions = memory.sample(BATCH_SIZE)
    batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)
    batch_state = Variable(torch.cat(batch_state))
    batch_action = Variable(torch.cat(batch_action))
    batch_reward = Variable(torch.cat(batch_reward))
    batch_next_state = Variable(torch.cat(batch_next_state))
    current_q_values = network.forward(batch_state).gather(1, batch_action.unsqueeze(-1))
    max_next_q_values = network.forward(batch_next_state).detach().max(1)[0]
    expected_q_values = batch_reward + (gamma * max_next_q_values)
    # loss is measured from error between current and newly expected Q values
    loss = F.smooth_l1_loss(expected_q_values, current_q_values)
    # backpropagation of loss to NN
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss
env = gym.make('CartPole-v0')
env.reset()
network = network()
agent = agent(env, network)

batch_size = 50
episode = 500
T = 200
gamma = 0.95
memory = replay_memory(100)
optimizer = optim.SGD(network.parameters(), 0.001)
l = []
s = []

for e in range(episode):
    state = env.reset()
    for t in range(T):
        action, action_np = agent.act(state, network)
        next_state, reward, done, info = env.step(action_np)
        if done:
            reward = -2
        transition = torch.FloatTensor([state]), torch.LongTensor([action]), torch.FloatTensor([reward]), torch.FloatTensor([next_state])
        memory.save(transition)
        state = next_state
        loss = learn(batch_size, gamma, memory, optimizer)
        l.append(loss)
        if done:
            print('Loss = {}, Episode = {}, finished after {} steps'.format(loss, e, t))
            s.append(t)
            break
I would rewrite your training algorithm as:
for e in range(episode):
    state = env.reset()
    done = False
    t = 0
    while not done:
        action, action_np = agent.act(state, network)
        next_state, reward, done, info = env.step(action_np)
        transition = torch.FloatTensor([state]), torch.LongTensor([action]), torch.FloatTensor([reward]), torch.FloatTensor([next_state])
        memory.save(transition)
        state = next_state
        loss = learn(batch_size, gamma, memory, optimizer)
        l.append(loss)
        if t < T:
            t += 1
        else:
            done = True
        if done:
            print('Loss = {}, Episode = {}, finished after {} steps'.format(loss, e, t))
            s.append(t)
            break
Also, your replay memory is very small. In the DQN paper they use a size of 1 * 10**6.
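A minimal sketch of that change, reusing the replay_memory class from the question (the 10**6 capacity is the value from the DQN paper, not something tuned for CartPole):

# assumption: replay_memory is the class defined in the question;
# a larger buffer lets learn() sample from a much wider range of past transitions
memory = replay_memory(10**6)  # instead of replay_memory(100)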
Comments:
- You have clearly described the problem in your post, but it would also be good if you stated your question explicitly.
- @Enthus3d The question is: how can this be improved, and what in the code is making it behave this way?
- Yes, that is a good way of phrasing it; adding it to your post would make it even better.
- @PySeeker I just tried the code again with the increased replay memory and the result did not improve at all. Thanks for the idea, though! While it may be an improvement to the code, it does not lead to higher scores at the end of training.