Python DQN TensorFlow code quickly runs out of memory

I am trying to train a TurtleBot simulation with DQN. The TurtleBot is supposed to find a target in a maze. The task is fairly simple and the training does converge. My problem is that training becomes very slow after a while: it is fast at the beginning, but after roughly 50 runs it slows to a crawl. I have looked into it, and my CPU is not even at 50% utilization, while my memory is almost completely consumed, around 98% of it is in use. Somewhere in my code I am leaking memory, and I suspect it is in the initialization of my DQN agent. Can you tell me what the problem is and how I can fix it?

Thank you very much
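
Side note on the symptom: since memory fills up while the CPU stays mostly idle, one rough check is whether the TensorFlow graph itself keeps growing during training. The snippet below is only a hedged diagnostic sketch and not part of my code; the helper name log_memory_and_graph is made up, and ru_maxrss is reported in kilobytes on Linux.

import resource                 # Unix-only: query process resource usage
import tensorflow as tf

def log_memory_and_graph(episode):
    # number of nodes currently in the default graph; if this count grows
    # every episode, new ops are being created somewhere in the training loop
    n_ops = len(tf.get_default_graph().get_operations())
    # peak resident set size of this process (kilobytes on Linux)
    rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("episode %d: graph ops = %d, peak RSS = %d kB" % (episode, n_ops, rss_kb))

# e.g. call log_memory_and_graph(e) once at the end of each episode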

Here is the training code, based on DQN with a priority buffer:

#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gym
import gym_gazebo
import tensorflow as tf
import numpy as np
import time
import random
from random import *
import cv2
from gym import wrappers
from skimage import transform 
import datetime

import liveplot
from dqn_agent_withTarget import DQNAgent
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#--------------------------------------------------------------------------------------------------------------------------------------
def render():
    render_skip       = 0 #Skip first X episodes.
    render_interval   = 50 #Show render Every Y episodes.
    render_episodes   = 10 #Show Z episodes every rendering.

    if (x%render_interval == 0) and (x != 0) and (x > render_skip):
        env.render()
    elif ((x-render_episodes)%render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
        env.render(close=True)

#--------------------------------------------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    #------------------------------------------------------------------------
    env               = gym.make('GazeboCircuit2TurtlebotLidar-v0')
    outdir            = '/tmp/gazebo_gym_experiments'
    env               = gym.wrappers.Monitor(env, outdir, force=True)
    plotter           = liveplot.LivePlot(outdir)
    last_time_steps   = np.ndarray(0)
    start_time        = time.time()
    total_episodes    = 1000
    max_steps         = 200
    highest_reward    = 0
    gamma             = 0.95
    num_actions       = 3
    action_space      = [0,1,2]
    tf.reset_default_graph()                             # Reset training graph                                   
    myinit            = tf.global_variables_initializer()# Initialize training network 

    #tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.set_verbosity(tf.logging.ERROR)
    #------------------------------------------------------------------------
    agent             = DQNAgent(action_space,"GazeboCircuit2TurtlebotLidar-v0")

    agent.exploration = 1
    cv2.namedWindow("window", 1)
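    # manually overwrite the first fully connected layer W_fc1 (4096x256) with fresh random values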
    x_val = np.random.rand(4096,256).astype(np.float32)
    agent.W_fc1.load(x_val, session=agent.sess)

    for e in range(total_episodes):
        # reset
        linecount = 0
        terminal= False
        win     = 0
        frame   = 0
        loss    = 0.0
        Q_max   = 0.0
        steps   = 0
        reward_t= 0.0
        env.reset()
        cumulated_rewards  = 0
        agent.exploration *= 0.9
        if agent.exploration<0.1:
            agent.exploration=0.1

        _, reward, terminal, info = env.step(0)

        linecount += 1
        print( "Time %s, %s" %(linecount,datetime.datetime.now()))
        img_tmp     = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
        linecount += 1
        print( "Time %s, %s" %(linecount,datetime.datetime.now()))
        state_t_1   = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
        state_t_1   = tf.reshape(state_t_1,(-1,32,32,4))


        while (not terminal):
            steps  += 1
            state_t = state_t_1

            # execute action in environment
            action_t = agent.select_action(state_t, agent.exploration)
            _, reward_t, terminal, info = env.step(action_t)
            #print("step: ", steps, "action: ",action_t ,"reward: ", reward_t)
            print(action_t , end="")
            img_tmp     = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
            state_t_1   = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
            state_t_1   = tf.reshape(state_t_1,(-1,32,32,4))
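            # note: convert_image_dtype and reshape above are graph-building calls,
            # so every step adds new nodes to the default TensorFlow graph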
            # store experience
            agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
            # experience replay
            agent.experience_replay()
            #print(agent.sess.run(agent.W_fc1))

            # for log
            frame += 1
            loss  += agent.current_loss
            Q_max += np.max(agent.Q_values(state_t))
            cumulated_rewards += reward_t



        print(" ")
        print("episodes:",e," steps:",steps," loss:",'{0:.2f}'.format(loss/(steps+1)), " terminal:",terminal, " exploration_factor:",agent.exploration , " reward:", '{0:.2f}'.format(cumulated_rewards))
        plotter.plot(env)
        #print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
        #    e, total_episodes - 1, cumulated_rewards, loss / frame, Q_max / frame))
        env._flush(force=True)
        # print and visualize the current W_fc1 weights in a cv2 window
        weights=agent.sess.run(agent.W_fc1)
        print(weights)
        weights_tmp     = cv2.resize(weights, (256,256), interpolation=cv2.INTER_NEAREST)
        weights_image   = tf.image.convert_image_dtype(weights_tmp, dtype=tf.float32)
        cv2.imshow("window",agent.sess.run(weights_image))
        cv2.waitKey(1)

    # save model
    agent.save_model()    

    env.close()

And here is the DQN agent code (dqn_agent_withTarget.py):

from collections import deque
import os

import numpy as np
import tensorflow as tf


class DQNAgent:
"""
Multi Layer Perceptron with Experience Replay
"""

def __init__(self, enable_actions, environment_name):
    # parameters
    self.name = os.path.splitext(os.path.basename(__file__))[0]
    self.environment_name = environment_name
    self.enable_actions = enable_actions
    self.n_actions = len(self.enable_actions)
    self.minibatch_size = 64
    self.replay_memory_size = 1000
    self.learning_rate = 0.001
    self.discount_factor = 0.9
    self.exploration = 1.0
    self.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
    self.model_name = "{}.ckpt".format(self.environment_name)

    # replay memory
    self.D = deque(maxlen=self.replay_memory_size)

    # model
    self.init_model()

    # variables
    self.current_loss = 0.0

    def init_model(self):

        #policy##################################################################################

        # input layer (32 x 32 x 4)
        self.x = tf.placeholder(tf.float32, [None, 32, 32, 4])

        # convolution layer
        self.W_cv1 = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01)) #4 filters
        self.b_cv1 = tf.Variable(tf.zeros([4]))
        self.c_cv1 = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
        self.h_cv1 = tf.nn.relu(self.c_cv1 + self.b_cv1)

        # flatten (4096)
        self.x_flat = tf.reshape(self.h_cv1, [-1, 4096])

        # fully connected layer [1,256]
        self.W_fc1 = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
        self.b_fc1 = tf.Variable(tf.zeros([256]))
        self.h_fc1 = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)

        # fully connected layer [1,32]
        self.W_fc2 = tf.Variable(tf.truncated_normal([256, 32], stddev=0.01))
        self.b_fc2 = tf.Variable(tf.zeros([32]))
        self.h_fc2 = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)

        # output layer (n_actions)
        self.W_out  = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
        self.b_out  = tf.Variable(tf.zeros([self.n_actions]))
        self.y = tf.matmul(self.h_fc2, self.W_out) + self.b_out

        # loss function
        self.y_   = tf.placeholder(tf.float32, [None, self.n_actions])
        self.loss = tf.reduce_mean(tf.square(self.y_ - self.y))

        # train operation
        optimizer = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
        self.training = optimizer.minimize(self.loss)

        #target######################################################################################

        # input layer (32 x 32 x 4)
        self.x_t = tf.placeholder(tf.float32, [None, 32, 32, 4])

        # convolution layer
        self.W_cv1_t = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01)) #4 filters
        self.b_cv1_t = tf.Variable(tf.zeros([4]))
        self.c_cv1_t = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
        self.h_cv1_t = tf.nn.relu(self.c_cv1 + self.b_cv1)

        # flatten (4096)
        self.x_flat_t = tf.reshape(self.h_cv1, [-1, 4096])

        # fully connected layer [1,256]
        self.W_fc1_t = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
        self.b_fc1_t = tf.Variable(tf.zeros([256]))
        self.h_fc1_t = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)

        # fully connected layer [1,32]
        self.W_fc2_t = tf.Variable(tf.truncated_normal([256, 32], stddev=0.01))
        self.b_fc2_t = tf.Variable(tf.zeros([32]))
        self.h_fc2_t = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)

        # output layer (n_actions)
        self.W_out_t  = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
        self.b_out_t  = tf.Variable(tf.zeros([self.n_actions]))
        self.y_t = tf.matmul(self.h_fc2, self.W_out) + self.b_out

        # loss function
        self.y__t   = tf.placeholder(tf.float32, [None, self.n_actions])
        self.loss_t = tf.reduce_mean(tf.square(self.y_ - self.y))

        # train operation
        optimizer_t = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
        self.training_t = optimizer.minimize(self.loss)

        #general################################################################################

        # saver
        self.saver = tf.train.Saver()

        # session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def Q_values(self, state):
        # Q(state, action) of all actions
        #print("QQQ VALUES______________________________________________",self.sess.run(state))
        x_tmp = self.sess.run(state)
        return self.sess.run(self.y, feed_dict={self.x: x_tmp}) #[0]

    def select_action(self, state, epsilon):
        if np.random.rand() <= epsilon:
            # random
            return np.random.choice(self.enable_actions)
        else:
            # max_action Q(state, action)
            #print("G" , end="")
            return self.enable_actions[np.argmax(self.Q_values(state))]

    def store_experience(self, state, action, reward, state_1, terminal):
        self.D.append((state, action, reward, state_1, terminal))

    def experience_replay(self):
        state_minibatch = []
        y_minibatch = []

        # sample random minibatch
        minibatch_size = min(len(self.D), self.minibatch_size)
        minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)

        for j in minibatch_indexes:
            state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
            action_j_index = self.enable_actions.index(action_j)

            y_j = self.Q_values(state_j)[0]

            if terminal:
                y_j[action_j_index] = reward_j
            else:
                # reward_j + gamma * max_action' Q(state', action')
                y_j[action_j_index] = reward_j + self.discount_factor * np.max(self.Q_values(state_j_1))  # NOQA

            x_tmp = self.sess.run(state_j)
            y_j = np.reshape(y_j, (1, 3))
            state_minibatch.append(x_tmp[0])
            y_minibatch.append(y_j[0])

        # training
        self.sess.run(self.training, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})

        # for log
        self.current_loss = self.sess.run(self.loss, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})

    def load_model(self, model_path=None):
        if model_path:
            # load from model_path
            self.saver.restore(self.sess, model_path)
        else:
            # load from checkpoint
            checkpoint = tf.train.get_checkpoint_state(self.model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                self.saver.restore(self.sess, checkpoint.model_checkpoint_path)

    def save_model(self):
        self.saver.save(self.sess, os.path.join(self.model_dir, self.model_name))