OpenAI's PPO2 model returns NaN when stepping in my custom environment (Python)

I created the following custom environment:

import random

import gym
import numpy as np
from gym import spaces


class Market(gym.Env):
    """This env is for training a BUYING vwap-beating algo with
    OpenAI Gym reinforcement learning algorithms"""
    metadata = {'render.modes': ['human']}

    def __init__(self, list_of_df):
        super(Market, self).__init__()

        self.list_of_df = list_of_df
        self.current_day = list_of_df[0]
        self.reward_range = (-2147483647, 2147483647)
        # self.A_Vol = 0
        self.current_step = 0
        self.last_ind_in_day = len(list_of_df[0]) - 1
        # self.trade_size = 10
        self.A_VWAP = 0
        self.A_rolling_vol  = 0
        self.A_rolling_price = 0
        self.A_vol_left = 1000
        self.reward = 0
        self.done = False

        # To keep track of the agent's VWAP:
        self.cum_VbyP = 0
        self.cum_vol_traded = 0
        self.purchase_vol = 80


        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([3, 1]), dtype=np.float16)


        # Prices contains the OHLC for the 5 min interval
        # Milliseconds from midnight
        # Rolling VWAP for this time period
        # The agent's rolling VWAP, A_VWAP
        # The volume of securities left to buy, A_trgt_vol
        # The volume traded this time step in the market
        self.observation_space = spaces.Box(
            low=-2147483647, high=2147483647, shape=(1, len(list_of_df[1].iloc[2])), dtype=np.float16)

    def _take_action(self, a):
        # Only buy if there are still shares to be bought today.
        if (self.A_vol_left > 0):

          # Buy a[0] times the base purchase volume:
          vol = self.purchase_vol * a[0]
          print(vol)
          # But if there aren't enough shares left to buy:
          if (vol > self.A_vol_left):
            vol = self.A_vol_left

          self.A_vol_left = self.A_vol_left - vol

          # Increase the volume of shares traded:
          self.cum_vol_traded = self.cum_vol_traded + vol

          if (vol > 0):
            # Sample a random price between high and low for this interval:
            price = round( random.uniform(self.current_day['Low'].iloc[self.current_step],
                                  self.current_day['High'].iloc[self.current_step]))

            # Update cumulative price multiplied by volume:
            self.cum_VbyP = self.cum_VbyP + (vol * price)
            # Update the agent's VWAP, A_VWAP:
            self.A_VWAP = self.cum_VbyP / self.cum_vol_traded

    def _next_observation(self):
        frame = np.array([self.current_day.iloc[self.current_step]])
        frame[:, -1] = self.A_VWAP
        frame[:, -2] = self.A_vol_left

        return frame

    def step(self, action):
        # Execute one time step within the environment
        print(action)
        self._take_action(action)

        self.current_step += 1

        reward = 0  # always zero until the last step of the day
        if (self.current_step == self.last_ind_in_day):
          if (self.A_vol_left < 1):
            reward = self.current_day['VWAP'].iloc[self.current_step] - self.A_VWAP
          else:
            reward = -999999
          self.done = True

        obs = self._next_observation()

        return obs, reward, self.done, {}

    def reset(self):
        # Reset the state of the environment to an initial random day
        ind = random.randrange(0,len(self.list_of_df))
        self.current_day = self.list_of_df[ind]

        # Start each episode at the first step within the data frame
        self.current_step = 1
        # self.last_ind_in_day = len(self.list_of_df[0]) - 1
        self.A_VWAP = 0
        self.A_rolling_vol  = 0
        self.A_rolling_price = 0
        self.A_vol_left = 1000
        self.reward = 0
        self.done = False
        self.last_ind_in_day = len(self.list_of_df[ind]) - 1

        # To keep track of the agent's VWAP:
        self.cum_VbyP = 0
        self.cum_vol_traded = 0

        return self._next_observation()


# ====================== End of MARKET class =======================
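
As a minimal sanity check before training (a sketch, assuming list_of_df is your list of per-day OHLC DataFrames with 'Low', 'High', and 'VWAP' columns, as the class expects), you can run one episode with random actions and verify nothing non-finite appears:

# Hypothetical check: step through one episode with random actions and
# verify that no observation contains NaN/inf before handing the env to PPO2.
env = Market(list_of_df)
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
    if not np.isfinite(obs).all():
        print("Non-finite observation at step", env.current_step)
        break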

I added a print in the environment's _take_action and step methods to show the actions being executed, and they are always:

[nan nan]

Here is my GitHub with the full IPython notebook that I run from Google Colab:


I ran into a problem similar to yours, and then I removed the train_df and test_df parts because I didn't actually need them. After that my problem was solved. I suspect your problem may also be related to the dataset.
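
For example, a quick way to check the input data for bad values (a hedged sketch; df stands for whichever DataFrame you pass into Market) might be:

import numpy as np

# Hypothetical dataset check: any NaN or inf in the input data will
# propagate into the observations and poison the policy network.
print(df.isna().sum())                              # NaNs per column
print(np.isinf(df.select_dtypes('number')).sum())   # infs per numeric column

# One possible cleanup, if appropriate for your data:
df = df.replace([np.inf, -np.inf], np.nan).ffill().dropna()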

There is also a blog post whose author has a problem setup similar to yours:

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

train_env = DummyVecEnv([lambda: Market(train_df)])
test_env = DummyVecEnv([lambda: Market(test_df)])

model = PPO2('MlpLstmPolicy', train_env, nminibatches=1, verbose=0)
n_steps = 2000
for i in range(1000):
    model.learn(n_steps)
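
To pinpoint where the NaNs first enter the pipeline, Stable Baselines also provides a VecCheckNan wrapper that raises as soon as a NaN or inf passes through the vectorized environment (a sketch against the Stable Baselines 2 API):

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecCheckNan

# Fail fast on the first NaN/inf in observations, rewards, or actions,
# instead of letting it silently reach the policy network.
train_env = VecCheckNan(DummyVecEnv([lambda: Market(train_df)]), raise_exception=True)

model = PPO2('MlpLstmPolicy', train_env, nminibatches=1, verbose=0)
model.learn(2000)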