n-Armed Bandit Simulation in R


I'm learning reinforcement learning with Sutton and Barto's ebook, Reinforcement Learning: An Introduction. I'm having some problems trying to reproduce their results (the plots) on the computer.

More specifically, how do I simulate the greedy value for each task? The book says:

...we can plot the performance and behavior of the various methods as they improve with experience over 1000 plays...

So I figure I have to keep track of the exploratory values as better ones are discovered. The problem is how to do this with the greedy approach: since there are no exploratory moves, how do I know what greedy behavior is?

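To make the question concrete: my current understanding is that acting "greedily" simply means always pulling the arm with the highest running-average reward estimate so far, roughly like the sketch below (the variable names are just illustrative):

Q <- rep(0, arms)   # running-average reward estimate per arm
N <- rep(0, arms)   # number of pulls per arm
for (p in 1:plays) {
  a    <- which.max(Q)              # greedy choice: arm with best current estimate
  r    <- rewards.source[p, a]      # observe a reward for that arm
  N[a] <- N[a] + 1
  Q[a] <- Q[a] + (r - Q[a]) / N[a]  # incremental sample-average update
}

Is that the right way to think about it, or am I missing something?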
Thanks for your comments and answers.


Update: please see the code in my answer.

Based on our chat, here is what I've got so far:

set.seed(1)

getRewardsGaussian <- function(arms, plays) {
## assuming each action has a normal distribution 

  # first generate new means
  QStar <- rnorm(arms, 0, 1)

  # then for each mean, generate `play`-many samples
  sapply(QStar, function(u)
    rnorm(plays, u, 1))
}


CalculateRewardsPerMethod <- function(arms=7, epsi1=0.01, epsi2=0.1
                    , plays=1000, methods=c("greedy", "epsi1", "epsi2")) {

  # names for easy handling
  names(methods) <- methods
  arm.names <- paste0("Arm", ifelse((1:arms)<10, 0, ""), 1:arms)

  # this could be different if not all actions' rewards have a gaussian dist.
  rewards.source <- getRewardsGaussian(arms, plays) 

  # Three dimensional array to track running averages of each method
  running.avgs <- 
    array(0, dim=c(plays, arms, length(methods))
           , dimnames=list(PlayNo.=NULL, Arm=arm.names, Method=methods))

  # Three dimensional array to track the outcome of each play, according to each method 
  rewards.received <- 
    array(NA_real_, dim=c(plays, 2, length(methods))
                  , dimnames=list(PlayNo.=seq(plays), Outcome=c("Arm", "Reward"), Method=methods))


  # define the function internally to not have to pass running.avgs 
  chooseAnArm <- function(p) {
    # Note that in a tie, which.max returns the first (lowest) index, which is what we want
    maxes <- apply(running.avgs[p, ,methods, drop=FALSE], 3, which.max)

    # Note: deliberately drawing two separate random numbers and keeping this as 
    #       two lines of code to accent that the two draws should not be related 
    if(runif(1) < epsi1)
      maxes["epsi1"] <- sample(arms, 1)

    if(runif(1) < epsi2)
      maxes["epsi2"] <- sample(arms, 1)

    return(maxes)
  }

  ## TODO:  Perform each action at least once, then select according to algorithm
  ## Starting points. Everyone starts at machine 3
  choice <- c(3, 3, 3)
  reward <- rewards.source[1, choice]
  ## First run, slightly different
  rewards.received[1,,] <- rbind(choice, reward)
  running.avgs[1, choice, ] <- reward # if different starting points, this needs to change like below

  ## HERE IS WHERE WE START PULLING THE LEVERS ##
  ## ----------------------------------------- ##
  for (p in 2:plays) {
    choice <- chooseAnArm(p)
    reward <- rewards.source[p, choice]

    # Note: When dropping a dim, the methods will be the columns 
    #       and the Outcome info will be the rows. Use `rbind` instead of `cbind`.
    rewards.received[p,,names(choice)] <- rbind(choice, reward)

    ## Update the running averages. 
    ## For each method, the current running averages are the same as the
    ##    previous for all arms, except for the one chosen this round.
    ##    Thus start with last round's averages, then update the one arm.
    running.avgs[p,,] <- running.avgs[p-1,,]

  # The updating is the only involved part (due to lots of array-indexing)
    running.avgs[p,,][cbind(choice, 1:3)] <- 
     sapply(names(choice), function(m) 
       # Update the running average for the selected arm (for the current play & method) 
          mean( rewards.received[ 1:p,,,drop=FALSE][ rewards.received[1:p,"Arm",m] == choice[m],"Reward",m])
     )
  } # end for-loop


  ## DIFFERENT RETURN OPTIONS ##
  ## ------------------------ ##


  ## All rewards received, in simplified matrix (dropping information on arm chosen)
  # return(rewards.received[, "Reward", ])

  ## All rewards received, along with which arm chosen: 
  #   return(rewards.received)

  ## Running averages of the rewards received by method
  return( apply(rewards.received[, "Reward", ], 2, cumsum) / (1:plays) )

}


### EXECUTION (AND SIMULATION)

## PARAMETERS
arms   <- 10
plays  <- 1000
epsi1  <- 0.01
epsi2  <- 0.1
simuls <- 50  # 2000
methods <- c("greedy", "epsi1", "epsi2")

## Single Iteration: 
### we can run system time to get an idea for how long one will take
tme <- system.time( CalculateRewardsPerMethod(arms=arms, epsi1=epsi1, epsi2=epsi2, plays=plays) )
cat("Expected run time is approx: ", round((simuls * tme[["elapsed"]]) / 60, 1), " minutes")

## Multiple iterations (simulations)
rewards.received.list <- replicate(simuls, CalculateRewardsPerMethod(arms=arms, epsi1=epsi1, epsi2=epsi2, plays=plays), simplify="array")

## Compute average across simulations
rewards.received <- apply(rewards.received.list, 1:2, mean)

## RESULTS
head(rewards.received, 17)
MeanRewards <- rewards.received

## If using an alternate return method in `Calculate..` use the two lines below to calculate running avg
#   CumulRewards <- apply(rewards.received, 2, cumsum)
#   MeanRewards  <- CumulRewards / (1:plays)

## PLOT
plot.ts(MeanRewards[, "greedy"], col = 'red', lwd = 2, ylim = range(MeanRewards), ylab = 'Average reward', xlab="Plays")
  lines(MeanRewards[, "epsi1"], col = 'orange', lwd = 2)
  lines(MeanRewards[, "epsi2"], col = 'navy', lwd = 2)
  grid(col = 'darkgray')

  legend('bottomright', c('greedy', paste("epsi1 =", epsi1), paste("epsi2 =", epsi2)), col = c('red', 'orange', 'navy'), lwd = 2, cex = 0.8)
I finally got it right. As the book points out, the eps players should beat the purely greedy player because of their exploratory moves. The code is slow and needs some optimization, but here it is:

get.testbed = function(arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1){

  optimal = rnorm(arms, u, sdev.arm)
  rewards = sapply(optimal, function(x)rnorm(plays, x, sdev.rewards))

  list(optimal = optimal, rewards = rewards)
}

play.slots = function(arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1, eps = 0.1){

  testbed = get.testbed(arms, plays, u, sdev.arm, sdev.rewards)
  optimal = testbed$optimal
  rewards = testbed$rewards

  optim.index = which.max(optimal)
  slot.rewards = rep(0, arms)
  reward.hist = rep(0, plays)
  optimal.hist = rep(0, plays)
  pulls = rep(0, arms)
  probs = runif(plays)

  # TODO: vectorize this loop
  for (i in 1:plays){

      ## don't use ifelse() in this case
      ## idx = ifelse(probs[i] < eps, sample(arms, 1), which.max(slot.rewards))

      idx = if (probs[i] < eps) sample(arms, 1) else which.max(slot.rewards)
      reward.hist[i] = rewards[i, idx]

      if (idx == optim.index)
        optimal.hist[i] = 1

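      # incremental sample-average update: Q <- Q + (reward - Q) / n_pulls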
      slot.rewards[idx] = slot.rewards[idx] + (rewards[i, idx] - slot.rewards[idx])/(pulls[idx] + 1)
      pulls[idx] = pulls[idx] + 1
  }

  list(slot.rewards = slot.rewards, reward.hist = reward.hist, optimal.hist = optimal.hist, pulls = pulls)
}

do.simulation = function(N = 100, arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1, eps = c(0.0, 0.01, 0.1)){

  n.players = length(eps)
  col.names = paste('eps', eps)
  rewards.hist = matrix(0, nrow = plays, ncol = n.players)
  optim.hist = matrix(0, nrow = plays, ncol = n.players)
  colnames(rewards.hist) = col.names
  colnames(optim.hist) = col.names

  for (p in 1:n.players){
    for (i in 1:N){
      play.results = play.slots(arms, plays, u, sdev.arm, sdev.rewards, eps[p])
      rewards.hist[, p] = rewards.hist[, p] + play.results$reward.hist
      optim.hist[, p] = optim.hist[, p] + play.results$optimal.hist
    } 
  }

  rewards.hist = rewards.hist/N
  optim.hist = optim.hist/N
  optim.hist = apply(optim.hist, 2, function(x)cumsum(x)/(1:plays))

  ### Plot helper ###
  plot.result = function(x, n.series, colors, leg.names, ...){
    for (i in 1:n.series){
      if (i == 1)
        plot.ts(x[, i], ylim = 2*range(x), col = colors[i], ...)
      else
        lines(x[, i], col = colors[i], ...)
      grid(col = 'lightgray')
    }
    legend('topleft', leg.names, col = colors, lwd = 2, cex = 0.6, box.lwd = NA)
  }
  ### Plot helper ###

  #### Plots ####
  require(RColorBrewer)
  colors = brewer.pal(n.players + 3, 'Set2')
  op <-par(mfrow = c(2, 1), no.readonly = TRUE)

  plot.result(rewards.hist, n.players, colors, col.names, xlab = 'Plays', ylab = 'Average reward', lwd = 2)
  plot.result(optim.hist, n.players, colors, col.names, xlab = 'Plays', ylab = 'Optimal move %', lwd = 2)
  #### Plots ####

  par(op)
}

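To get curves in the spirit of the book's 10-armed testbed (2000 bandit problems, 1000 plays each), a call along these lines should work, though it will take a while with this implementation:

do.simulation(N = 2000, arms = 10, plays = 1000, eps = c(0, 0.01, 0.1))

With the defaults (N = 100, arms = 10, plays = 500) it runs much faster and is enough for a quick sanity check.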

You may also want to check this link.

A copy of the relevant code from the above source. It does not use R, just np.random.rand() from numpy.

import numpy as np


class eps_bandit:
    '''
    epsilon-greedy k-bandit problem

    Inputs
    =====================================================
    k: number of arms (int)
    eps: probability of random action 0 < eps < 1 (float)
    iters: number of steps (int)
    mu: set the average rewards for each of the k-arms.
        Set to "random" for the rewards to be selected from
        a normal distribution with mean = 0.
        Set to "sequence" for the means to be ordered from
        0 to k-1.
        Pass a list or array of length = k for user-defined
        values.
    '''

    def __init__(self, k, eps, iters, mu='random'):
        # Number of arms
        self.k = k
        # Search probability
        self.eps = eps
        # Number of iterations
        self.iters = iters
        # Step count
        self.n = 0
        # Step count for each arm
        self.k_n = np.zeros(k)
        # Total mean reward
        self.mean_reward = 0
        self.reward = np.zeros(iters)
        # Mean reward for each arm
        self.k_reward = np.zeros(k)

        if type(mu) == list or type(mu).__module__ == np.__name__:
            # User-defined averages
            self.mu = np.array(mu)
        elif mu == 'random':
            # Draw means from probability distribution
            self.mu = np.random.normal(0, 1, k)
        elif mu == 'sequence':
            # Increase the mean for each arm by one
            self.mu = np.linspace(0, k-1, k)

    def pull(self):
        # Generate random number
        p = np.random.rand()
        if self.eps == 0 and self.n == 0:
            a = np.random.choice(self.k)
        elif p < self.eps:
            # Randomly select an action
            a = np.random.choice(self.k)
        else:
            # Take greedy action
            a = np.argmax(self.k_reward)

        reward = np.random.normal(self.mu[a], 1)

        # Update counts
        self.n += 1
        self.k_n[a] += 1

        # Update total
        self.mean_reward = self.mean_reward + (
            reward - self.mean_reward) / self.n

        # Update results for a_k
        self.k_reward[a] = self.k_reward[a] + (
            reward - self.k_reward[a]) / self.k_n[a]

    def run(self):
        for i in range(self.iters):
            self.pull()
            self.reward[i] = self.mean_reward

    def reset(self):
        # Resets results while keeping settings
        # (fixed: use self.k and self.iters rather than undefined k / iters)
        self.n = 0
        self.k_n = np.zeros(self.k)
        self.mean_reward = 0
        self.reward = np.zeros(self.iters)
        self.k_reward = np.zeros(self.k)
You may also opt to use the R package "contextual", which aims to ease the implementation and evaluation of both context-free (as described in Sutton & Barto) and contextual multi-armed bandit policies.

The package actually covers how to replicate all of the Sutton & Barto bandit plots. For example, to generate the ε-greedy plots, simply simulate EpsilonGreedy policies against a Gaussian bandit:

library(contextual)

set.seed(2)
mus             <- rnorm(10, 0, 1)
sigmas          <- rep(1, 10)
bandit          <- BasicGaussianBandit$new(mu_per_arm = mus, sigma_per_arm = sigmas)

agents          <- list(Agent$new(EpsilonGreedyPolicy$new(0),    bandit, "e = 0, greedy"),
                        Agent$new(EpsilonGreedyPolicy$new(0.1),  bandit, "e = 0.1"),
                        Agent$new(EpsilonGreedyPolicy$new(0.01), bandit, "e = 0.01"))

simulator       <- Simulator$new(agents = agents, horizon = 1000, simulations = 2000)
history         <- simulator$run()

plot(history, type = "average", regret = FALSE, lwd = 1, legend_position = "bottomright")
plot(history, type = "optimal", lwd = 1, legend_position = "bottomright")