Q-learning in R: "argument of length zero"

I am trying to write a simulation of a simplified blackjack game that returns the best policy for each state. The blackjack simulation itself appears to work correctly, but when I apply the Q-learning algorithm to find the optimal policy I get an error. The code below is documented; the error occurs in the Q-learning block (starting around line 170) and is reproducible: "argument of length zero". (Tags: r, machine-learning, reinforcement-learning, q-learning)
#Application reinforcement learning for black jack. We will suppose here that the croupier only has 1 pack of cards
#Initial tabs
packinit = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
#In our game and for simplicifaction of the problem, aces will always count as 1. Other figures are worth 10.
#If both player and croupier have same score, then player looses.
#Croupier will draw cards until he has 17 or more.
handPinit = NULL # will contain hand of player
handCinit = NULL # will contain hand of the croupier
list = list(handPinit, handCinit, packinit)
# Methods ####################################################################################
##############################################################################################
#Random integer, returns an integer to choose card
randInt = function(pack){
int = runif(1) * length(pack)
int = int+1
int = round(int, 0)
return(int)
}
#Picks a card, asimResults it to the desired hand and deletes it from the package.
pickC = function(hand, pack){
int = randInt(pack)
hand = c(hand, pack[int])
pack = pack[-int]
return(list(hand, pack))
}
score = function(handC){
return(sum(handC, na.rm = T))
}
printWinner = function(resultList){
res = resultList[[4]]
p = res[1]
c = res[2]
if((p > c && p <= 21) || (p <= 21 && c > 21)){
cat("Player has won with ", p, ", croupier has ", c, ".\n", sep = "")
}else{
cat("Player has lost with ", p, ", croupier has ", c, ".\n", sep = "")
}
}
#Black jack sim :
simulation = function(handP, handC, pack){
#Matrix to stock choice and next state, 1st is state, 2nd is choice, 3rd is reward, 4th is start state
cs = NULL
#pick first card
temp = NULL
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
#stock result
cs = rbind(cs, c(score(handP), 1, 0.1, 0))
#pick second card
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
#stock result
cs = rbind(cs, c(score(handP), 1, 0.1, cs[length(cs[,1]), 1]))
#reward stock final
reward = NULL
#to change with algo decision
while(score(handC) < 17){
#rand number to choose action, 1 = draw
rand = round(2*runif(1),0)
#if a = 1, draw a card
if(rand == 1 && score(handP) < 21){
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
cs = rbind(cs, c(score(handP), 1, 0.1, cs[length(cs[,1]), 1] ))
}else{
cs = rbind(cs, c(score(handP), 0, 0.1, cs[length(cs[,1]), 1]))
}
#if croupier < 17, he draws a card
if(score(handC) < 17){
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
}
}
#get scores
scores = c(score(handP), score(handC))
resultList = list(handP, handC, pack, scores)
#get reward
res = resultList[[4]]
p = res[1]
c = res[2]
if((p > c && p <= 21) || (p <= 21 && c > 21)){
reward = 100
}else{
reward = -50
}
#AsimResults reward as the reward of the last line of cs
cs[length(cs[,1]), 3] = reward
#return full list
resultList = list(handP, handC, pack, scores, cs)
return(resultList)
}
#Function for simulation, outputs tab containins states, actions and choices
simRand = function(k){
resultsRand = NULL
for(i in 1:k){
#init pack and hands
pack = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
handC = NULL
handP = NULL
#simulation k
res = simulation(handP, handC, pack)
resultsRand = rbind(resultsRand, res[[5]])
#resets for next iteration
pack = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
handC = NULL
handP = NULL
}
return(resultsRand)
}
#test
for(i in 1:10){
results = simulation(handPinit, handCinit, packinit)
printWinner(results)
}
#used to max the Qvalue decision
getRowMax = function(tab){
temp = tab[1]
for(i in 2:length(tab)){
if(tab[i] > temp){
temp = tab[i]
}
}
}
#####################################################################
#Q-learning
#####################################################################
#Represent sets of Q(s, a)
Qvalues = matrix(1, nrow = 30, ncol = 2)
simResults = simRand(1000)
#Hyperparameters
alpha = 0.9
discount = 0.1
#for all rows simulated, update qvalues.
for(i in 1:length(simResults[,1])){
st = simResults[i, 4] #st
a = simResults[i, 2] #a
stPlusOne = simResults[i, 1] #st+1
Qvalues[st, a] = Qvalues[st, a] + alpha * ( simResults[i,3] * discount * getRowMax(Qvalues[stPlusOne, ]) - Qvalues[st, a] )
}
(A machine-translated, garbled duplicate of the R code shown above followed here; it has been removed as redundant — see the original code at the top of the post.)
Answer (as LucyMLi pointed out): First, you need to add `return(temp)` to the `getRowMax()` function. But your simulation has another problem: some of the values in `simResults[, 1]` are 0, which means `Qvalues[stPlusOne, ]` will be empty, so `getRowMax()` cannot be computed.

Comment: Dumping a pile of code on us and asking us to debug it is not good form. You should strip out any code not directly related to your specific problem to make it minimal, state exactly what error you get, and describe what you expected to happen.

OP reply: I'm not asking for debugging, more for hints about direction, since I'm stuck. As for a minimal reproducible example, I think this one is already fairly small — you only need to run the last ~10 lines; the rest are required functions. @LucyMLi, thank you — indeed, in some cases the problem came from how the hands were built. I've now managed to fix it and it seems to work! The root cause was that I used `round(, 0)` believing it would return only the integer part, but it sometimes rounds up to the next integer, producing the out-of-range index and the error.