Java中玩具神经网络的DQN学习算法
我正在为一个Wumpus游戏写一个DQN代理,并使用目标网络/策略网络的组合。根据作业要求,我使用了自己的玩具神经网络实现。我知道训练可能需要很长时间,但我想先确认代码是正确的,这样就不会浪费时间用错误的算法训练网络。当我运行此程序时,代理的表现比随机行走者还差,我实在不知所措,任何帮助都将不胜感激。网络似乎更偏好"禁止"的行为,例如不在坑中时攀爬、在没有金子的地方捡金子。代码如下:首先是构造器与选择行动的方法。
/**
*做一个epsilon delta动作,要么是随机动作,要么由网络决定。
*
*@返回epsilon delta操作的字符串表示形式。
*/
private String chooseAction() {
    if (r.nextDouble() < this.epsilon) {
        this.epsilon -= DELTA_EPSILON;
        this.epsilon = Math.max(this.epsilon, MIN_EPSILON);
        return ACTIONS[r.nextInt(ACTIONS_SIZE)];
    } else {
        Matrix<SMatrix> prediction = this.policyNetwork.predict(encoding(this.state));
        int argMax = prediction.argMax();
        return ACTIONS[argMax];
    }
}
最后是学习:
/**
 * Fit the policy network on one random mini-batch drawn from replay memory.
 * Does nothing until the memory holds at least {@code MEMORY_SAMPLE_SIZE} transitions.
 */
private void train() {
    if (this.memory.size() < MEMORY_SAMPLE_SIZE) {
        return; // Not enough recorded experience to sample a batch yet.
    }
    // Turn every sampled transition into a Bellman-fitted training example.
    List<NetworkInput<SMatrix>> fitted = new ArrayList<>();
    for (Transition transition : this.memory.sample()) {
        fitted.add(transitionToNetworkInput(transition));
    }
    // Single epoch over the mini-batch.
    this.policyNetwork.train(fitted, 1);
}
/**
*使用内存样本训练网络。
*/
private void train() {
    if (this.memory.size() < MEMORY_SAMPLE_SIZE) {
        return; // 还没有可训练的数据。
    }
    List<Transition> batch = this.memory.sample();
    List<NetworkInput<SMatrix>> training = new ArrayList<>();
    // Bellman 拟合
    for (Transition b : batch) {
        training.add(transitionToNetworkInput(b));
    }
    this.policyNetwork.train(training, 1);
}
TransitionOnNetworkInput方法定义为:
/**
 * Simple mapper which takes as input a Transition and outputs a NetworkInput for the neural
 * network to train on. Uses the Bellman equation to perform Q value fitting.
 *
 * The label is the policy network's current Q-vector for the previous state, with the
 * entry of the action actually taken overwritten by its Bellman target: the bare reward
 * for terminal transitions, otherwise reward + DISCOUNT_FACTOR * max_a Q_target(newState, a).
 * All other entries keep the network's own prediction, so only the taken action carries error.
 *
 * @param t Transition from sample
 *
 * @return network input (data (old state), label (q values))
 */
private NetworkInput<SMatrix> transitionToNetworkInput(Transition t) {
    int actionIndex = ACTION_MAP.get(t.action); // position of the taken action in the output layer
    double reward = t.reward; // immediate reward observed for this transition
    // Encode the previous state ONCE and reuse it: it is needed both for the policy-network
    // prediction and as the training input itself. The original encoded it twice.
    // (Assumes encoding() is a pure function of the World — TODO confirm.)
    Matrix<SMatrix> prevEncoded = encoding(t.prevState);
    // Current Q estimates of the old state with the policy network (as done in doAction()).
    Matrix<SMatrix> previousStatePrediction = this.policyNetwork.predict(prevEncoded);
    // Future Q estimates of the new state from the (frozen) target network.
    Matrix<SMatrix> newStatePrediction = this.targetNetwork.predict(encoding(t.newState));
    // Overwrite only the taken action's Q value with its Bellman target.
    double[][] previousQValues = previousStatePrediction.rawCopy();
    previousQValues[actionIndex][0]
            = t.isDone ? reward
                       : (reward + DISCOUNT_FACTOR * newStatePrediction.max());
    return new NetworkInput<>(prevEncoded, new SimpleMatrix(previousQValues));
}
/**
*简单的映射器,将转换作为输入,并为神经网络输出网络输入
*网络训练。使用Bellman方程进行Q值拟合。
*
*@param t从样本转换
*
*@返回网络输入(数据(旧状态)、标签(q值))
*/
private NetworkInput<SMatrix> transitionToNetworkInput(Transition t) {
    // 来自转换的原始数据。
    String action = t.action;
    int actionIndex = ACTION_MAP.get(action); // 此游戏可能的操作
    double reward = t.reward; // 此状态下的游戏分数。
    World prevState = t.prevState;
    World newState = t.newState;
    // 使用策略网络预测旧状态的Q值(如doAction()中所做)
    Matrix<SMatrix> previousStatePrediction = this.policyNetwork.predict(encoding(prevState));
    // 预测newState未来的Q值,即什么样的操作会使得分最大化。
    Matrix<SMatrix> newStatePrediction = this.targetNetwork.predict(encoding(newState));
    // 将所采取行动的值设置为奖励,或奖励 + 折扣因子 * 未来Q值的最大值。
    double[][] previousQValues = previousStatePrediction.rawCopy();
    previousQValues[actionIndex][0]
        = t.isDone ? reward
        : (reward + DISCOUNT_FACTOR * newStatePrediction.max());
    return new NetworkInput<>(encoding(prevState), new SimpleMatrix(previousQValues));
}
/**
 * Choose an epsilon-greedy action: with probability epsilon a uniformly random action
 * (exploration), otherwise the action with the highest Q value predicted by the policy
 * network (exploitation).
 *
 * Fix: epsilon is annealed on EVERY call instead of only inside the exploration branch.
 * In the original, the decay only fired with probability epsilon, so the decay rate was
 * proportional to epsilon itself — annealing slows drastically and the agent keeps taking
 * random (often "forbidden") actions far longer than intended.
 *
 * @return String representation of the chosen epsilon-greedy action.
 */
private String chooseAction() {
    // Anneal the exploration rate once per decision, clamped at the floor.
    this.epsilon = Math.max(this.epsilon - DELTA_EPSILON, MIN_EPSILON);
    if (r.nextDouble() < this.epsilon) {
        // Explore: uniformly random action.
        return ACTIONS[r.nextInt(ACTIONS_SIZE)];
    }
    // Exploit: greedy action under the current policy network.
    Matrix<SMatrix> prediction = this.policyNetwork.predict(encoding(this.state));
    return ACTIONS[prediction.argMax()];
}
/**
 * Draw a mini-batch from replay memory and run one training pass of the
 * policy network over the Bellman-fitted examples.
 */
private void train() {
    // Only train once enough experience has accumulated for a full sample.
    if (this.memory.size() >= MEMORY_SAMPLE_SIZE) {
        List<Transition> sampled = this.memory.sample();
        List<NetworkInput<SMatrix>> examples = new ArrayList<>(sampled.size());
        for (Transition transition : sampled) {
            // Bellman fitting of each transition.
            examples.add(transitionToNetworkInput(transition));
        }
        this.policyNetwork.train(examples, 1);
    }
}
/**
 * Maps a replay-memory Transition onto a supervised training example for the
 * network: the input is the encoded previous state, and the label is the policy
 * network's Q-value vector with the taken action's entry replaced by its
 * Bellman target (Q value fitting).
 *
 * @param t Transition from sample
 *
 * @return network input (data (old state), label (q values))
 */
private NetworkInput<SMatrix> transitionToNetworkInput(Transition t) {
    int actionIndex = ACTION_MAP.get(t.action); // index of the taken action
    // Current Q estimates for the previous state (policy network, as in doAction()).
    Matrix<SMatrix> qPrevious = this.policyNetwork.predict(encoding(t.prevState));
    // Future Q estimates for the successor state (target network).
    Matrix<SMatrix> qNext = this.targetNetwork.predict(encoding(t.newState));
    // Build the label: copy the current predictions, then overwrite the taken
    // action's entry with reward, or reward + discounted best future Q value.
    double[][] label = qPrevious.rawCopy();
    double target = t.reward;
    if (!t.isDone) {
        target += DISCOUNT_FACTOR * qNext.max();
    }
    label[actionIndex][0] = target;
    return new NetworkInput<>(encoding(t.prevState), new SimpleMatrix(label));
}