Java中玩具神经网络的DQN学习算法
我正在为一个Wumpus游戏写一个DQN代理,并使用目标网络/策略网络的组合。根据作业要求,我使用了自己的玩具神经网络实现。我知道训练可能需要很长时间,但我想先确认代码是正确的,这样就不会浪费时间用错误的算法训练网络。当我运行此程序时,代理的表现比随机行走者还差,我实在不知所措,任何帮助都将不胜感激。网络似乎更偏好"禁止"的行为,例如不在坑中时攀爬、在没有金子的地方捡金子。代码如下:首先是构造器与选择行动的方法。
/**
*做一个epsilon delta动作,要么是随机动作,要么由网络决定。
*
*@返回epsilon delta操作的字符串表示形式。
*/
private String chooseAction() {
    if (r.nextDouble() < this.epsilon) {
        this.epsilon -= DELTA_EPSILON;
        this.epsilon = Math.max(this.epsilon, MIN_EPSILON);
        return ACTIONS[r.nextInt(ACTIONS_SIZE)];
    } else {
        Matrix<SMatrix> prediction = this.policyNetwork.predict(encoding(this.state));
        int argMax = prediction.argMax();
        return ACTIONS[argMax];
    }
}
最后是学习:
/**
 * Fit the policy network on one random mini-batch drawn from replay memory.
 * Does nothing until the memory holds at least {@code MEMORY_SAMPLE_SIZE} transitions.
 */
private void train() {
    if (this.memory.size() < MEMORY_SAMPLE_SIZE) {
        return; // Not enough recorded experience to sample a batch yet.
    }
    // Turn every sampled transition into a Bellman-fitted training example.
    List<NetworkInput<SMatrix>> fitted = new ArrayList<>();
    for (Transition transition : this.memory.sample()) {
        fitted.add(transitionToNetworkInput(transition));
    }
    // Single epoch over the mini-batch.
    this.policyNetwork.train(fitted, 1);
}
/**
*使用内存样本训练网络。
*/
private void train() {
    if (this.memory.size() < MEMORY_SAMPLE_SIZE) {
        return; // 还没有可训练的数据。
    }
    List<Transition> batch = this.memory.sample();
    List<NetworkInput<SMatrix>> training = new ArrayList<>();
    // Bellman 拟合
    for (Transition b : batch) {
        training.add(transitionToNetworkInput(b));
    }
    this.policyNetwork.train(training, 1);
}
TransitionOnNetworkInput方法定义为:
/**
 * Simple mapper which takes as input a Transition and outputs a NetworkInput for the neural
 * network to train on. Uses the Bellman equation to perform Q value fitting.
 *
 * The label is the policy network's current Q-vector for the previous state, with the
 * entry of the action actually taken overwritten by its Bellman target: the bare reward
 * for terminal transitions, otherwise reward + DISCOUNT_FACTOR * max_a Q_target(newState, a).
 * All other entries keep the network's own prediction, so only the taken action carries error.
 *
 * @param t Transition from sample
 *
 * @return network input (data (old state), label (q values))
 */
private NetworkInput<SMatrix> transitionToNetworkInput(Transition t) {
    int actionIndex = ACTION_MAP.get(t.action); // position of the taken action in the output layer
    double reward = t.reward; // immediate reward observed for this transition
    // Encode the previous state ONCE and reuse it: it is needed both for the policy-network
    // prediction and as the training input itself. The original encoded it twice.
    // (Assumes encoding() is a pure function of the World — TODO confirm.)
    Matrix<SMatrix> prevEncoded = encoding(t.prevState);
    // Current Q estimates of the old state with the policy network (as done in doAction()).
    Matrix<SMatrix> previousStatePrediction = this.policyNetwork.predict(prevEncoded);
    // Future Q estimates of the new state from the (frozen) target network.
    Matrix<SMatrix> newStatePrediction = this.targetNetwork.predict(encoding(t.newState));
    // Overwrite only the taken action's Q value with its Bellman target.
    double[][] previousQValues = previousStatePrediction.rawCopy();
    previousQValues[actionIndex][0]
            = t.isDone ? reward
                       : (reward + DISCOUNT_FACTOR * newStatePrediction.max());
    return new NetworkInput<>(prevEncoded, new SimpleMatrix(previousQValues));
}
/**
*简单的映射器,将转换作为输入,并为神经网络输出网络输入
*网络训练。使用Bellman方程进行Q值拟合。
*
*@param t从样本转换
*
*@返回网络输入(数据(旧状态)、标签(q值))
*/
private NetworkInput<SMatrix> transitionToNetworkInput(Transition t) {
    // 来自转换的原始数据。
    String action = t.action;
    int actionIndex = ACTION_MAP.get(action); // 此游戏可能的操作
    double reward = t.reward; // 此状态下的游戏分数。
    World prevState = t.prevState;
    World newState = t.newState;
    // 使用策略网络预测旧状态的Q值(如doAction()中所做)
    Matrix<SMatrix> previousStatePrediction = this.policyNetwork.predict(encoding(prevState));
    // 预测newState未来的Q值,即什么样的操作会使得分最大化。
    Matrix<SMatrix> newStatePrediction = this.targetNetwork.predict(encoding(newState));
    // 将所采取行动的值设置为奖励,或奖励 + 折扣因子 * 未来Q值的最大值。
    double[][] previousQValues = previousStatePrediction.rawCopy();
    previousQValues[actionIndex][0]
        = t.isDone ? reward
        : (reward + DISCOUNT_FACTOR * newStatePrediction.max());
    return new NetworkInput<>(encoding(prevState), new SimpleMatrix(previousQValues));
}
/**
 * Choose an epsilon-greedy action: with probability epsilon a uniformly random action
 * (exploration), otherwise the action with the highest Q value predicted by the policy
 * network (exploitation).
 *
 * Fix: epsilon is annealed on EVERY call instead of only inside the exploration branch.
 * In the original, the decay only fired with probability epsilon, so the decay rate was
 * proportional to epsilon itself — annealing slows drastically and the agent keeps taking
 * random (often "forbidden") actions far longer than intended.
 *
 * @return String representation of the chosen epsilon-greedy action.
 */
private String chooseAction() {
    // Anneal the exploration rate once per decision, clamped at the floor.
    this.epsilon = Math.max(this.epsilon - DELTA_EPSILON, MIN_EPSILON);
    if (r.nextDouble() < this.epsilon) {
        // Explore: uniformly random action.
        return ACTIONS[r.nextInt(ACTIONS_SIZE)];
    }
    // Exploit: greedy action under the current policy network.
    Matrix<SMatrix> prediction = this.policyNetwork.predict(encoding(this.state));
    return ACTIONS[prediction.argMax()];
}
/**
 * Draw a mini-batch from replay memory and run one training pass of the
 * policy network over the Bellman-fitted examples.
 */
private void train() {
    // Only train once enough experience has accumulated for a full sample.
    if (this.memory.size() >= MEMORY_SAMPLE_SIZE) {
        List<Transition> sampled = this.memory.sample();
        List<NetworkInput<SMatrix>> examples = new ArrayList<>(sampled.size());
        for (Transition transition : sampled) {
            // Bellman fitting of each transition.
            examples.add(transitionToNetworkInput(transition));
        }
        this.policyNetwork.train(examples, 1);
    }
}
/**
 * Maps a replay-memory Transition onto a supervised training example for the
 * network: the input is the encoded previous state, and the label is the policy
 * network's Q-value vector with the taken action's entry replaced by its
 * Bellman target (Q value fitting).
 *
 * @param t Transition from sample
 *
 * @return network input (data (old state), label (q values))
 */
private NetworkInput<SMatrix> transitionToNetworkInput(Transition t) {
    int actionIndex = ACTION_MAP.get(t.action); // index of the taken action
    // Current Q estimates for the previous state (policy network, as in doAction()).
    Matrix<SMatrix> qPrevious = this.policyNetwork.predict(encoding(t.prevState));
    // Future Q estimates for the successor state (target network).
    Matrix<SMatrix> qNext = this.targetNetwork.predict(encoding(t.newState));
    // Build the label: copy the current predictions, then overwrite the taken
    // action's entry with reward, or reward + discounted best future Q value.
    double[][] label = qPrevious.rawCopy();
    double target = t.reward;
    if (!t.isDone) {
        target += DISCOUNT_FACTOR * qNext.max();
    }
    label[actionIndex][0] = target;
    return new NetworkInput<>(encoding(t.prevState), new SimpleMatrix(label));
}