
DQN learning algorithm with a toy neural network in Java


I am writing a DQN agent for a Wumpus World game. I used this to implement it, and I also use a target/policy network combination. I decided to use my own toy neural network implementation (as the assignment requires). I know training could take a long time, but I want to assert that the code is correct first, so that I do not waste time training the network with a flawed algorithm.
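
For reference, the target/policy combination in DQN is usually kept in sync by periodically copying the policy network's weights into the target network, e.g. every fixed number of training steps. A minimal, standalone sketch of that pattern follows; the Net class, its getWeights/setWeights methods, and the sync interval are stand-ins chosen for illustration, since the toy network's API is not shown here.

import java.util.Arrays;
import java.util.Random;

// Minimal stand-in network: just a flat weight vector (illustrative only).
class Net {
    private double[] weights;

    Net(int size) {
        this.weights = new double[size];
    }

    double[] getWeights() {
        return weights.clone();
    }

    void setWeights(double[] w) {
        this.weights = w.clone();
    }
}

// Sketch of the usual DQN target-network synchronisation step.
class TargetSyncSketch {
    private static final int SYNC_EVERY = 100; // hypothetical sync interval

    private final Net policyNetwork = new Net(10);
    private final Net targetNetwork = new Net(10);
    private int trainSteps = 0;

    // Call once per training step; copies policy -> target every SYNC_EVERY steps.
    void afterTrainStep() {
        trainSteps++;
        if (trainSteps % SYNC_EVERY == 0) {
            targetNetwork.setWeights(policyNetwork.getWeights());
        }
    }

    public static void main(String[] args) {
        TargetSyncSketch sketch = new TargetSyncSketch();
        Random r = new Random(42);
        for (int step = 0; step < 250; step++) {
            // Pretend a gradient update changed one policy weight this step.
            double[] w = sketch.policyNetwork.getWeights();
            w[r.nextInt(w.length)] += r.nextGaussian();
            sketch.policyNetwork.setWeights(w);
            sketch.afterTrainStep();
        }
        // Prints false: the last sync was at step 200, so the target lags the
        // policy by the 50 updates made since then.
        System.out.println(Arrays.equals(
                sketch.policyNetwork.getWeights(), sketch.targetNetwork.getWeights()));
    }
}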

When I run this, the agent performs worse than a random walker, and I am really at a loss. Any help would be greatly appreciated.

The network seems to prefer "forbidden" actions, such as climbing when it is not in a pit, or grabbing gold in squares that contain none.

Here is the code.

The constructor:

Choosing an action:

/**
 * Do an epsilon-delta action, either a random action or let the network decide.
 *
 * @return String representation of an epsilon-delta action.
 */
private String chooseAction() {
    if (r.nextDouble() < this.epsilon) {
        this.epsilon -= DELTA_EPSILON;
        this.epsilon = Math.max(this.epsilon, MIN_EPSILON);
        return ACTIONS[r.nextInt(ACTIONS_SIZE)];
    } else {
        Matrix<SMatrix> prediction = this.policyNetwork.predict(encoding(this.state));
        int argMax = prediction.argMax();
        return ACTIONS[argMax];
    }
}
And finally, the learning:

/**
 * Train the network with a sample of the memory.
 */
private void train() {
    if (this.memory.size() < MEMORY_SAMPLE_SIZE) {
        return; // Nothing to train yet.
    }

    List<Transition> batch = this.memory.sample();

    List<NetworkInput<SMatrix>> training = new ArrayList<>();
    // Bellman fitting
    for (Transition b : batch) {
        training.add(transitionToNetworkInput(b));
    }

    this.policyNetwork.train(training, 1);
}
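
The memory field above is only used through size() and sample(); in DQN this is typically a bounded replay buffer that is sampled uniformly at random. Below is a minimal standalone sketch of such a buffer; the class name, capacity, and sample size are illustrative and not taken from the question's code.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.List;

// Minimal uniform-sampling replay buffer; T stands in for the Transition type.
class ReplayMemory<T> {
    private final Deque<T> buffer = new ArrayDeque<>();
    private final int capacity;

    ReplayMemory(int capacity) {
        this.capacity = capacity;
    }

    // Adds a transition, evicting the oldest one once the buffer is full.
    void add(T transition) {
        if (buffer.size() == capacity) {
            buffer.removeFirst();
        }
        buffer.addLast(transition);
    }

    int size() {
        return buffer.size();
    }

    // Returns a uniformly random sample of at most sampleSize stored transitions.
    List<T> sample(int sampleSize) {
        List<T> copy = new ArrayList<>(buffer);
        Collections.shuffle(copy);
        return copy.subList(0, Math.min(sampleSize, copy.size()));
    }

    public static void main(String[] args) {
        ReplayMemory<Integer> memory = new ReplayMemory<>(1000);
        for (int i = 0; i < 50; i++) {
            memory.add(i); // stand-in "transitions"
        }
        System.out.println(memory.sample(10)); // 10 distinct random entries
    }
}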
The transitionToNetworkInput method is defined as:

/**
 * Simple mapper which takes as input a Transition and outputs a NetworkInput for the neural
 * network to train on. Uses the Bellman equation to perform Q value fitting.
 *
 * @param t Transition from sample
 *
 * @return network input (data (old state), label (q values))
 */
private NetworkInput<SMatrix> transitionToNetworkInput(Transition t) {
    // Raw data from transition.
    String action = t.action;
    int actionIndex = ACTION_MAP.get(action); // possible actions for this game
    double reward = t.reward; // score of the game at this state.
    World prevState = t.prevState;
    World newState = t.newState;

    // Predict Q values of old state with policy network (as done in doAction())
    Matrix<SMatrix> previousStatePrediction = this.policyNetwork.predict(encoding(prevState));

    // Predict future Q values of the newState, i.e., what action WOULD have maximised score.
    Matrix<SMatrix> newStatePrediction = this.targetNetwork.predict(encoding(newState));

    // Set the value of the action taken to be either the reward or the reward + discount factor * max of future q states.
    double[][] previousQValues = previousStatePrediction.rawCopy();
    previousQValues[actionIndex][0]
        = t.isDone ? reward
        : (reward + DISCOUNT_FACTOR * newStatePrediction.max());
    
    return new NetworkInput<>(encoding(prevState), new SimpleMatrix(previousQValues));
}
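
The label built above can be checked by hand: for a non-terminal transition, the entry for the chosen action becomes reward + DISCOUNT_FACTOR * max_a Q_target(newState, a), while for a terminal transition it is the reward alone. A small standalone sketch of that arithmetic on made-up numbers (the discount factor and rewards here are illustrative, not the question's values):

// Hand-checkable sketch of the Bellman target used in transitionToNetworkInput.
class BellmanTargetSketch {
    private static final double DISCOUNT_FACTOR = 0.9; // illustrative value

    // Target Q value for the action actually taken in the transition.
    static double target(double reward, double maxFutureQ, boolean isDone) {
        return isDone ? reward : reward + DISCOUNT_FACTOR * maxFutureQ;
    }

    public static void main(String[] args) {
        // Non-terminal step: -1 step penalty, best future Q of 10 -> -1 + 0.9 * 10 = 8.0
        System.out.println(target(-1.0, 10.0, false));
        // Terminal step: only the final reward is used -> 100.0
        System.out.println(target(100.0, 10.0, true));
    }
}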