cudaMemcpy returns success but does not copy anything


Here is what I have verified with cuda-gdb:

  • The contents of src are correct
  • cudaMalloc, malloc, and the file I/O all succeed
  • cudaMemcpy returns cudaSuccess
  • The problematic cudaMemcpy call does not raise any error or exception
  • The destination was allocated successfully (cudaMalloc)
  • Below are the relevant parts of the code: wavenet_server.cc mallocs the source buffers, copies the data from files into them, and calls make_wavenet; wavenet_expert.cu calls the MyWaveNet constructor and then calls setEmbeddings.

    wavenet_server.cc:

    #include "wavenet_infer.h"
    void readArrayFromBinary(void* array, size_t len, size_t num_bytes_per_elem, const char* file_name) {
      FILE* file = fopen(file_name, "rb");
      fread(array, num_bytes_per_elem, len, file);
      fclose(file);
    }
    
    void setEmbeddingCurr(const char* fileName,  size_t len) {
          this->embedding_curr = (float*)malloc(sizeof(float) * len);
          readArrayFromBinary((void*)this->embedding_curr, len, sizeof(float), fileName);
        }
    
    void setWavenet(void) {
          this->wavenet = make_wavenet(this->num_samples,
                                      this->batch_size,
                                      this->embedding_prev,
                                      this->embedding_curr,
                                      this->num_layers,
                                      this->max_dilation,
                                      this->dilate_weights_prev,
                                      this->dilate_weights_curr,
                                      this->dilate_biases,
                                      this->res_weights,
                                      this->res_biases,
                                      this->skip_weights,
                                      this->skip_biases,
                                      this->conv_out,
                                      this->conv_end,
                                      this->is_using_embed_tanh,
                                      this->implementation);
        }
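
    A note on readArrayFromBinary above: it never checks the return values of fopen or fread, so a missing file or a short read would leave the host buffer partly uninitialized while everything downstream still reports success. A minimal checked variant (a sketch, same signature as above) could look like this:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    void readArrayFromBinary(void* array, size_t len, size_t num_bytes_per_elem, const char* file_name) {
      FILE* file = fopen(file_name, "rb");
      if (!file) {  // fail loudly instead of reading from a null handle
        fprintf(stderr, "could not open %s\n", file_name);
        exit(EXIT_FAILURE);
      }
      size_t num_read = fread(array, num_bytes_per_elem, len, file);
      if (num_read != len) {  // catch short reads before the data ever reaches cudaMemcpy
        fprintf(stderr, "expected %zu elements from %s, got %zu\n", len, file_name, num_read);
        exit(EXIT_FAILURE);
      }
      fclose(file);
    }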
    
    wavenet_expert.cu:

    #include "nv_wavenet.cuh"
    typedef nvWavenetInfer<float,float, R, S, A> MyWaveNet;
    void* make_wavenet(int sample_count,
                       int batch_size,
                       float* embedding_prev,
                       float* embedding_curr,
                       int num_layers,
                       int max_dilation,
                       float** in_layer_weights_prev,
                       float** in_layer_weights_curr,
                       float** in_layer_biases,
                       float** res_layer_weights,
                       float** res_layer_biases,
                       float** skip_layer_weights,
                       float** skip_layer_biases,
                       float* conv_out_weight,
                       float* conv_end_weight,
                       bool use_embed_tanh,
                       int implementation) {
    MyWaveNet* wavenet = new MyWaveNet(num_layers, max_dilation, batch_size, sample_count,
                                       implementation, use_embed_tanh);
    
        wavenet->setEmbeddings(embedding_prev, embedding_curr);
    
        // We didn't use biases on our outputs
        std::vector<float> dummy_bias_first(S, 0);
        std::vector<float> dummy_bias_second(A, 0);
    
        wavenet->setOutWeights(conv_out_weight,
                               dummy_bias_first.data(),
                               conv_end_weight,
                               dummy_bias_second.data());
    
        for (int l = 0; l < num_layers; l++) {
            wavenet->setLayerWeights(l, in_layer_weights_prev[l],
                                        in_layer_weights_curr[l],
                                        in_layer_biases[l],
                                        res_layer_weights[l],
                                        res_layer_biases[l],
                                        skip_layer_weights[l],
                                        skip_layer_biases[l]);
        }
    
        return (void*)wavenet;
    }
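
    A sanity check that does not depend on cuda-gdb at all is to round-trip a small buffer through the device right where the embeddings are set: copy host data to the device, copy it straight back, and compare. The helper below is a hypothetical sketch (verifyHostToDeviceCopy is not part of nv_wavenet); it mirrors what setEmbeddings ultimately does through cudaMemcpy:

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    // Hypothetical helper: returns true if n floats survive a host -> device -> host round trip.
    bool verifyHostToDeviceCopy(const float* src, size_t n) {
        float* d_buf = nullptr;
        if (cudaMalloc(&d_buf, n * sizeof(float)) != cudaSuccess) return false;

        std::vector<float> readback(n, 0.0f);
        bool ok = cudaMemcpy(d_buf, src, n * sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess
               && cudaMemcpy(readback.data(), d_buf, n * sizeof(float), cudaMemcpyDeviceToHost) == cudaSuccess;
        cudaFree(d_buf);

        // Exact comparison is fine here: a memcpy must reproduce the bytes bit for bit.
        for (size_t i = 0; ok && i < n; ++i) ok = (readback[i] == src[i]);
        return ok;
    }

    Called with, say, embedding_prev and a small count right after setEmbeddings, a false result would point at the transfer itself, while a true result (as turned out to be the case here) points at the inspection method instead.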
    

    It turned out that cudaMemcpy was not the problem. When inspecting device global memory with cuda-gdb, you cannot use:
    x/10fw float_array
    It will show incorrect values. To inspect the memory, try something like the following:
    p ((@global float*)float_array)[0]@10
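
    For example, when stopped inside setActivation right after the cudaMemcpy (dst and src are the parameter names used in the code further down), the two ends of the copy can be compared like this:

    # Host-side source buffer (plain host memory, no cast needed):
    (cuda-gdb) p src[0]@10
    # Device-side destination; the @global cast is what makes the read come out right:
    (cuda-gdb) p ((@global float*)dst)[0]@10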

    For reference, here are the nvWavenetInfer constructor and the setEmbeddings / setActivation helpers from nv_wavenet.cuh that perform the copy:
    nvWavenetInfer (int numLayers, int maxDilation, int batchSize, int numSamples, int impl=0, bool tanhEmbed=true) : m_numLayers(numLayers), m_maxBatch(batchSize), m_maxSamples(numSamples), m_implementation((nvWavenetInfer::Implementation)impl), m_tanhEmbed(tanhEmbed) {
    
    
                m_maxDilation = maxDilation;
    
                /*
                gpuErrChk(cudaMalloc(&m_yOut, numSamples*batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
                gpuErrChk(cudaMemset(m_yOut, 0, numSamples*batchSize*sizeof(int)));
                */
                gpuErrChk(cudaMalloc(&m_outputSelectors, numSamples*batchSize*sizeof(float)));
    
                gpuErrChk(cudaMalloc(&m_embedPrev, A*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_embedCur, A*R*sizeof(T_data)));
    
                gpuErrChk(cudaMalloc(&m_Wprev, numLayers*2*R*R*sizeof(T_weight)));
                gpuErrChk(cudaMalloc(&m_Wcur, numLayers*2*R*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_Bh, numLayers*2*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_Lh, numSamples*numLayers*batchSize*2*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_Wres, numLayers*R*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_Bres, numLayers*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_Wskip, numLayers*S*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_Bskip, numLayers*S*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_XtOut, numLayers*R*batchSize*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_skipOut, numLayers*S*batchSize*sizeof(T_data)));
    
                // For now, just burn memory as though all layers had the maximum dilation value
                gpuErrChk(cudaMalloc(&m_XtIn, (m_maxDilation+1)*(numLayers+1)*R*batchSize*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_hOut, numLayers*batchSize*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_aPrev, numLayers*batchSize*2*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_skipIn, numLayers*S*batchSize*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_skipOutFinalAccumulate, A*batchSize*S/R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_outAccumulate, A*batchSize*A/R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_yInPrev, batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
                gpuErrChk(cudaMalloc(&m_yInCur, batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
    
                gpuErrChk(cudaMalloc(&m_WskipOut, A*S*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_BskipOut, A*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_Wout, A*A*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_Bout, A*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_skipOutFinal, A*batchSize*S/R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_out, A*batchSize*A/R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_p, A*batchSize*sizeof(T_data)));
    
                gpuErrChk(cudaMalloc(&m_h, numLayers*batchSize*R*sizeof(T_data)));
                gpuErrChk(cudaMalloc(&m_hSample, numLayers*batchSize*sizeof(int)));
                gpuErrChk(cudaMalloc(&m_ySample, batchSize*sizeof(int)));
    
                if (impl == PERSISTENT) {
                    gpuErrChk(cudaMalloc(&m_skipOutFinalAccumulate, A*batchSize*S/R*sizeof(T_data)));
                    gpuErrChk(cudaMalloc(&m_outAccumulate, A*batchSize*A/R*sizeof(T_data)));
                }
    
            }
    virtual void setEmbeddings (float* embedPrev, float* embedCur) {
                setActivation(m_embedPrev, embedPrev, A*R);
                setActivation(m_embedCur, embedCur, A*R);
            }
    void setActivation(float* dst, float* src, size_t size) {
                gpuErrChk(cudaMemcpy(dst, src, size*sizeof(float), cudaMemcpyHostToDevice));
            }