Cuda idx无法正确索引矩阵

Cuda idx无法正确索引矩阵,cuda,Cuda,我在cuda中有以下内核: __global__ void pagerank(Node *ingoing, Node *outgoing, int N) { int j; int idx = threadIdx.x + blockIdx.x * blockDim.x; if ((idx > 0) && (idx < N)){ //for(j=0;j<N;j++){ // outgoing[j].p_t1=ingo

我在cuda中有以下内核:

// Copies the p_t1 field of one node from `ingoing` to `outgoing`.
// Each thread handles the element at its own global 1D index.
// NOTE(review): the guard rejects idx == 0, so outgoing[0].p_t1 is never written
// by this kernel and keeps whatever value the buffer held before the launch.
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Nothing to do for index 0 or for threads past the end of the array.
    if (idx <= 0 || idx >= N) {
        return;
    }

    outgoing[idx].p_t1 = ingoing[idx].p_t1;
}
但当我这样做的时候,他们是正确的

// Per-thread form: the executing thread copies only the element at index idx.
outgoing[idx].p_t1=ingoing[idx].p_t1;
// Whole-array form: the executing thread copies all N elements by itself.
for(j=0;j<N;j++){
   outgoing[j].p_t1=ingoing[j].p_t1;  
}
当你说"当我这样做时,它们打印 0"时,我假设你指的是所有条目,而不仅仅是索引 0。事实上,你的代码并没有处理索引 0,因为在第一个版本中,条件 `((idx > 0) && (idx < N))` 对 `idx = 0` 不成立。

此外,您的代码中缺少 `Node` 类型的定义。要更好地判断代码中可能出现的错误,这个定义是必需的。

根据编译时 `Node` 的大小、内容以及结构体的打包(packing)方式,主机端的 `Node` 大小可能与设备端的 `Node` 大小不同。可以使用 `printf` 来验证这一点,或者使用调试器。

此外,您似乎没有在内核启动时检查错误。您肯定希望在内核调用之后添加 `cudaPeekAtLastError()`(或 `cudaGetLastError()`)和 `cudaDeviceSynchronize()`,以确保没有发生错误。(CUDA 运行时 API 的其他任何调用也可能返回错误,而您的代码同样没有检查。)

编辑 为了复制,我写了以下内容,尽可能靠近您的代码。我没有足够内存的卡,因此节点数较小

// Per-node record used by the reproduction code.
// The whole struct is copied byte-for-byte between host and device with cudaMemcpy,
// so its layout must be the same on both sides.
typedef struct 
{
    // Not touched by the reproduction code (presumably the previous-iteration value -- TODO confirm).
    double p_t0;
    // The only field the pagerank kernel reads and writes.
    double p_t1;
    // Not touched by the reproduction code.
    double e;
    // Fixed-capacity array of 460 ints; unused here but it dominates the struct size.
    int To_id[460];
    // Not touched by the reproduction code (presumably the number of used To_id slots -- TODO confirm).
    int con_size;
} Node ;

// Copies ingoing[idx].p_t1 to outgoing[idx].p_t1, one element per thread, where
// idx is the thread's global 1D index.
// Threads whose index is 0 or >= N do nothing, so element 0 is left unmodified.
__global__ void pagerank(Node* ingoing, Node* outgoing, int N)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x ;
    const bool inRange = (0 < idx) && (idx < N) ;

    if (inRange)
    {
        outgoing[idx].p_t1 = ingoing[idx].p_t1 ;
    }
}

#include <cstdlib>

// Evaluates the CUDA runtime call `a` exactly once. On failure it prints the file,
// line, numeric error code and error string, then terminates with exit status 1.
// The do { ... } while (0) wrapper makes `cudaCheck(x);` a single statement, so the
// macro is safe inside an unbraced if/else; the argument is parenthesised so that
// any expression can be passed.
#define cudaCheck(a)                                                              \
    do {                                                                          \
        cudaError_t cuerr = (a) ;                                                 \
        if (cuerr != cudaSuccess) {                                               \
            printf("[ERROR @ %s : %d ] : (%d) - %s\n", __FILE__, __LINE__,        \
                   (int)cuerr, cudaGetErrorString(cuerr)) ;                       \
            ::exit(1) ;                                                           \
        }                                                                         \
    } while (0)

// Reproduction driver.
// Fills N host nodes with p_t1 = i + 1, copies them to the device, runs the
// pagerank kernel once, copies the result back, prints the first entries and
// reports every element whose p_t1 differs from i + 1.
// Returns 0 on success, 1 if a host allocation fails; any failing CUDA call
// terminates the program through cudaCheck.
int main()
{
    // int N = 916428 ; // does not fit on my GPU
    const int N = 400000 ;
    const size_t bytes = (size_t)N * sizeof (Node) ;

    int blockSize = 0 ;
    int minGridSize = 0 ;

    // Zero-initialised so that no indeterminate bytes are shipped to the device.
    Node* h_ingoing = (Node*)calloc(N, sizeof *h_ingoing) ;
    Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ;
    if (h_ingoing == NULL || h_outgoing == NULL)
    {
        printf ("Host allocation of %d nodes failed\n", N) ;
        free (h_ingoing) ;
        free (h_outgoing) ;
        return 1 ;
    }

    for (int i = 0 ; i < N ; ++i)
        h_ingoing[i].p_t1 = (double)i+1;

    Node* d_ingoing = NULL ;
    Node* d_outgoing = NULL ;

    cudaCheck (cudaMalloc(&d_ingoing, bytes));
    cudaCheck (cudaMalloc(&d_outgoing, bytes));

    cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, bytes, cudaMemcpyHostToDevice));
    cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, bytes, cudaMemcpyHostToDevice));

    // The last argument is the block size limit (0 = no limit), not the element count.
    cudaCheck (cudaOccupancyMaxPotentialBlockSize (&minGridSize, &blockSize, pagerank, 0, 0)) ;
    const int gridSize = (N + blockSize - 1) / blockSize ;   // ceil-div: one thread per node

    printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ;

    cudaEvent_t begin, end ;
    cudaCheck (cudaEventCreate (&begin)) ;
    cudaCheck (cudaEventCreate (&end)) ;

    cudaCheck (cudaEventRecord (begin, 0)) ;

    pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ;
    // A launch does not return an error itself; configuration errors surface here.
    cudaCheck (cudaGetLastError ()) ;

    cudaCheck (cudaEventRecord (end, 0)) ;
    // Waiting on the event also surfaces errors raised while the kernel was running.
    cudaCheck (cudaEventSynchronize (end)) ;

    float elapsedMs = 0.0f ;
    cudaCheck (cudaEventElapsedTime (&elapsedMs, begin, end)) ;
    printf ("Kernel time = %f ms\n", elapsedMs) ;

    cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, bytes, cudaMemcpyDeviceToHost)) ;

    for (int i = 0 ; i < 100 && i < N ; ++i)
    {
        printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ;
    }

    // Every element is compared against i + 1; an element the kernel did not write
    // still holds the zero it was initialised with and is therefore reported.
    for (int i = 0  ; i < N ; ++i)
    {
        if (h_outgoing[i].p_t1 != (double)(i+1))
            printf ("Error @ %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1));
    }

    cudaCheck (cudaEventDestroy (begin)) ;
    cudaCheck (cudaEventDestroy (end)) ;
    cudaCheck (cudaFree (d_ingoing)) ;
    cudaCheck (cudaFree (d_outgoing)) ;
    free (h_ingoing) ;
    free (h_outgoing) ;

    return 0 ;
}
typedef结构
{
双p_t0;
双p_t1;
双e;
int至_id[460];
int con_尺寸;
}节点;
__全局无效pagerank(节点*输入,节点*输出,整数N)
{
int idx=threadIdx.x+blockIdx.x*blockDim.x;
如果((idx>0)和&(idx\n”,网格大小,块大小);
cudaCheck(cudaEventCreate(&begin));
cudaCheck(cudaEventCreate(&end));
cudaCheck(cudaEventRecord(begin,0));
pagerank>(d_输入,d_输出,N);
cudaCheck(cudaventrecord(end,0));
cudaCheck(cudaEventSynchronize(end));
cudaCheck(cudaMemcpy(h_传出、d_传出、N*sizeof*h_传出、cudaMemcpyDeviceToHost));
对于(int i=0;i<100;++i)
{
printf(“P_t1[%d]=%f\n”,i,h_传出[i].P_t1);
}
对于(int i=0;i

除了索引0(答案初稿中指出存在问题)之外,每个输出都是正确的。

您遇到了什么错误?我不是活的编译器……我想我理解这个问题,不过没有调用 `__global__` 内核的代码,很难知道发生了什么。我已经贴出了完整代码,您现在可以解释一下吗?谢谢。我在下面添加了 `Node` 的定义。这会改变什么吗?`Node` 是一个很大的结构体。您没有得到任何运行时错误吗?实际上 `Node` 数组的总大小(共 916428 个节点)约为 1.7 GB。我在 Tesla K20m 上运行。不,我没有收到任何错误,至少在不使用调试工具的情况下没有。我更新了回答,尝试复现这个问题。您最后那条评论是什么意思?我的意思是,在不使用调试工具(例如 cuda-memcheck)的情况下,我没有收到任何错误。我稍后会用调试工具试一下,再告诉您结果。
/******************** Includes - Defines ****************/
#include "pagerank_serial.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <sys/time.h>
#include <fcntl.h>
#include <cuda.h>
#include "string.h"

/******************** Defines ****************/
// Number of nodes; set from argv[2] in main()
int N;

// Convergence threshold and algorithm's parameter d; set from argv[3] and argv[4] in main()
double threshold, d;

// Table of per-node data; allocated in main() with N entries
Node *Nodes;

// Copies the p_t1 field of every node from `ingoing` to `outgoing`.
//
// The previous version made every thread with 0 < idx < N copy all N elements,
// i.e. O(N^2) global-memory traffic with all threads writing the same locations.
// Here the elements are divided among the threads with a grid-stride loop: the
// final contents of outgoing[0..N-1].p_t1 are the same, each element is written
// exactly once, and the result is correct for any 1D launch configuration.
//
// ingoing, outgoing: device pointers to at least N nodes each.
// N: number of nodes; values <= 0 make the kernel a no-op.
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
    const size_t count  = (N > 0) ? (size_t)N : 0;
    // size_t arithmetic avoids overflow of blockIdx.x * blockDim.x on large grids.
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t j = (size_t)blockIdx.x * blockDim.x + threadIdx.x; j < count; j += stride) {
        outgoing[j].p_t1 = ingoing[j].p_t1;
    }
}
/***** Read graph connections from txt file *****/  

/*
 * Reads graph edges ("from<TAB>to" pairs of integers, one per line) from
 * `filename` and appends each `to` index to Nodes[from].To_id, incrementing
 * Nodes[from].con_size.
 *
 * - If the file cannot be opened, an error is printed and nothing is read.
 * - Reading stops at end of file or at the first line that does not parse as
 *   two integers (the old feof()/fscanf() loop treated EOF as a successful read
 *   and could spin forever on a malformed line).
 * - Edges whose source index is outside [0, N) or whose node already has a full
 *   To_id array are skipped with a message instead of writing out of bounds.
 *
 * Relies on the globals Nodes and N being set up by the caller.
 */
void Read_from_txt_file(char* filename)
{
    FILE *fid = fopen(filename, "r");
    if (fid == NULL) {
        printf("Error opening data file\n");
        return;
    }

    /* Capacity of the per-node adjacency array. This requires To_id to be a
     * fixed-size array member (int To_id[460] in the Node definition), not a pointer. */
    const int capacity = (int)(sizeof Nodes[0].To_id / sizeof Nodes[0].To_id[0]);

    int from_idx, to_idx;

    while (fscanf(fid, "%d\t%d\n", &from_idx, &to_idx) == 2) {
        if (from_idx < 0 || from_idx >= N) {
            printf("Skipping edge %d -> %d: source index out of range\n", from_idx, to_idx);
            continue;
        }
        if (Nodes[from_idx].con_size >= capacity) {
            printf("Skipping edge %d -> %d: node already has %d connections\n",
                   from_idx, to_idx, capacity);
            continue;
        }

        Nodes[from_idx].To_id[Nodes[from_idx].con_size] = to_idx;
        Nodes[from_idx].con_size++;
    }

    fclose(fid);
}

/***** Read P vector from txt file*****/    

/*
 * Reads the P vector from "P.txt" (one double per line) into Nodes[i].p_t1,
 * in file order starting at i = 0.
 *
 * - The format string previously contained stray text (" double sum = 0;%lf\n"),
 *   so no value could ever be matched; it is now "%lf\n", as in the E reader.
 * - If the file cannot be opened, an error is printed and nothing is read.
 * - Reading stops at end of file, at the first token that is not a number, or
 *   after N values, so Nodes is never written past its end.
 *
 * Relies on the globals Nodes and N being set up by the caller.
 */
void Read_P_from_txt_file()
{
    FILE *fid = fopen("P.txt", "r");
    if (fid == NULL) {
        printf("Error opening the Probabilities file\n");
        return;
    }

    double temp_P;
    int index = 0;

    /* P's values are double! */
    while (index < N && fscanf(fid, "%lf\n", &temp_P) == 1) {
        Nodes[index].p_t1 = temp_P;
        index++;
    }

    fclose(fid);
}


/***** Read E vector from txt file*****/    

/*
 * Reads the E vector from "E.txt" (one double per line) into Nodes[i].e,
 * in file order starting at i = 0.
 *
 * - If the file cannot be opened, an error is printed and nothing is read
 *   (previously execution continued and used the NULL stream).
 * - Reading stops at end of file, at the first token that is not a number, or
 *   after N values, so Nodes is never written past its end. The old
 *   feof()/fscanf() loop treated an EOF return as a successful read.
 *
 * Relies on the globals Nodes and N being set up by the caller.
 */
void Read_E_from_txt_file()
{
    FILE *fid = fopen("E.txt", "r");
    if (fid == NULL) {
        printf("Error opening the E file\n");
        return;
    }

    double temp_E;
    int index = 0;

    /* E's values are double! */
    while (index < N && fscanf(fid, "%lf\n", &temp_E) == 1) {
        Nodes[index].e = temp_E;
        index++;
    }

    fclose(fid);
}

/***** Create P and E with equal probability *****/

/*
 * Initialises P and E with equal probability: for every node p_t0 = 0,
 * p_t1 = 1/N and e = 1/N, then asserts that each vector sums to 1.
 *
 * The old checks were written `assert(sum = 1)`, an assignment that is always
 * true. They now compare against 1 with a small tolerance, because adding 1/N
 * N times in floating point is generally not exactly 1.
 *
 * Relies on the globals Nodes and N; does nothing when N <= 0.
 */
void Random_P_E()
{
    /* Allowed absolute deviation of each sum from 1. */
    const double tolerance = 1e-6;

    int i;
    /* Sum of P (it must be = 1) */
    double sum_P_1 = 0;
    /* Sum of E (it must be = 1) */
    double sum_E_1 = 0;

    if (N <= 0) {
        return;
    }

    /* Arrays initialization */
    for (i = 0; i < N; i++)
    {
        Nodes[i].p_t0 = 0;
        Nodes[i].p_t1 = 1.0 / N;
        sum_P_1 = sum_P_1 + Nodes[i].p_t1;

        Nodes[i].e = 1.0 / N;
        sum_E_1 = sum_E_1 + Nodes[i].e;
    }

    /* Abort if the probabilities do not sum to 1 (within tolerance). */
    assert(fabs(sum_P_1 - 1.0) < tolerance);
    assert(fabs(sum_E_1 - 1.0) < tolerance);
}


/***** Main function *****/   

/* Prints a diagnostic and terminates when a CUDA runtime call did not succeed. */
static void check_cuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Program entry point.
 *
 * Usage: <program> <graph filename> <N> <threshold> <d>
 *
 * Loads the graph, initialises the probabilities, copies the node table to the
 * device, runs the pagerank kernel once, copies the result back and prints the
 * kernel time and the first (up to 100) p_t1 values.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on bad arguments or a failed
 * host allocation; a failing CUDA call terminates the program via check_cuda().
 */
int main(int argc, char** argv)
{
    int blockSize = 0;      /* Block size returned by the launch configurator */
    int minGridSize = 0;    /* Minimum grid size for maximum occupancy (not used further) */
    int gridSize;           /* Actual grid size needed, based on input size */
    int i;

    /* Check input arguments */
    if (argc < 5)
    {
        printf("Error in arguments! Four arguments required: graph filename, N, threshold and d\n");
        return (EXIT_FAILURE);
    }

    /* Get arguments. argv[1] is used directly as the file name: copying it into
     * a fixed 256-byte buffer with strcpy could overflow on a long path. */
    N = atoi(argv[2]);
    threshold = atof(argv[3]);
    d = atof(argv[4]);

    if (N <= 0)
    {
        printf("Error in arguments! N must be a positive integer\n");
        return (EXIT_FAILURE);
    }

    const size_t bytes = (size_t)N * sizeof(Node);

    /* Allocate zeroed memory for N nodes (so con_size starts at 0 for every node). */
    Nodes = (Node*) calloc(N, sizeof *Nodes);
    Node *h_outgoing = (Node*) calloc(N, sizeof *h_outgoing);
    if (Nodes == NULL || h_outgoing == NULL)
    {
        printf("Error allocating host memory for %d nodes\n", N);
        free(Nodes);
        free(h_outgoing);
        return (EXIT_FAILURE);
    }

    Read_from_txt_file(argv[1]);

    /* Set equal probabilities */
    Random_P_E();

    Node *h_ingoing = Nodes;
    Node *d_ingoing = NULL;
    Node *d_outgoing = NULL;

    check_cuda(cudaMalloc(&d_ingoing, bytes), "cudaMalloc(d_ingoing)");
    check_cuda(cudaMalloc(&d_outgoing, bytes), "cudaMalloc(d_outgoing)");

    check_cuda(cudaMemcpy(d_ingoing, h_ingoing, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_ingoing)");
    check_cuda(cudaMemcpy(d_outgoing, h_outgoing, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_outgoing)");

    /* The last argument is the block size limit (0 = no limit), not the element count. */
    check_cuda(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, 0),
               "cudaOccupancyMaxPotentialBlockSize");

    /* Round up according to array size */
    gridSize = (N + blockSize - 1) / blockSize;
    printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize);

    cudaEvent_t begin, end;
    check_cuda(cudaEventCreate(&begin), "cudaEventCreate(begin)");
    check_cuda(cudaEventCreate(&end), "cudaEventCreate(end)");
    check_cuda(cudaEventRecord(begin, 0), "cudaEventRecord(begin)");

    /* The kernel takes exactly three parameters. threshold and d are parsed
     * above but not consumed by the kernel, so they are not passed. */
    pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N);
    /* A launch returns no status itself; configuration errors surface here. */
    check_cuda(cudaGetLastError(), "pagerank launch");

    check_cuda(cudaEventRecord(end, 0), "cudaEventRecord(end)");
    /* Waiting on the event also reports errors raised while the kernel ran. */
    check_cuda(cudaEventSynchronize(end), "cudaEventSynchronize(end)");

    float elapsed_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&elapsed_ms, begin, end), "cudaEventElapsedTime");

    check_cuda(cudaMemcpy(h_outgoing, d_outgoing, bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(h_outgoing)");

    printf("%f\n", elapsed_ms);

    printf("\n");

    /* Print final probabilities (at most the first 100, never past the array end) */
    for (i = 0; i < 100 && i < N; i++)
    {
        printf("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1);
    }
    printf("\n");

    /* Release device and host resources */
    check_cuda(cudaEventDestroy(begin), "cudaEventDestroy(begin)");
    check_cuda(cudaEventDestroy(end), "cudaEventDestroy(end)");
    check_cuda(cudaFree(d_ingoing), "cudaFree(d_ingoing)");
    check_cuda(cudaFree(d_outgoing), "cudaFree(d_outgoing)");
    free(h_outgoing);
    free(Nodes);
    Nodes = NULL;

    printf("End of program!\n");

    return (EXIT_SUCCESS);
}
// Per-node record used by the reproduction code.
// The whole struct is copied byte-for-byte between host and device with cudaMemcpy,
// so its layout must be the same on both sides.
typedef struct 
{
    // Not touched by the reproduction code (presumably the previous-iteration value -- TODO confirm).
    double p_t0;
    // The only field the pagerank kernel reads and writes.
    double p_t1;
    // Not touched by the reproduction code.
    double e;
    // Fixed-capacity array of 460 ints; unused here but it dominates the struct size.
    int To_id[460];
    // Not touched by the reproduction code (presumably the number of used To_id slots -- TODO confirm).
    int con_size;
} Node ;

// One thread per node: the thread with global 1D index idx copies
// ingoing[idx].p_t1 into outgoing[idx].p_t1.
// Index 0 and indices >= N are rejected by the guard, so element 0 is not written.
__global__ void pagerank(Node* ingoing, Node* outgoing, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x ;

    if (idx <= 0 || idx >= N)
        return ;

    outgoing[idx].p_t1 = ingoing[idx].p_t1 ;
}

#include <cstdlib>

// Evaluates the CUDA runtime call `a` exactly once. On failure it prints the file,
// line, numeric error code and error string, then terminates with exit status 1.
// The do { ... } while (0) wrapper makes `cudaCheck(x);` a single statement, so the
// macro is safe inside an unbraced if/else; the argument is parenthesised so that
// any expression can be passed.
#define cudaCheck(a)                                                              \
    do {                                                                          \
        cudaError_t cuerr = (a) ;                                                 \
        if (cuerr != cudaSuccess) {                                               \
            printf("[ERROR @ %s : %d ] : (%d) - %s\n", __FILE__, __LINE__,        \
                   (int)cuerr, cudaGetErrorString(cuerr)) ;                       \
            ::exit(1) ;                                                           \
        }                                                                         \
    } while (0)

// Reproduction driver.
// Fills N host nodes with p_t1 = i + 1, copies them to the device, runs the
// pagerank kernel once, copies the result back, prints the first entries and
// reports every element whose p_t1 differs from i + 1.
// Returns 0 on success, 1 if a host allocation fails; any failing CUDA call
// terminates the program through cudaCheck.
int main()
{
    // int N = 916428 ; // does not fit on my GPU
    const int N = 400000 ;
    const size_t bytes = (size_t)N * sizeof (Node) ;

    int blockSize = 0 ;
    int minGridSize = 0 ;

    // Zero-initialised so that no indeterminate bytes are shipped to the device.
    Node* h_ingoing = (Node*)calloc(N, sizeof *h_ingoing) ;
    Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ;
    if (h_ingoing == NULL || h_outgoing == NULL)
    {
        printf ("Host allocation of %d nodes failed\n", N) ;
        free (h_ingoing) ;
        free (h_outgoing) ;
        return 1 ;
    }

    for (int i = 0 ; i < N ; ++i)
        h_ingoing[i].p_t1 = (double)i+1;

    Node* d_ingoing = NULL ;
    Node* d_outgoing = NULL ;

    cudaCheck (cudaMalloc(&d_ingoing, bytes));
    cudaCheck (cudaMalloc(&d_outgoing, bytes));

    cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, bytes, cudaMemcpyHostToDevice));
    cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, bytes, cudaMemcpyHostToDevice));

    // The last argument is the block size limit (0 = no limit), not the element count.
    cudaCheck (cudaOccupancyMaxPotentialBlockSize (&minGridSize, &blockSize, pagerank, 0, 0)) ;
    const int gridSize = (N + blockSize - 1) / blockSize ;   // ceil-div: one thread per node

    printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ;

    cudaEvent_t begin, end ;
    cudaCheck (cudaEventCreate (&begin)) ;
    cudaCheck (cudaEventCreate (&end)) ;

    cudaCheck (cudaEventRecord (begin, 0)) ;

    pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ;
    // A launch does not return an error itself; configuration errors surface here.
    cudaCheck (cudaGetLastError ()) ;

    cudaCheck (cudaEventRecord (end, 0)) ;
    // Waiting on the event also surfaces errors raised while the kernel was running.
    cudaCheck (cudaEventSynchronize (end)) ;

    float elapsedMs = 0.0f ;
    cudaCheck (cudaEventElapsedTime (&elapsedMs, begin, end)) ;
    printf ("Kernel time = %f ms\n", elapsedMs) ;

    cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, bytes, cudaMemcpyDeviceToHost)) ;

    for (int i = 0 ; i < 100 && i < N ; ++i)
    {
        printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ;
    }

    // Every element is compared against i + 1; an element the kernel did not write
    // still holds the zero it was initialised with and is therefore reported.
    for (int i = 0  ; i < N ; ++i)
    {
        if (h_outgoing[i].p_t1 != (double)(i+1))
            printf ("Error @ %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1));
    }

    cudaCheck (cudaEventDestroy (begin)) ;
    cudaCheck (cudaEventDestroy (end)) ;
    cudaCheck (cudaFree (d_ingoing)) ;
    cudaCheck (cudaFree (d_outgoing)) ;
    free (h_ingoing) ;
    free (h_outgoing) ;

    return 0 ;
}