Computing the distance between every two points in CUDA


Here I want to compute the distance between every two points and decide whether they are neighbors. This is my simple code in CUDA:

__global__ void calcNeighbors(const DataPoint* points,
  const float doubleRadius, bool* neighbors) {

int tid = threadIdx.x + blockIdx.x * blockDim.x;

float dis = 0.0f;
while (tid < N) {                    // grid-stride loop over the points
   DataPoint p1 = points[tid];
   for (int i=0; i<N; i++) {         // compare p1 against every point
       DataPoint p2 = points[i];
       // squared 3D distance between p1 and p2
       dis = 0;
       dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
           (p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
           (p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
       // threshold test on the squared distance
       if (dis <= doubleRadius) {
           neighbors[tid*N+i] = true;
       } else {
           neighbors[tid*N+i] = false;
       }
   }

   tid += blockDim.x * gridDim.x;
}
}
So here I want to reduce the time; what should I do? I have tried memory coalescing and shared memory, but I did not get a good speedup.

=================== Using shared memory ==================

__global__ void calcNeighbors2(const DataPoint* points, 
  const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[threadsPerBlock];

int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
    sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();

int tid = threadIdx.x;
float dis;
while (tid < N) {
    DataPoint p1 = points[tid];
    for (int i=start; i<len; i++) {
       dis = 0;
       dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) + 
           (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) + 
           (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
       if (dis <= doubleRadius) {
           neighbors[i*N+tid] = true;
       } else {
           neighbors[i*N+tid] = false;
       }

    }

    tid += blockDim.x;
}
}
There are at least 4 ideas, some of which have already been stated in the comments:

  • Convert your point data storage from AoS format:

    struct DataPoint {
      float pfDimens[3];
    };
    
    to SoA format:

    struct DataPoint {
      float pfDimens_x[NPTS];
      float pfDimens_y[NPTS];
      float pfDimens_z[NPTS];
    };
    
    This will enable fully coalesced loads of the data. In fact, to help with item 4 below, I would switch to using 3 bare arrays rather than a structure.
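    As a sketch of the access-pattern difference (load_aos/load_soa are illustrative names; pts_x matches the naming used in the kernel further below):

    __global__ void load_aos(const DataPoint * __restrict__ points, float *out) {
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
      // consecutive threads read addresses 12 bytes apart: strided access
      if (tid < N) out[tid] = points[tid].pfDimens[0];
    }

    __global__ void load_soa(const float * __restrict__ pts_x, float *out) {
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
      // consecutive threads read consecutive 4-byte floats: fully coalesced
      if (tid < N) out[tid] = pts_x[tid];
    }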

  • Reduce the computation to (slightly less than) half:

    Since dis(i,j) == dis(j,i), each pair needs to be computed only once; the kernel below does this by looping for (int i = N-1; i > tid; i--). Then, either in the thread code itself or on the host, you can populate the other half of the output matrix by copying data, as sketched below.
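    A minimal host-side sketch of that copy step (assuming, as in calcNeighbors2 below, that the kernel filled only the entries with i > tid, stored at neighbors[i*N+tid]):

    // hn is the N*N bool matrix copied back from the device; the kernel
    // filled the lower triangle (row > col), so mirror it across the diagonal
    for (int i = 0; i < N; i++)
      for (int j = i + 1; j < N; j++)
        hn[i*N + j] = hn[j*N + i];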

  • Transpose the storage in your output matrix, so that you can write a storage operation like this:

    neighbors[i*N+tid] = true;
    
    which will coalesce nicely (adjacent threads, i.e. adjacent tid, write adjacent locations), as opposed to this:

    neighbors[tid*N+i] = true;
    
    which will not.

  • Since your input point data is read-only, mark the kernel parameters appropriately:

    const float * __restrict__ points_x, const float * __restrict__ points_y, const float * __restrict__ points_z
    
    In some cases, and on some GPUs, this will often lead to a speedup due to use of the read-only cache. If you really want to cache aggressively, and your data arrays are small enough (4K or fewer float points), you could put a copy of the point data in __constant__ memory in addition to the copy in global memory, and route the "uniform" load you are doing here through constant memory:

    DataPoint p2 = c_points[i];
    
    Thus you can perform the coalesced loads through the read-only cache, the uniform loads through the constant cache, and direct the coalesced stores to ordinary global memory.
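    In sketch form, the setup for that constant-memory copy looks like the USE_CONSTANT path in the full listing below:

    // device side: constant-memory copies (3*N floats must fit in the 64KB
    // __constant__ bank, hence the "16K/3 or less" limit noted in the code)
    __constant__ float cpx[N];
    __constant__ float cpy[N];
    __constant__ float cpz[N];

    // host side, after filling hx/hy/hz:
    cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
    cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
    cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));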

  • On a K40c, on linux/CUDA 7, with N = 4096, the net effect of these changes appears to be about a 3.5x speedup at the kernel level:

    $ cat t749.cu
    #include <stdio.h>
    
    #define N 4096
    // if N is 16K/3 or less, we can use constant
    #define USE_CONSTANT
    #define THRESH 0.2f
    #define nTPB 256
    #define nBLK (N/nTPB+1)
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    unsigned long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    struct DataPoint {
        float pfDimens[3];
    };
    
    
    __global__ void calcNeighbors(const DataPoint* points,
      const float doubleRadius, bool* neighbors) {
    
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
      float dis = 0.0f;
      while (tid < N) {
       DataPoint p1 = points[tid];
       for (int i=0; i<N; i++) {
           DataPoint p2 = points[i];
           dis = 0;
           dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
               (p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
               (p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
           if (dis <= doubleRadius) {
               neighbors[tid*N+i] = true;
           } else {
               neighbors[tid*N+i] = false;
           }
       }
    
       tid += blockDim.x * gridDim.x;
      }
    }
    
    #ifdef USE_CONSTANT
    __constant__ float cpx[N];
    __constant__ float cpy[N];
    __constant__ float cpz[N];
    #endif
    
    __global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
    
      int tid = threadIdx.x+blockDim.x*blockIdx.x;
    
      while (tid < N) {
        float p1x = pts_x[tid];
        float p1y = pts_y[tid];
        float p1z = pts_z[tid];
        for (int i = N-1; i > tid; i--){
          float p2x, p2y, p2z;
    #ifdef USE_CONSTANT
          p2x = cpx[i];
          p2y = cpy[i];
          p2z = cpz[i];
    #else
          p2x = pts_x[i];
          p2y = pts_y[i];
          p2z = pts_z[i];
    #endif
          float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
          neighbors[i*N+tid] = (dis <= doubleRadius);
          }
        tid += blockDim.x * gridDim.x;
      }
    }
    
    int main(){
    
      float *dx, *dy, *dz, *hx, *hy, *hz;
      DataPoint *dp, *hp;
      bool *dn, *hn1, *hn2;
      hx =(float *)malloc(N*sizeof(float));
      hy =(float *)malloc(N*sizeof(float));
      hz =(float *)malloc(N*sizeof(float));
      hp =(DataPoint *)malloc(N*sizeof(DataPoint));
      hn1=(bool *)malloc(N*N*sizeof(bool));
      hn2=(bool *)malloc(N*N*sizeof(bool));
      cudaMalloc(&dx, N*sizeof(float));
      cudaMalloc(&dy, N*sizeof(float));
      cudaMalloc(&dz, N*sizeof(float));
      cudaMalloc(&dp, N*sizeof(DataPoint));
      cudaMalloc(&dn, N*N*sizeof(bool));
    
      for (int i =0; i < N; i++){
        hx[i] = rand()/(float)RAND_MAX;
        hy[i] = rand()/(float)RAND_MAX;
        hz[i] = rand()/(float)RAND_MAX;
        hp[i].pfDimens[0] = hx[i];
        hp[i].pfDimens[1] = hy[i];
        hp[i].pfDimens[2] = hz[i];}
    
      cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
    
      // warm-up
      calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
      cudaDeviceSynchronize();
      cudaMemset(dn, 0, N*N*sizeof(bool));
      unsigned long long t1 = dtime_usec(0);
      calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel 1 error");
      t1 = dtime_usec(t1);
      cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
      // warm-up
      calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
      cudaDeviceSynchronize();
      cudaMemset(dn, 0, N*N*sizeof(bool));
      unsigned long long t2 = dtime_usec(0);
      calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel 2 error");
      t2 = dtime_usec(t2);
      cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
      cudaCheckErrors("some error");
      printf("t1: %fs, t2: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC);
      // results validation
      for (int i = 0; i < N; i++)
        for (int j = i+1; j < N; j++)
          if (hn1[i*N+j] != hn2[j*N+i]) {printf("mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
      return 0;
    }
    $ nvcc -arch=sm_35 -o t749 t749.cu
    $ ./t749
    t1: 0.004903s, t2: 0.001395s
    $
    
    The resulting ~48 blocks are enough to roughly "fill" a K40c, which has 15 SMs.

    EDIT: Since you have now posted a shared memory kernel, I added it to my test case as calcNeighbors3 and compared its timing performance (as t3). It is almost as fast as my kernel, and it appears to give correct results (it matches the original kernel), so I'm not sure what your concern is.

    Here is the updated code and test case:

    $ cat t749.cu
    #include <stdio.h>
    #include <math.h>
    
    #define imin(X,Y) (((X)<(Y))?(X):(Y))
    
    #define N 32768
    // if N is 16K/3 or less, we can use constant
    // #define USE_CONSTANT
    #define THRESH 0.2f
    #define nTPB 256
    #define nBLK (N/nTPB+1)
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    unsigned long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    struct DataPoint {
        float pfDimens[3];
    };
    
    
    __global__ void calcNeighbors(const DataPoint* points,
      const float doubleRadius, bool* neighbors) {
    
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
      float dis = 0.0f;
      while (tid < N) {
       DataPoint p1 = points[tid];
       for (int i=0; i<N; i++) {
           DataPoint p2 = points[i];
           dis = 0;
           dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
               (p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
               (p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
           if (dis <= doubleRadius) {
               neighbors[tid*N+i] = true;
           } else {
               neighbors[tid*N+i] = false;
           }
       }
    
       tid += blockDim.x * gridDim.x;
      }
    }
    
    #ifdef USE_CONSTANT
    __constant__ float cpx[N];
    __constant__ float cpy[N];
    __constant__ float cpz[N];
    #endif
    
    __global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
    
      int tid = threadIdx.x+blockDim.x*blockIdx.x;
    
      while (tid < N) {
        float p1x = pts_x[tid];
        float p1y = pts_y[tid];
        float p1z = pts_z[tid];
        for (int i = N-1; i > tid; i--){
          float p2x, p2y, p2z;
    #ifdef USE_CONSTANT
          p2x = cpx[i];
          p2y = cpy[i];
          p2z = cpz[i];
    #else
          p2x = pts_x[i];
          p2y = pts_y[i];
          p2z = pts_z[i];
    #endif
          float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
          neighbors[i*N+tid] = (dis <= doubleRadius);
          }
        tid += blockDim.x * gridDim.x;
      }
    }
    
    __global__ void calcNeighbors3(const DataPoint* points,
      const float doubleRadius, bool* neighbors) {
    __shared__ DataPoint sharedpoints[nTPB];
    
    int start = blockIdx.x * blockDim.x;
    int len = start+threadIdx.x;
    if (len < N) {
        sharedpoints[threadIdx.x] = points[len];
    }
    len = imin(N, blockDim.x + start);
    __syncthreads();
    
    int tid = threadIdx.x;
    float dis;
    while (tid < N) {
        DataPoint p1 = points[tid];
        for (int i=start; i<len; i++) {
           dis = 0;
           dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
               (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
               (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
           if (dis <= doubleRadius) {
               neighbors[i*N+tid] = true;
           } else {
               neighbors[i*N+tid] = false;
           }
    
        }
    
        tid += blockDim.x;
    }
    }
    
    
    int main(){
    
      float *dx, *dy, *dz, *hx, *hy, *hz;
      DataPoint *dp, *hp;
      bool *dn, *hn1, *hn2, *hn3;
      hx =(float *)malloc(N*sizeof(float));
      hy =(float *)malloc(N*sizeof(float));
      hz =(float *)malloc(N*sizeof(float));
      hp =(DataPoint *)malloc(N*sizeof(DataPoint));
      hn1=(bool *)malloc(N*N*sizeof(bool));
      hn2=(bool *)malloc(N*N*sizeof(bool));
      hn3=(bool *)malloc(N*N*sizeof(bool));
      cudaMalloc(&dx, N*sizeof(float));
      cudaMalloc(&dy, N*sizeof(float));
      cudaMalloc(&dz, N*sizeof(float));
      cudaMalloc(&dp, N*sizeof(DataPoint));
      cudaMalloc(&dn, N*N*sizeof(bool));
    
      for (int i =0; i < N; i++){
        hx[i] = rand()/(float)RAND_MAX;
        hy[i] = rand()/(float)RAND_MAX;
        hz[i] = rand()/(float)RAND_MAX;
        hp[i].pfDimens[0] = hx[i];
        hp[i].pfDimens[1] = hy[i];
        hp[i].pfDimens[2] = hz[i];}
    
      cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
    #ifdef USE_CONSTANT
      cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
      cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
      cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));
    #endif
      // warm-up
      calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
      cudaDeviceSynchronize();
      cudaMemset(dn, 0, N*N*sizeof(bool));
      unsigned long long t1 = dtime_usec(0);
      calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel 1 error");
      t1 = dtime_usec(t1);
      cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
      // warm-up
      calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
      cudaDeviceSynchronize();
      cudaMemset(dn, 0, N*N*sizeof(bool));
      unsigned long long t2 = dtime_usec(0);
      calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel 2 error");
      t2 = dtime_usec(t2);
      cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
      // warm-up
      calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
      cudaDeviceSynchronize();
      cudaMemset(dn, 0, N*N*sizeof(bool));
      unsigned long long t3 = dtime_usec(0);
      calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel 3 error");
      t3 = dtime_usec(t3);
      cudaMemcpy(hn3, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
      cudaCheckErrors("some error");
      printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC, t3/(float)USECPSEC);
      // results validation
      for (int i = 0; i < N; i++)
        for (int j = i+1; j < N; j++)
          if (hn1[i*N+j] != hn2[j*N+i]) {printf("1:2 mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
      for (int i = 0; i < N*N; i++)
        if (hn1[i] != hn3[i]) {printf("1:3 mismatch at %d, was: %d, should be: %d\n", i, hn1[i], hn3[i]); return 1;}
      return 0;
    }
    $ nvcc -arch=sm_35 -o t749 t749.cu
    $ ./t749
    t1: 1.260010s, t2: 0.022661s, t3: 0.029632s
    $
    
Comments:

  • Also, as noted, this algorithm will incorrectly identify every point as its own neighbor.

  • The algorithm has been tested. Eliminating half of the computation may add more instructions, and assigning values to neighbors[tid*N+i] (now changed to neighbors[i*N+tid] for memory coalescing) and neighbors[i*N+tid] may take a lot of time, I think.

  • I have tried using __align__(16) on the struct, but it didn't help. Thanks.

  • But what if I want to use shared memory? I have tried shared memory and it didn't help, and I don't know why that happens. I launch only (N-threadsPerBlock+1)/threadsPerBlock blocks when launching the kernel, and I use while(tid…
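One of the comments notes that the original kernel marks every point as its own neighbor, since the i == tid pair gives dis == 0 <= doubleRadius. A minimal fix (a sketch, assuming self-pairs should be excluded) is to special-case the diagonal inside the i-loop:

    if (i == tid) {
        neighbors[tid*N+i] = false;   // a point is not its own neighbor
    } else {
        neighbors[tid*N+i] = (dis <= doubleRadius);
    }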
    