Parallel processing CUDA中的二元矩阵约化_Parallel Processing_Cuda_Nvidia_Pi

Parallel processing CUDA中的二元矩阵约化

parallel-processing cuda

Parallel processing CUDA中的二元矩阵约化,parallel-processing,cuda,nvidia,pi,Parallel Processing,Cuda,Nvidia,Pi,对于满足特定条件的所有单元格，我必须遍历虚矩阵的所有单元格m*n和add+1 我天真的解决方案如下： #include <stdio.h> __global__ void calculate_pi(int center, int *count) { int x = threadIdx.x; int y = blockIdx.x; if (x*x + y*y <= center*center) { *count++; } }

我必须遍历一个虚拟矩阵（m*n）的所有单元格，并对每个满足特定条件的单元格加 1。

我天真的解决方案如下：

#include <stdio.h>

/*
 * Counts the lattice points (x, y) that satisfy x*x + y*y <= center*center.
 *
 * Each thread tests one point: x comes from threadIdx.x and y from
 * blockIdx.x, so the kernel expects a 1D grid of 1D blocks.
 *
 * center: radius of the test, in lattice units.
 * count:  device pointer to the tally shared by all threads. The kernel only
 *         adds to it, so the caller is responsible for its initial value.
 */
__global__ void calculate_pi(int center, int *count) {
    int x = threadIdx.x;
    int y = blockIdx.x;

    if (x*x + y*y <= center*center) {
        // `*count++` parses as `*(count++)`: it only advances the thread's
        // local copy of the pointer and never touches the tally. A plain
        // `(*count)++` would still lose updates when threads collide, so the
        // increment has to be atomic.
        atomicAdd(count, 1);
    }
}

/*
 * Reads the number of interactions from stdin, launches calculate_pi with
 * l blocks of l threads (l = integer part of sqrt(interactions)) and prints
 * the resulting count.
 *
 * Returns 0 on success and 1 on invalid input or on any CUDA error.
 */
int main() {
    int interactions;
    printf("Enter the number of interactions: ");
    // Without this check a non-numeric input would leave `interactions`
    // uninitialised, and a negative one would reach sqrt().
    if (scanf("%d", &interactions) != 1 || interactions < 1) {
        printf("Error: invalid number of interactions\n");
        return 1;
    }

    int l = (int)sqrt((double)interactions);
    // l is used as the thread-block size, which CUDA caps at 1024 threads.
    if (l > 1024) {
        printf("Error: interactions out of range\n");
        return 1;
    }

    int h_count = 0;
    int *d_count = NULL;

    cudaError_t err = cudaMalloc(&d_count, sizeof(int));
    if (err == cudaSuccess) {
        // The destination is the device buffer itself (d_count), not the
        // address of the host variable that stores the pointer.
        err = cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        calculate_pi<<<l,l>>>(l/2, d_count);
        // A launch does not return a status; configuration errors are
        // picked up here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Blocking copy: it also waits for the kernel to finish.
        err = cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    }
    // cudaFree(NULL) is a no-op, so this is safe even if cudaMalloc failed.
    cudaFree(d_count);

    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Sum: %d\n", h_count);

    return 0;
}

#include <stdio.h>
__global__ void calculate_pi(int center, int *count) {
int x = threadIdx.x;
int y = blockIdx.x;
if (x*x + y*y <= center*center) …

您的代码至少有两个问题：
您的内核代码在这里使用普通的加法无法正常工作：
*count++;

这是因为多个线程正在尝试同时执行此操作，而CUDA不会自动为您排序。出于此解释的目的，我们将使用atomicAdd（）
解决此问题，尽管也可以使用其他方法
这里不应该有 & 符号：
cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
           ^

我认为这只是一个输入错误，因为您在随后的cudaMemcpy
操作中正确地执行了该操作：
cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);


此方法（一个维度使用threadIdx.x
，另一个维度使用blockIdx.x
有效地创建一个正方形线程数组）将仅适用于交互
值，该值将导致l
值为1024或更少，因为CUDA线程块限制为1024个线程，并且您在内核启动中使用l
作为线程块的大小。要解决此问题，您需要学习如何创建任意尺寸的CUDA 2D网格，并相应地调整内核启动和内核索引计算。目前，我们只需确保计算出的l
值在代码设计的范围内
以下是解决上述问题的示例：
$ cat t1590.cu
#include <stdio.h>

/*
 * Adds 1 to *count for every thread whose point (threadIdx.x, blockIdx.x)
 * satisfies x*x + y*y <= center*center.
 *
 * The thread index supplies one coordinate and the block index the other,
 * so the kernel is written for a 1D grid of 1D blocks.
 */
__global__ void calculate_pi(int center, int *count) {
    const int col = threadIdx.x;
    const int row = blockIdx.x;

    const int dist_sq   = col * col + row * row;
    const int radius_sq = center * center;

    // Points outside the radius contribute nothing.
    if (dist_sq > radius_sq) {
        return;
    }

    // All threads share one counter, so the increment is atomic.
    atomicAdd(count, 1);
}

/*
 * Reads the number of interactions from stdin, launches calculate_pi with
 * l blocks of l threads (l = integer part of sqrt(interactions)) and prints
 * the count plus the fraction of interactions that satisfied the test.
 *
 * Returns 0 on success and 1 on invalid input or on any CUDA error.
 */
int main() {
    int interactions;
    printf("Enter the number of interactions: ");
    // Without this check a non-numeric input would leave `interactions`
    // uninitialised, and a negative one would reach sqrt().
    if (scanf("%d", &interactions) != 1 || interactions < 1) {
        printf("Error: interactions out of range\n");
        return 1;
    }

    int l = (int)sqrt((double)interactions);
    // l doubles as the thread-block size, which CUDA caps at 1024 threads.
    if ((l > 1024) || (l < 1)) {printf("Error: interactions out of range\n"); return 1;}
    int h_count = 0;
    int *d_count = NULL;

    cudaError_t err = cudaMalloc(&d_count, sizeof(int));
    if (err == cudaSuccess)
      err = cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
      calculate_pi<<<l,l>>>(l/2, d_count);
      // A launch does not return a status; configuration errors are
      // picked up here.
      err = cudaGetLastError();
      }
    if (err == cudaSuccess)
      // Blocking copy: it also waits for the kernel to finish.
      err = cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    // cudaFree(NULL) is a no-op, so this is safe even if cudaMalloc failed.
    cudaFree(d_count);

    if (err != cudaSuccess) {
      printf("CUDA error: %s\n", cudaGetErrorString(err));
      return 1;
      }
    printf("Sum: %d\n", h_count);
    printf("fraction satisfying test:  %f\n", h_count/(float)interactions);
    return 0;
}
$ nvcc -o t1590 t1590.cu
$ ./t1590
Enter the number of interactions: 1048576
Sum: 206381
fraction satisfying test:  0.196820
$

这是在l
的基础上启动一个2D网格，应该可以进行至少10亿次交互
我不理解你的问题——你有一个“虚拟”矩阵（假设这意味着它实际上不存在，而不是复杂），然后你想有条件地增加该矩阵中的条目，也就是说，你想更改一个不存在的矩阵中的条目。这怎么可能呢？恕我直言，@talonmies，你完全错过了O/P使用{threadIdx，blockIdx}.x
进行间接映射的巧妙想法（暂时不提及
-Shevron运算符实际使用参数的状态）a（通常非常大的试错）pi生成算法的[x，y]
-值（早期C/S课程中的经典教科书示例）是的，想象是一种情况下的法律表达，在这种情况下，主体不需要被实例化为一个真实的存在，而是一个合法的利益对象，并进一步与之合作：o）我使用虚拟数组这个术语是因为我不想实例化一个矩阵，例如50000 x 50000。我读过关于Reduce技术的文章，它可以解决我的问题，但是如果我做对了，我总是需要一个巨大的数组。谢谢！我不知道这些原子函数。关于内核，我犯了另一个小错误：…int xc=x-center；
int yc = y - center;
if (xc*xc + yc*yc <= center*center)
…关于近似的Pi值，它应该是这样的：printf("Pi: %lf\n", 4.0*h_count/(l*l));
#include <stdio.h>

/*
 * Counts the lattice points (x, y) that satisfy x*x + y*y <= center*center.
 *
 * Each thread tests one point; x and y are the thread's global indices in a
 * 2D grid of 2D blocks. Threads beyond the intended square are harmless as
 * long as the caller passes a center no larger than the square's side,
 * because their distance test then fails.
 *
 * center: radius of the test, in lattice units.
 * count:  device pointer to the tally shared by all threads. The kernel only
 *         adds to it, so the caller is responsible for its initial value.
 */
__global__ void calculate_pi(int center, int *count) {
    // 64-bit arithmetic: the grid is rounded up to whole blocks, and with
    // 32-bit ints x*x + y*y overflows once the side length passes 32767,
    // which would turn far-away points into spurious hits.
    const long long x = threadIdx.x + (long long)blockDim.x * blockIdx.x;
    const long long y = threadIdx.y + (long long)blockDim.y * blockIdx.y;
    const long long radius = center;

    if (x*x + y*y <= radius*radius) {
        // All threads share one counter, so the increment is atomic.
        atomicAdd(count, 1);
    }
}

/*
 * Reads the number of interactions from stdin, covers an l x l square
 * (l = integer part of sqrt(interactions)) with a 2D grid of 32x32 blocks,
 * runs calculate_pi over it and prints the count plus the fraction of
 * interactions that satisfied the test.
 *
 * Returns 0 on success and 1 on invalid input or on any CUDA error.
 */
int main() {
    int interactions;
    printf("Enter the number of interactions: ");
    // Without this check a non-numeric input would leave `interactions`
    // uninitialised, and a negative one would reach sqrt().
    if (scanf("%d", &interactions) != 1 || interactions < 1) {
        printf("Error: interactions out of range\n");
        return 1;
    }

    int l = (int)sqrt((double)interactions);
    int h_count = 0;
    int *d_count = NULL;
    const int bs = 32;
    dim3 threads(bs, bs);
    // Ceil-division so the grid covers the whole l x l square.
    dim3 blocks((l+threads.x-1)/threads.x, (l+threads.y-1)/threads.y);

    cudaError_t err = cudaMalloc(&d_count, sizeof(int));
    if (err == cudaSuccess)
      err = cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
      calculate_pi<<<blocks,threads>>>(l/2, d_count);
      // A launch does not return a status; configuration errors are
      // picked up here.
      err = cudaGetLastError();
      }
    if (err == cudaSuccess)
      // Blocking copy: it also waits for the kernel to finish.
      err = cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    // cudaFree(NULL) is a no-op, so this is safe even if cudaMalloc failed.
    cudaFree(d_count);

    if (err != cudaSuccess) {
      printf("CUDA error: %s\n", cudaGetErrorString(err));
      return 1;
      }
    printf("Sum: %d\n", h_count);
    printf("fraction satisfying test:  %f\n", h_count/(float)interactions);
    return 0;
}