CUDA call won't allocate more than 8 threads per block regardless of the specification

I have created a parallel version of the Sieve of Eratosthenes in C++. The problem is that my kernel call (reduce0) seems to allocate only 8 threads per block instead of the 256 I specified. Since even the very first CUDA-capable devices allowed 512 threads per block, there must be some error in my code. Any help would be appreciated.

#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;

////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);

////////////////////////////////////////////////////
int main(){
    int n = pow((double) 2, 8);
    int total = sieve(n);
    cout << "# primes" << endl << total << endl;
    return 0;
}

///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];

    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // do reduction in shared mem
    for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
        if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
    // Allocate and copy device arrays
    int *g_idevice;
    int *g_odevice;
    int size = n * sizeof(int);
    cudaMalloc(&g_idevice, size);
    cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
    cudaMalloc(&g_odevice, size);

    // Specify grid/block dimensions and invoke the kernel
    dim3 dimGrid(1,1);
    dim3 dimBlock(256,1);
    reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);

    // Copy device data back to primes
    cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
    //for (int i = 0; i < n; i++) {
    //    cout << i << " " << primes[i] << endl;
    //}
    int total = primes[0];
    cudaFree(g_idevice);
    cudaFree(g_odevice);
    return total;
}

/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
    int i = f;
    while(arg[i] != 1 && i < n) {
        i++;
    }
    return i;
}

//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
    int total = 0;
    int i = 2;
    while(i < n){
        if(arg[i] == 1){
            total = total + 1;
        }
        i++;
    }
    return total;
}

/////////////////////////////////////////////////////////////////////////
int sieve(int n){
    int* primes = NULL;
    int mult = 0;
    int k = 2;
    int i;
    int total;
    //primes = new int[n];
    primes = new int[256];

    for(i = 0; i < n; i++){
        primes[i] = 1;
    }
    primes[0] = primes[1] = 0;

    while (k * k < n){
        mult = k * k;
        while (mult < n) {
            primes[mult] = 0;
            mult = mult + k;
        }
        k = findsmallest(primes, k+1, n);
    }

    total = call_kernel(primes, n);
    //delete [] primes;
    //primes = NULL;
    return total;
}


Your kernel uses dynamically allocated shared memory, but the kernel launch does not reserve any, so the kernel aborts because of illegal memory operations on that unallocated shared memory buffer. You will find it works correctly if you modify this part of call_kernel as follows:

// Specify grid/block dimensions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
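
As an aside, not part of the fix above: since dimBlock is hard-coded to 256 in call_kernel, a statically sized shared array would also work and removes the need for the third launch parameter entirely. A minimal sketch, assuming the block size is always 256 (reduce0_static is just an illustrative name):

// Alternative sketch, assuming blockDim.x is always 256: a statically sized
// shared buffer needs no extern declaration and no launch-time allocation.
__global__ void reduce0_static(int *g_idata, int *g_odata)
{
    __shared__ int sdata[256];   // one int per thread, fixed at compile time

    unsigned int tid = threadIdx.x;
    unsigned int i   = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // reduction body is identical to reduce0 above
    for (int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (s * 2) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

It would then be launched as reduce0_static<<<dimGrid, dimBlock>>>(g_idevice, g_odevice); with no shared memory argument.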
If you had included some basic error checking around your function calls, perhaps like this, the launch failure would have been reported immediately:

reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
    cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}

// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
    cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
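
In a longer program it is often cleaner to wrap that check in a small helper so every CUDA API call is verified. A minimal sketch (checkCuda is a hypothetical helper, not part of the CUDA toolkit), assuming it lives in the same file as call_kernel:

// Hypothetical helper: report and abort on any failing CUDA API call.
inline void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        cout << what << " failed: " << cudaGetErrorString(err) << endl;
        exit(EXIT_FAILURE);
    }
}

// Example usage inside call_kernel:
checkCuda(cudaMalloc(&g_idevice, size), "cudaMalloc g_idevice");
checkCuda(cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice), "copy to device");
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
checkCuda(cudaPeekAtLastError(), "reduce0 launch");
checkCuda(cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost), "copy from device");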