Memory management 如何提高埃拉托斯特尼筛法生成素数的上限?
为了提高可计算素数的上限,我需要在程序中修改什么?Memory management 如何提高埃拉托斯特尼筛法生成素数的上限?(标签:memory-management, cuda, limit, primes)为了提高可计算素数的上限,我需要在程序中修改什么? 目前我的算法只能处理到8500万。在我看来,它应该可以处理高达30亿的数字。 我正在用CUDA编写自己的埃拉托斯特尼筛法实现,但遇到了瓶颈。 到目前为止,该算法对较小的上限(低于8500万)似乎工作正常。 但是,当我尝试计算上限为1亿、20亿、30亿的素数时,系统会卡住(此时CUDA设备正在进行计算),几秒钟后,我的Linux机器恢复正常,但CUDA程序崩溃并显示以下错误消息: CUDA error at prime.cu:129 code=6(cudaErrorLaunchTimeout) "cudaDeviceSynchronize()"
- 目前我的算法只能处理8500万个数字。在我看来,应该可以处理高达30亿的数字。
CUDA error at prime.cu:129 code=6(cudaErrorLaunchTimeout) "cudaDeviceSynchronize()"
我有一块GTX780(3GB显存),筛子存放在一个char数组中,所以如果我要计算100000以内的素数,它会在设备上分配100000字节。
我原以为GPU最多可以处理30亿个数字,因为它有3 GB的显存;但实际上它最多只能处理到8500万(8500万字节≈0.08 GB)。
这是我的 prime.cu 代码:
#include <stdio.h>
#include <stdlib.h>
#include <helper_cuda.h> // checkCudaErrors() - NVIDIA_CUDA-6.0_Samples/common/inc
// #include <cuda.h>
// #include <cuda_runtime_api.h>
// #include <cuda_runtime.h>
// Unsigned integer type used in this file for the sieve size and sieve indices.
// NOTE(review): <stdint.h> declares the same name; if any included header pulls
// it in with a different underlying type this typedef will not compile - confirm.
typedef unsigned long long int uint64_t;
/******************************************************************************
* sieveInitCUDA
* Flags the two smallest indices of the sieve, 0 and 1, as "not prime".
* - primes: sieve array; this kernel writes elements 0 and 1 only
* Every launched thread performs the same two stores.
******************************************************************************/
__global__ static void sieveInitCUDA(char* primes)
{
    // a stored 1 marks the index as NOT prime
    const char notPrime = 1;

    primes[1] = notPrime; // "1" is not a prime number
    primes[0] = notPrime; // neither is "0"
}
/******************************************************************************
* kernel for sieving the even numbers starting at 4.
* - primes: sieve array; every even index in [4, max) is set to 1 (NOT prime)
* - max: number of elements in primes
*
* Launch layout: 1D grid of 1D blocks. Each thread starts at the even number
* 2 * globalThreadId + 4 and then advances by twice the total thread count,
* so the whole range is covered for any grid size (a larger grid only means
* fewer loop iterations per thread).
******************************************************************************/
__global__ static void sieveEvenNumbersCUDA(char* primes, uint64_t max)
{
    // Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit product
    // otherwise and wraps for very large grids.
    const uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;

    // Distance between two consecutive even numbers handled by one thread.
    const uint64_t stride = 2 * (uint64_t)gridDim.x * blockDim.x;

    // The even number must be derived from the GLOBAL thread id; doubling only
    // threadIdx.x makes neighbouring blocks overlap and leaves the upper part
    // of the array unmarked.
    for (uint64_t index = 2 * tid + 4; index < max; index += stride)
        primes[index] = 1;
}
/******************************************************************************
* kernel for finding prime numbers using the sieve of eratosthenes
* - primes: sieve array. A "0" at an index means the number at that index is
*   (still) considered prime, a "1" means it is composite.
* - max: the number of elements in the primes array
* - maxRoot: floor(sqrt(max)), computed once by the caller so that the threads
*   do not all recompute it
*
* Launch layout: 1D grid of 1D blocks with at least maxRoot - 2 threads in
* total; thread t handles the candidate t + 3 and marks its multiples starting
* at the candidate's square.
******************************************************************************/
__global__ static void sieveOfEratosthenesCUDA(char *primes, uint64_t max,
                                               const uint64_t maxRoot)
{
    // Candidate handled by this thread: 3, 4, 5, ... Even candidates cost
    // nothing when the even-number kernel already marked them as composite.
    // All index math is 64-bit: with 32-bit ints, index * index and the running
    // multiple overflow once max goes beyond 2^31.
    const uint64_t index = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x + 3;

    // Candidates up to AND INCLUDING floor(sqrt(max)) must be sieved, otherwise
    // the square of the largest one (e.g. 25 for max = 26) is never marked.
    // The second test keeps the read below in bounds for tiny arrays.
    if (index > maxRoot || index >= max)
        return;

    // Skip candidates already known to be composite. Another thread may mark
    // this entry concurrently; reading a stale 0 only causes redundant marking
    // of numbers that are composite anyway, never a wrong result.
    if (primes[index] != 0)
        return;

    // mark off the composite numbers
    for (uint64_t j = index * index; j < max; j += index)
    {
        primes[j] = 1;
    }
}
/******************************************************************************
* checkDevice()
* Queries the current CUDA device and picks a thread-block size for it.
* - returns: 16 for devices with compute capability major version below 2,
*   32 otherwise; EXIT_FAILURE when the device properties cannot be read or
*   the device is in prohibited compute mode.
* - terminates the process via exit(EXIT_FAILURE) when cudaGetDevice fails.
* NOTE(review): the failure value shares the return channel with the block
* size, so callers must compare against EXIT_FAILURE before using the result.
******************************************************************************/
__host__ int checkDevice()
{
    // query the Device and decide on the block size
    int devID = 0; // the default device ID
    cudaError_t error;
    cudaDeviceProp deviceProp;

    error = cudaGetDevice(&devID);
    if (error != cudaSuccess)
    {
        printf("CUDA Device not ready or not supported\n");
        printf("%s: cudaGetDevice returned error code %d, line(%d)\n", __FILE__, error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaGetDeviceProperties(&deviceProp, devID);
    // Test the error code FIRST: deviceProp holds no meaningful data when the
    // query failed, so computeMode may only be inspected after success.
    if (error != cudaSuccess || deviceProp.computeMode == cudaComputeModeProhibited)
    {
        printf("CUDA device ComputeMode is prohibited or failed to getDeviceProperties\n");
        return EXIT_FAILURE;
    }

    // Use a larger block size for Fermi and above (see compute capability)
    return (deviceProp.major < 2) ? 16 : 32;
}
/******************************************************************************
* genPrimesOnDevice
* Runs the sieve on the GPU and copies the resulting flags back to the host.
* - inputs: primes - host array of at least [max] bytes; on return element i
*                    is 0 when i is prime and 1 otherwise
*           max    - number of elements in primes (numbers 0 .. max-1 are
*                    classified)
* Returns without touching primes when max is 0, primes is NULL, or
* checkDevice() reports a failure.
*
* NOTE(review): the sieve kernel runs as one long launch. On a GPU that also
* drives a display, a launch exceeding the driver's watchdog limit is probably
* what gets reported as cudaErrorLaunchTimeout at the synchronize below -
* confirm via cudaDeviceProp::kernelExecTimeoutEnabled.
******************************************************************************/
__host__ void genPrimesOnDevice(char* primes, uint64_t max)
{
    if (primes == NULL || max == 0)
        return; // nothing to sieve

    int blockSize = checkDevice();
    if (blockSize == EXIT_FAILURE)
        return;

    char* d_Primes = NULL;

    // size_t, not int: a byte count above 2^31 - 1 (max >= ~2.1 billion) would
    // overflow a 32-bit int and request a bogus allocation size.
    const size_t sizePrimes = sizeof(char) * (size_t)max;

    // sieveInitCUDA always writes elements 0 and 1, so the device buffer needs
    // at least two bytes even when the caller only asked for one.
    const size_t sizeDevice = (sizePrimes < 2) ? 2 : sizePrimes;

    // floor(sqrt(max)): start from the floating-point estimate, then correct
    // it with exact integer arithmetic in case the estimate is off by one.
    uint64_t maxRoot = (uint64_t)sqrt((double)max);
    while (maxRoot * maxRoot > max)
        --maxRoot;
    while ((maxRoot + 1) * (maxRoot + 1) <= max)
        ++maxRoot;

    // allocate the primes on the device and set them to 0
    checkCudaErrors(cudaMalloc(&d_Primes, sizeDevice));
    checkCudaErrors(cudaMemset(d_Primes, 0, sizeDevice));

    // make sure that there are no errors...
    checkCudaErrors(cudaPeekAtLastError());

    // setup the execution configuration
    dim3 dimBlock(blockSize);
    // one thread per sieve candidate 3 .. maxRoot (rounded up, never 0 blocks)
    dim3 dimGrid((unsigned int)((maxRoot + dimBlock.x) / dimBlock.x));
    // one thread per even number: ceil-divide half the range so the grid is
    // never empty (a 0-block grid is an invalid launch configuration)
    dim3 dimGridEvens((unsigned int)((max / 2 + dimBlock.x) / dimBlock.x));

    //////// debug
#ifdef DEBUG
    printf("dimBlock(%d, %d, %d)\n", dimBlock.x, dimBlock.y, dimBlock.z);
    printf("dimGrid(%d, %d, %d)\n", dimGrid.x, dimGrid.y, dimGrid.z);
    printf("dimGridEvens(%d, %d, %d)\n", dimGridEvens.x, dimGridEvens.y, dimGridEvens.z);
#endif

    // call the kernel
    // NOTE: no need to synchronize after each kernel
    // http://stackoverflow.com/a/11889641/2261947
    sieveInitCUDA<<<1, 1>>>(d_Primes); // launch a single thread to initialize
    sieveEvenNumbersCUDA<<<dimGridEvens, dimBlock>>>(d_Primes, max);
    sieveOfEratosthenesCUDA<<<dimGrid, dimBlock>>>(d_Primes, max, maxRoot);

    // check for kernel errors
    checkCudaErrors(cudaPeekAtLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // copy the results back (only the bytes the caller's buffer can hold)
    checkCudaErrors(cudaMemcpy(primes, d_Primes, sizePrimes, cudaMemcpyDeviceToHost));

    // no memory leaks
    checkCudaErrors(cudaFree(d_Primes));
}
#包括
#包括//checkCudaErrors()-NVIDIA_CUDA-6.0_Samples/common/inc
//#包括
//#包括
//#包括
typedef无符号长整数uint64\u t;
/******************************************************************************
*初始化primes数组中第一对值的内核。
******************************************************************************/
__全局u uuu静态无效sieveInitCUDA(字符*素数)
{
素数[0]=1;//值1表示该数字不是素数
素数[1]=1;//数字“0”和“1”不是素数
}
/******************************************************************************
*用于筛选从4开始的偶数的内核。
******************************************************************************/
__全局\uuuuuuuuuuuuuuuuuuuuuuuu静态无效筛网数CUDA(字符*素数,uint64最大值)
{
uint64_t index=blockIdx.x*blockDim.x+threadIdx.x+threadIdx.x+4;
如果(指数<最大值)
素数[指数]=1;
}
/******************************************************************************
*用eratosthenes筛求素数的核
*-素数:一组布尔。最初,所有数字都设置为“0”。
*“0”值表示该索引处的数字为素数。
*-max:素数数组的最大大小
*-maxRoot:max的sqrt(另一个输入)。我们不想做所有的线
*一遍又一遍地计算它,所以它被传入
******************************************************************************/
__全局静态空筛费拉托斯涅斯库达(字符*素数,uint64最大值,
常数uint64(最大根)
{
//获取起始指数,仅筛选从3开始的赔率
// 3,5,7,9,11,13...
/*int index=blockIdx.x*blockDim.x+threadIdx.x+threadIdx.x+3*/
//显然,下面的索引使用比上面的更快
int index=blockIdx.x*blockDim.x+threadIdx.x+3;
//确保索引不会超出范围,也不要开始执行
//关于已经合成的数字
/******************************************************************************
* main
* Sieves the numbers 0 .. max-1 on the GPU and prints every prime found,
* separated by ", ".
* - returns: 0 on success, EXIT_FAILURE when the host buffer cannot be
*   allocated
******************************************************************************/
int main()
{
    const uint64_t max = 85000000; // 85 million

    // calloc hands back zero-filled memory, i.e. every number starts out
    // flagged as "prime" (0). The cast is required because this is C++.
    char* primes = (char*)calloc(max, sizeof(char));

    // check that it allocated correctly...
    if (primes == NULL)
    {
        printf("failed to allocate %llu bytes for the sieve\n", max);
        return EXIT_FAILURE;
    }

    genPrimesOnDevice(primes, max);

    // if you wish to display results:
    bool first = true;
    for (uint64_t i = 0; i < max; i++)
    {
        if (primes[i] == 0) // if the value is '0', then the number is prime
        {
            // separator goes BEFORE every prime but the first, so the list
            // never ends with a dangling ", "
            if (!first)
                printf(", ");
            printf("%llu", i);
            first = false;
        }
    }

    free(primes);
    return 0;
}
CUDA error at prime.cu:129 code=6(cudaErrorLaunchTimeout) "cudaDeviceSynchronize()"