Random CUDA's Mersenne Twister,用于任意数量的线程

Random CUDA's Mersenne Twister,用于任意数量的线程,random,cuda,mersenne-twister,curand,Random,Cuda,Mersenne Twister,Curand,CUDA 实现的 Mersenne Twister(MT)随机数生成器限制为每个块最多 256 个线程、每个网格最多 200 个块,即最多 51200 个线程。因此,无法通过 kernel<<<blocksPerGrid, threadsPerBlock>>>(devMTGPStates, ...) 启动使用 MT 的内核,其中 n 是线程总数。当线程数 > 51200 时,使用 MT 的最佳方法是什么?我的方法是为 blocksPerGrid 和 threadsPerBlock 使用常量值,并在内核代码中使用以下

CUDA 实现的 Mersenne Twister(MT)随机数生成器限制为每个块最多 256 个线程、每个网格最多 200 个块,即最多 51200 个线程。

因此,不可能启动使用MT的内核

kernel<<<blocksPerGrid, threadsPerBlock>>>(devMTGPStates, ...)
其中 n 是线程总数。

当线程数 > 51200 时,使用 MT 的最佳方法是什么?

我的方法是为 blocksPerGrid 和 threadsPerBlock 使用常量值,并在内核代码中使用以下内容:

/* Sketch from the question: every thread starts at its global index and keeps
   advancing by the total number of launched threads until it reaches n, drawing
   normally distributed samples from the MTGP32 state of its own block.
   The "..." in the parameter list is a placeholder for further arguments. */
__global__ void kernel(curandStateMtgp32 *state, int n, ...) { 

    /* Global 1D index of this thread. */
    int id = threadIdx.x+blockIdx.x*blockDim.x;

    while (id < n) {

        /* All threads of a block draw from the same state, selected by blockIdx.x.
           NOTE(review): the loop bound is per-thread, so in the last pass only part
           of a block may reach this call when n is not a multiple of
           blockDim.x*gridDim.x. The MTGP32 device functions appear to be
           block-cooperative; confirm that a partially active block is allowed. */
        float x = curand_normal(&state[blockIdx.x]);
        /* some more calls to curand_normal() followed
           by the algorithm that works with the data */

        /* Advance by the total number of threads in the grid. */
        id += blockDim.x*gridDim.x; 
    }
}
\uuuu全局\uuuu无效内核(curandStateMtgp32*状态,int n,…{
int id=threadIdx.x+blockIdx.x*blockDim.x;
while(id
我不确定这是否是正确的方法,或者它是否会以不希望的方式影响 MT 状态。


谢谢。

我建议你仔细、彻底地阅读相关文档。

当每个块使用256个线程(最多64个块)来生成数字时,MT API将最有效

如果您需要更多,您有多种选择:

  • 只需从现有状态集(即 64 个块、每块 256 个线程)生成更多的数字,并将它们分配给需要它们的线程
  • 每个块使用多个状态(但这不允许您超过状态集中的总体限制,它只解决了单个块的需要。)
  • 创建多个具有独立种子(因此是独立状态集)的MT生成器

  • 一般来说,我认为您概述的内核没有问题,它与上面的选项1大致一致。但是,它不允许您超过51200个线程。(你的例子有16384个线程)

    我建议你仔细、彻底地阅读相关文档。

    当每个块使用256个线程(最多64个块)来生成数字时,MT API将最有效

    如果您需要更多,您有多种选择:

  • 只需从现有状态集(即 64 个块、每块 256 个线程)生成更多的数字,并将它们分配给需要它们的线程
  • 每个块使用多个状态(但这不允许您超过状态集中的总体限制,它只解决了单个块的需要。)
  • 创建多个具有独立种子(因此是独立状态集)的MT生成器

  • 一般来说,我认为您概述的内核没有问题,它与上面的选项1大致一致。但是,它不允许您超过51200个线程。(您的示例有 16384 个线程)

    根据 Robert 的回答,下面我将提供一个使用 cuRAND 的 Mersenne Twister 处理任意数量线程的完整示例。我采用 Robert 的第一个选项:从现有状态集中生成更多的数字,并将这些数字分配给需要它们的线程。

    // --- Generate random numbers with cuRAND's Mersenne Twister
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    
    #include <cuda.h>
    #include <curand_kernel.h>
    /* include MTGP host helper functions */
    #include <curand_mtgp32_host.h>
    
    #define BLOCKSIZE   256
    #define GRIDSIZE    64
    
    /*******************/
    /* GPU ERROR CHECK */
    /*******************/
    /* Evaluates the CUDA runtime call x exactly once. On failure it reports the
       source location together with the runtime's description of the error on
       stderr and makes the ENCLOSING function return EXIT_FAILURE, so the macro
       is only usable inside functions returning int. */
    #define gpuErrchk(x) do { \
        const cudaError_t gpuErrchkStatus_ = (x); \
        if (gpuErrchkStatus_ != cudaSuccess) { \
            fprintf(stderr, "Error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(gpuErrchkStatus_)); \
            return EXIT_FAILURE;}} while(0)
    
    /* Evaluates the cuRAND call x exactly once. On failure it reports the source
       location and the numeric curandStatus_t value on stderr and makes the
       ENCLOSING function return EXIT_FAILURE, so the macro is only usable inside
       functions returning int. */
    #define CURAND_CALL(x) do { \
        const curandStatus_t curandCallStatus_ = (x); \
        if (curandCallStatus_ != CURAND_STATUS_SUCCESS) { \
            fprintf(stderr, "Error at %s:%d: cuRAND status %d\n", __FILE__, __LINE__, \
                    (int)curandCallStatus_); \
            return EXIT_FAILURE;}} while(0)
    
    /*******************/
    /* iDivUp FUNCTION */
    /*******************/
    /* Divides a by b and adds one to the quotient whenever the division leaves a
       non-zero remainder. b must be non-zero. */
    __host__ __device__ int iDivUp(int a, int b)
    {
        const int quotient = a / b;
        if (a % b == 0)
            return quotient;
        return quotient + 1;
    }
    
    /*********************/
    /* GENERATION KERNEL */
    /*********************/
    /*
     * Fills result[0 .. N) with uniform floats drawn from per-block MTGP32 states.
     *
     * Launch layout: 1D grid of 1D blocks. Block b draws from state[b], so `state`
     * must hold at least gridDim.x initialized generators. Per the cuRAND MTGP32
     * rules a state may be used by one block only and by at most 256 threads.
     *
     * The MTGP32 generation functions are block-cooperative: the state is shared
     * by the whole block and every call is expected to be made by all threads of
     * the block together. The loop therefore runs on a block-uniform bound, so
     * every thread of a block calls curand_uniform() the same number of times;
     * only the store is guarded per thread. A per-thread loop bound would leave
     * part of a block out of the call in the last pass whenever N is not a
     * multiple of gridDim.x * blockDim.x.
     */
    __global__ void generate_kernel(curandStateMtgp32 * __restrict__ state, float * __restrict__ result, const int N)
    {
        // Total number of threads in the grid: distance between two passes.
        const int stride = blockDim.x * gridDim.x;

        // `base` is the index handled by thread 0 of this block; it is identical
        // for all threads of the block, hence so is the trip count.
        for (int base = blockIdx.x * blockDim.x; base < N; base += stride) {
            const float sample = curand_uniform(&state[blockIdx.x]);

            const int k = base + threadIdx.x;
            if (k < N)          // threads past the tail draw a sample but discard it
                result[k] = sample;
        }
    }
    
    /********/
    /* MAIN */
    /********/
    /*
     * Demo driver: fills a device buffer of N floats with uniform random numbers
     * produced by GRIDSIZE MTGP32 generators, copies the buffer back to the host
     * and prints the first few values.
     *
     * Returns 0 on success; the checking macros make it return EXIT_FAILURE at the
     * first failing CUDA / cuRAND call.
     */
    int main()
    {
        const int N = 217 * 123;
        const size_t resultBytes = N * sizeof(float);

        // --- Allocate space for results on host
        float *hostResults = (float *)malloc(resultBytes);
        if (hostResults == NULL) {
            printf("Error at %s:%d\n", __FILE__, __LINE__);
            return EXIT_FAILURE;
        }

        // --- Allocate and initialize space for results on device
        float *devResults;
        gpuErrchk(cudaMalloc(&devResults, resultBytes));
        gpuErrchk(cudaMemset(devResults, 0, resultBytes));

        // --- Setup the pseudorandom number generator: one MTGP32 state per block
        curandStateMtgp32 *devMTGPStates;
        gpuErrchk(cudaMalloc(&devMTGPStates, GRIDSIZE * sizeof(curandStateMtgp32)));
        mtgp32_kernel_params *devKernelParams;
        gpuErrchk(cudaMalloc(&devKernelParams, sizeof(mtgp32_kernel_params)));
        CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));
        // For reproducible runs replace the time-based seed below by a constant, e.g. 1234.
        CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, GRIDSIZE,
                                                (unsigned long long)time(NULL)));

        // --- Generate pseudo-random sequence and copy to the host
        generate_kernel<<<GRIDSIZE, BLOCKSIZE>>>(devMTGPStates, devResults, N);
        gpuErrchk(cudaPeekAtLastError());       // launch-configuration errors
        gpuErrchk(cudaDeviceSynchronize());     // errors raised while the kernel ran
        gpuErrchk(cudaMemcpy(hostResults, devResults, resultBytes, cudaMemcpyDeviceToHost));

        // --- Print results (only the first few; never more than N)
        const int numToPrint = (N < 10) ? N : 10;
        for (int i = 0; i < numToPrint; i++) {
            printf("%f\n", hostResults[i]);
        }

        // --- Cleanup
        // The kernel parameter table is released only here, after the kernel has
        // finished, because it was handed to the generator states during setup.
        gpuErrchk(cudaFree(devMTGPStates));
        gpuErrchk(cudaFree(devKernelParams));
        gpuErrchk(cudaFree(devResults));
        free(hostResults);

        return 0;
    }
    
    /---使用cuRAND的Mersenne捻线器生成随机数
    #包括
    #包括
    #包括
    #包括
    #包括
    /*包括MTGP主机帮助程序函数*/
    #包括
    #定义块大小256
    #定义网格大小64
    /*******************/
    /*GPU错误检查*/
    /*******************/
    #定义gpuerchk(x)do{if((x)!=cudaSuccess){\
    printf(“在%s处出现错误:%d\n”,\uuuuu文件\uuuuu,\uuuu行\uuuuu)\
    返回EXIT_FAILURE;}}while(0)
    #定义CURAND_CALL(x)do{if((x)!=CURAND_STATUS\u SUCCESS){\
    printf(“在%s处出现错误:%d\n”,\uuuuu文件\uuuuu,\uuuu行\uuuuu)\
    返回EXIT_FAILURE;}}while(0)
    /*******************/
    /*iDivUp函数*/
    /*******************/
    __主机设备iDivUp(INTA,INTB){返回((a%b)!=0)?(a/b+1):(a/b);}
    /*********************/
    /*生成内核*/
    /*********************/
    __全局\uuuuu无效生成\u内核(curandStateMtgp32*\uuuuu限制\uuuuuu状态,浮点*\uuuu限制\uuuuuuuuu结果,常量int N)
    {
    int tid=threadIdx.x+blockIdx.x*blockDim.x;
    对于(int k=tid;k(devMTGPStates,devResults,N);
    gpuerchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuerchk(cudaMemcpy(hostResults,devResults,N*sizeof(float),cudaMemcpyDeviceToHost));
    //---打印结果
    //对于(int i=0;i// --- Generate random numbers with cuRAND's Mersenne Twister
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    
    #include <cuda.h>
    #include <curand_kernel.h>
    /* include MTGP host helper functions */
    #include <curand_mtgp32_host.h>
    
    #define BLOCKSIZE   256
    #define GRIDSIZE    64
    
    /*******************/
    /* GPU ERROR CHECK */
    /*******************/
    /* Evaluates the CUDA runtime call x exactly once. On failure it reports the
       source location together with the runtime's description of the error on
       stderr and makes the ENCLOSING function return EXIT_FAILURE, so the macro
       is only usable inside functions returning int. */
    #define gpuErrchk(x) do { \
        const cudaError_t gpuErrchkStatus_ = (x); \
        if (gpuErrchkStatus_ != cudaSuccess) { \
            fprintf(stderr, "Error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(gpuErrchkStatus_)); \
            return EXIT_FAILURE;}} while(0)
    
    /* Evaluates the cuRAND call x exactly once. On failure it reports the source
       location and the numeric curandStatus_t value on stderr and makes the
       ENCLOSING function return EXIT_FAILURE, so the macro is only usable inside
       functions returning int. */
    #define CURAND_CALL(x) do { \
        const curandStatus_t curandCallStatus_ = (x); \
        if (curandCallStatus_ != CURAND_STATUS_SUCCESS) { \
            fprintf(stderr, "Error at %s:%d: cuRAND status %d\n", __FILE__, __LINE__, \
                    (int)curandCallStatus_); \
            return EXIT_FAILURE;}} while(0)
    
    /*******************/
    /* iDivUp FUNCTION */
    /*******************/
    /* Divides a by b and adds one to the quotient whenever the division leaves a
       non-zero remainder. b must be non-zero. */
    __host__ __device__ int iDivUp(int a, int b)
    {
        const int quotient = a / b;
        if (a % b == 0)
            return quotient;
        return quotient + 1;
    }
    
    /*********************/
    /* GENERATION KERNEL */
    /*********************/
    /*
     * Fills result[0 .. N) with uniform floats drawn from per-block MTGP32 states.
     *
     * Launch layout: 1D grid of 1D blocks. Block b draws from state[b], so `state`
     * must hold at least gridDim.x initialized generators. Per the cuRAND MTGP32
     * rules a state may be used by one block only and by at most 256 threads.
     *
     * The MTGP32 generation functions are block-cooperative: the state is shared
     * by the whole block and every call is expected to be made by all threads of
     * the block together. The loop therefore runs on a block-uniform bound, so
     * every thread of a block calls curand_uniform() the same number of times;
     * only the store is guarded per thread. A per-thread loop bound would leave
     * part of a block out of the call in the last pass whenever N is not a
     * multiple of gridDim.x * blockDim.x.
     */
    __global__ void generate_kernel(curandStateMtgp32 * __restrict__ state, float * __restrict__ result, const int N)
    {
        // Total number of threads in the grid: distance between two passes.
        const int stride = blockDim.x * gridDim.x;

        // `base` is the index handled by thread 0 of this block; it is identical
        // for all threads of the block, hence so is the trip count.
        for (int base = blockIdx.x * blockDim.x; base < N; base += stride) {
            const float sample = curand_uniform(&state[blockIdx.x]);

            const int k = base + threadIdx.x;
            if (k < N)          // threads past the tail draw a sample but discard it
                result[k] = sample;
        }
    }
    
    /********/
    /* MAIN */
    /********/
    /*
     * Demo driver: fills a device buffer of N floats with uniform random numbers
     * produced by GRIDSIZE MTGP32 generators, copies the buffer back to the host
     * and prints the first few values.
     *
     * Returns 0 on success; the checking macros make it return EXIT_FAILURE at the
     * first failing CUDA / cuRAND call.
     */
    int main()
    {
        const int N = 217 * 123;
        const size_t resultBytes = N * sizeof(float);

        // --- Allocate space for results on host
        float *hostResults = (float *)malloc(resultBytes);
        if (hostResults == NULL) {
            printf("Error at %s:%d\n", __FILE__, __LINE__);
            return EXIT_FAILURE;
        }

        // --- Allocate and initialize space for results on device
        float *devResults;
        gpuErrchk(cudaMalloc(&devResults, resultBytes));
        gpuErrchk(cudaMemset(devResults, 0, resultBytes));

        // --- Setup the pseudorandom number generator: one MTGP32 state per block
        curandStateMtgp32 *devMTGPStates;
        gpuErrchk(cudaMalloc(&devMTGPStates, GRIDSIZE * sizeof(curandStateMtgp32)));
        mtgp32_kernel_params *devKernelParams;
        gpuErrchk(cudaMalloc(&devKernelParams, sizeof(mtgp32_kernel_params)));
        CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));
        // For reproducible runs replace the time-based seed below by a constant, e.g. 1234.
        CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, GRIDSIZE,
                                                (unsigned long long)time(NULL)));

        // --- Generate pseudo-random sequence and copy to the host
        generate_kernel<<<GRIDSIZE, BLOCKSIZE>>>(devMTGPStates, devResults, N);
        gpuErrchk(cudaPeekAtLastError());       // launch-configuration errors
        gpuErrchk(cudaDeviceSynchronize());     // errors raised while the kernel ran
        gpuErrchk(cudaMemcpy(hostResults, devResults, resultBytes, cudaMemcpyDeviceToHost));

        // --- Print results (only the first few; never more than N)
        const int numToPrint = (N < 10) ? N : 10;
        for (int i = 0; i < numToPrint; i++) {
            printf("%f\n", hostResults[i]);
        }

        // --- Cleanup
        // The kernel parameter table is released only here, after the kernel has
        // finished, because it was handed to the generator states during setup.
        gpuErrchk(cudaFree(devMTGPStates));
        gpuErrchk(cudaFree(devKernelParams));
        gpuErrchk(cudaFree(devResults));
        free(hostResults);

        return 0;
    }