Cuda 内核中从for循环到多线程的变化_Cuda_Interpolation

Cuda 内核中从for循环到多线程的变化

cuda

Cuda 内核中从for循环到多线程的变化,cuda,interpolation,Cuda,Interpolation,我目前正在研究网格插值，在多线程方面遇到了一些问题。这段代码应当读取一张以二维矩阵表示的地图，然后对其进行插值，把点数增加到原来的100倍。在内核中使用for循环时，效果非常好。插值前：插值后：当我尝试用多线程代替for循环时，却得到了奇怪的结果：结果矩阵里不是插值后的数值，而是被-1填满。下面是我在内核中使用for循环的、可以正常工作的代码： #include <stdlib.h> #include <stdio.h> #include <math.h> #include <fstream> …

我目前正在研究网格插值，在多线程方面遇到了一些问题。这段代码应当读取一张以二维矩阵（本例为 41×41）表示的地图，然后对其进行插值，把点数增加到原来的 100 倍。在内核中使用 for 循环时，效果非常好。

插值前：

插值后：

当我尝试用多线程代替 for 循环时，却得到了奇怪的结果：结果矩阵里不是插值后的数值，而是被 -1 填满。

下面是我在内核中使用for循环的工作代码

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <fstream>
#include "cuda.h"

using namespace std;

// Host-side storage for the input map; main() fills it from Map.txt,
// indexing it as Z[row][column].
// The 41x41 extent is hard-coded and matches the n and m values set in main().
float Z[41][41];

// Wraps a CUDA runtime call and aborts the program if it fails.
// The wrapped expression is evaluated exactly once. When its result is not
// cudaSuccess, the source file, line number and CUDA error string are
// printed to stderr and the process exits with EXIT_FAILURE.
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    const cudaError_t status = call;                                  \
    if (status == cudaSuccess)                                        \
        break; /* success: leave the do/while(0) wrapper */           \
    fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
             __FILE__, __LINE__, cudaGetErrorString(status) );        \
    exit(EXIT_FAILURE);                                               \
} while (0)

// Macro to catch CUDA errors in kernel launches.
// Place it directly after a kernel launch. It performs two checks:
//   1. cudaGetLastError()      - errors reported at launch time
//   2. cudaDeviceSynchronize() - blocks until the device is idle and
//                                reports errors raised while the kernel ran
// On either failure the source file, line number and CUDA error string are
// printed to stderr and the process exits with EXIT_FAILURE.
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF). */        \
    /* cudaDeviceSynchronize() replaces the deprecated */             \
    /* cudaThreadSynchronize(). */                                    \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// 2D texture reference over float texels, read without type conversion
// (cudaReadModeElementType). main() binds it to the map and selects linear
// filtering; kernel() samples it with tex2D.
// NOTE(review): texture references are a legacy API (removed in CUDA 12).
// Moving to texture objects would have to change kernel() and main() together.
texture<float, 2, cudaReadModeElementType> tex;


// Resamples the bound 2D texture `tex` onto a finer regular grid and stores
// the samples row by row in f. The interpolation itself is done by the
// texture unit, using whatever filter mode the host configured on `tex`.
//
// Parameters:
//   m  - map extent along texture x (values per map row)
//   n  - map extent along texture y (number of map rows)
//   f  - device output buffer, row-major, holding at least (m*k)*(n*k) floats
//   numberOfInterpolationsPerSquare
//      - samples per map cell; k is the integer part of its square root and
//        is the number of samples per cell along each axis
//
// Launch layout: any 1D or 2D grid/block shape is valid. Each thread starts
// at its global (x, y) index and advances by the total number of threads
// along that axis (grid-stride loops), so <<<1,1>>> walks the whole output
// exactly like the former nested for-loops, while larger launches split the
// work without needing a separate bounds check.
// Uses no shared memory. `tex` must be bound before the launch.
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare) 
{
    // sqrtf keeps the computation in single precision; truncation to int is
    // intentional and matches the original behaviour.
    const int k = (int)sqrtf(numberOfInterpolationsPerSquare);

    const int outWidth  = m * k;   // samples per output row
    const int outHeight = n * k;   // number of output rows

    // Integer indices stay exact for any size; float counters stop being
    // exact integers above 2^24.
    const int firstCol  = blockIdx.x * blockDim.x + threadIdx.x;
    const int firstRow  = blockIdx.y * blockDim.y + threadIdx.y;
    const int colStride = gridDim.x * blockDim.x;
    const int rowStride = gridDim.y * blockDim.y;

    for (int row = firstRow; row < outHeight; row += rowStride)
    {
        for (int col = firstCol; col < outWidth; col += colStride)
        {
            // Convert to float before dividing so the fractional position
            // inside the map cell is kept; +0.5f addresses the texel centre.
            const float x = (float)col / k + 0.5f;
            const float y = (float)row / k + 0.5f;
            f[(size_t)row * outWidth + col] = tex2D (tex, x, y);
        }
    }
}

// Reads the n x m map from Map.txt, upsamples it on the GPU through the
// linearly filtered texture `tex`, and writes the result to Bilinear.txt as
// space-separated values, one output row per line.
// Returns EXIT_SUCCESS on success and EXIT_FAILURE when a file cannot be
// opened, read or written, when host memory cannot be allocated, or when the
// texture binding reports a non-zero offset. CUDA failures terminate the
// process through CUDA_SAFE_CALL / CHECK_LAUNCH_ERROR.
int main (void)
{
    // Start timer
    const clock_t tStart = clock();

    // Size of map: Z holds n rows of m values each (both must fit inside Z)
    const int n = 41;
    const int m = 41;

    const float numberOfInterpolationsPerSquare = 100;

    // Samples per map cell along one axis. Truncated to int the same way the
    // kernel does it, so the buffers below always match what the kernel writes.
    const int k = (int)sqrtf(numberOfInterpolationsPerSquare);
    const size_t outWidth         = (size_t)m * k;   // values per output row
    const size_t outHeight        = (size_t)n * k;   // number of output rows
    const size_t numberOfElements = outWidth * outHeight;
    const size_t outputBytes      = numberOfElements * sizeof(float);

    size_t pitch = 0, tex_ofs = 0;
    float *f = 0;
    float *map_d = 0;

    // Open the input map and the output file (read by MATLAB afterwards)
    ifstream mapFile("Map.txt", ios_base::in);
    if (!mapFile.is_open()) {
        fprintf (stderr, "Could not open Map.txt for reading.\n");
        return EXIT_FAILURE;
    }
    ofstream file("Bilinear.txt");
    if (!file.is_open()) {
        fprintf (stderr, "Could not open Bilinear.txt for writing.\n");
        return EXIT_FAILURE;
    }

    // Store the map in a 2D array
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<m; j++)
        {
            if (!(mapFile >> Z[i][j])) {
                fprintf (stderr, "Map.txt: could not read value at row %d, column %d.\n", i, j);
                return EXIT_FAILURE;
            }
        }
    }
    mapFile.close();

    // Allocate memory on host and device
    float *r = (float*)malloc(outputBytes);
    if (r == NULL) {
        fprintf (stderr, "Could not allocate %zu bytes of host memory.\n", outputBytes);
        return EXIT_FAILURE;
    }
    // Width is the row length (m values), height is the row count (n)
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d, &pitch, m*sizeof(*map_d), n));
    CUDA_SAFE_CALL(cudaMalloc((void**)&f, outputBytes));

    // Copy map from host to device; the host row stride is that of Z itself
    CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, sizeof(Z[0]), m*sizeof(Z[0][0]), n, cudaMemcpyHostToDevice));

    // Set texture mode to bilinear interpolation
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;

    // Bind the map to texture (width = m texels, height = n texels)
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, m, n, pitch));

    // Checking for offset
    if (tex_ofs != 0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        // Release everything acquired so far before giving up
        CUDA_SAFE_CALL (cudaUnbindTexture (tex));
        CUDA_SAFE_CALL (cudaFree (map_d));
        CUDA_SAFE_CALL (cudaFree (f));
        free( r );
        return EXIT_FAILURE;
    }

    // Launch Kernel with a single thread: the kernel iterates over the whole
    // output grid itself
    kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());

    // Copy result from device to host
    CUDA_SAFE_CALL (cudaMemcpy(r, f, outputBytes, cudaMemcpyDeviceToHost));

    // Write results to file: each value followed by a space, rows separated
    // by a line break
    for (size_t row = 0; row < outHeight; row++)
    {
        if (row != 0)
        {
            file << endl;
        }
        for (size_t col = 0; col < outWidth; col++)
        {
            file << r[row*outWidth + col] << " ";
        }
    }
    file.close();
    const bool writeOk = !file.fail();

    // Free memory
    CUDA_SAFE_CALL (cudaUnbindTexture (tex));
    CUDA_SAFE_CALL (cudaFree (map_d));
    CUDA_SAFE_CALL (cudaFree (f));
    free( r );

    if (!writeOk) {
        fprintf (stderr, "Error while writing Bilinear.txt.\n");
        return EXIT_FAILURE;
    }

    // Print out execution time
    printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

    return EXIT_SUCCESS;
}

有人知道为什么多线程版本不能正常工作吗？

问候

在第二个内核中，`i` 和 `j` 是 `int` 而不是 `float`，因此 `tex2D` 中的 `j/k` 和 `i/k` 会按整数除法计算，小数部分被截掉。把 `k` 声明为 `float` 即可避免整数除法。

最初，内核是通过以下配置启动的：

// Find number of blocks.
// This is the launch configuration from the question that still produced -1
// values: it builds a 1D grid of 1D blocks.
int nthreads = 1024;
int blocksize = 512; // threads per block actually used at launch; differs from nthreads above
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads); // block count is derived from nthreads (1024), not blocksize, so nblocks*blocksize is only about half the element count

// Launch Kernel.
// NOTE(review): the accompanying text says the threaded kernel uses 2D
// indexing, which a 1D launch like this cannot cover; that kernel is not shown here.
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);

//查找块数
int nthreads = 1024;
int blocksize = 512;
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads);
//启动内核
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);

上述代码的问题在于，它将启动一个由1D块组成的1D网格，但在内核内部，使用了2D索引。内核需要2D网格/块配置才能正常工作。从内核代码的外观来看，以下网格/块配置应该可以工作：

// Number of interpolated samples per map cell along one axis
float k = sqrt(numberOfInterpolationsPerSquare);

// Total number of threads wanted along each grid axis
// NOTE(review): the for-loop kernel treats m*k as the row length (x) and n*k
// as the row count (y); with n == m the order is irrelevant, but confirm it
// against the threaded kernel's indexing.
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);

// 16 x 16 = 256 threads per 2D block
const dim3 dimBlock(16,16);

// Ceiling division: enough blocks to cover every sample even when the sample
// count is not a multiple of the block size
dim3 dimGrid;
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;

// The rounded-up grid can contain threads beyond threads_x / threads_y.
// NOTE(review): the kernel presumably needs a bounds check for those threads
// (the threaded kernel is not shown here).
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);

float k = sqrt(numberOfInterpolationsPerSquare);
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);
const dim3 dimBlock(16,16);
dim3 dimGrid;
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);

如果有人想尝试这段代码，下面是地图：

如何启动多线程内核？另外，在第二个内核中，`i` 和 `j` 是 `int` 而不是 `float`，因此 `tex2D` 中的 `j/k` 和 `i/k` 会按整数除法计算。可以考虑把 `k` 声明为 `float`。

谢谢您的回复！我把 `i` 和 `j` 改成了 `float`，现在能得到一些数值，但仍然有很多 -1。内核的调用方式如下：`//Find number of blocks int nthreads = 1024; int blocksize = 512; int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads); // Launch Kernel kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);`

您启动的是由一维线程块组成的一维网格，而内核中使用的是二维索引。请启动一个二维网格，使线程总数至少等于 `(m*sqrt(numberOfInterpolationsPerSquare)) x (n*sqrt(numberOfInterpolationsPerSquare))`。

有道理！现在我把内核启动改为：`int blockSize = 512; int gridSize = n*sqrt(numberOfInterpolationsPerSquare); dim3 dimGrid(gridSize, gridSize); dim3 dimBlock(blockSize, blockSize); kernel<<<dimGrid, dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);` 现在我得到一个错误，提示“配置参数无效”（invalid configuration argument）。（注：每个线程块最多只能有 1024 个线程，512×512 的线程块超出了这一上限，因此启动失败。）

// Number of interpolated samples per map cell along one axis
float k = sqrt(numberOfInterpolationsPerSquare);

// Total number of threads wanted along each grid axis
// NOTE(review): the for-loop kernel treats m*k as the row length (x) and n*k
// as the row count (y); with n == m the order is irrelevant, but confirm it
// against the threaded kernel's indexing.
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);

// 16 x 16 = 256 threads per 2D block, within the per-block thread limit that
// a 512 x 512 block exceeds
const dim3 dimBlock(16,16);

// Ceiling division: enough blocks to cover every sample even when the sample
// count is not a multiple of the block size
dim3 dimGrid;
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;

// The rounded-up grid can contain threads beyond threads_x / threads_y.
// NOTE(review): the kernel presumably needs a bounds check for those threads
// (the threaded kernel is not shown here).
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);