CUDA 内核:从 for 循环改为多线程

标签:cuda、interpolation。摘要:我目前正在研究网格插值,在多线程方面遇到了一些问题。代码读取由二维矩阵表示的地图,然后对其进行插值,将点数增加到原来的 100 倍。在内核中使用 for 循环时,效果非常好;但当我尝试用多线程替换 for 循环时,结果矩阵中填充的不是插值得到的数值,而是 -1。下面给出了在内核中使用 for 循环、能够正常工作的代码。

我目前正在研究网格插值,在多线程方面遇到了一些问题。代码读取由二维矩阵表示的地图(本例中为 41x41),然后对其进行插值,将点数增加到原来的 100 倍。在内核中使用 for 循环时,效果非常好。

插值前:

插值后:

当我尝试用多线程替换 for 循环时,结果变得很奇怪:结果矩阵中填充的不是插值得到的数值,而是 -1。

下面是在内核中使用 for 循环、能够正常工作的代码:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <fstream>
#include "cuda.h"

// Brings std names into scope; main uses ifstream, ofstream, ios_base and endl unqualified
using namespace std;

// Host-side storage for the input map, indexed Z[row][column].
// main fills it from "Map.txt" and copies it to device memory with cudaMemcpy2D.
float Z[41][41];

// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// file, line and CUDA error string to stderr and terminates the process with
// EXIT_FAILURE. Usable as a single statement (do { } while (0) wrapper).
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    /* (call) is parenthesised so any expression is accepted; the */  \
    /* suffixed name avoids capturing a caller variable named err */  \
    const cudaError_t safeCallErr_ = (call);                          \
    if (cudaSuccess != safeCallErr_) {                                \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__,                                  \
                 cudaGetErrorString(safeCallErr_) );                  \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches.
// Place it directly after a kernel<<<...>>>() statement. It first reads the
// last recorded error (launch/configuration problems), then blocks until the
// device is idle so that failures raised while the kernel runs are reported
// as well. Any error is printed to stderr with file and line, and the process
// exits with EXIT_FAILURE.
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t launchErr_ = cudaGetLastError();                      \
    if (cudaSuccess != launchErr_) {                                  \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(launchErr_) );\
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF). */        \
    /* cudaDeviceSynchronize replaces the deprecated */               \
    /* cudaThreadSynchronize; both wait for the device to finish. */  \
    launchErr_ = cudaDeviceSynchronize();                             \
    if (cudaSuccess != launchErr_) {                                  \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(launchErr_) );\
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// 2D texture reference over float elements, returned as stored (cudaReadModeElementType).
// main binds the pitched device copy of the map to it and enables linear filtering;
// kernel samples it with tex2D.
// NOTE(review): texture references are a legacy API in newer CUDA toolkits (texture
// objects are the replacement) - confirm the toolkit version this must build with.
texture<float, 2, cudaReadModeElementType> tex;


/*
 * Resamples the 2D texture `tex` onto a finer regular grid and stores the
 * result row by row in `f`.
 *
 *   m, n  extents of the source map along the texture x and y axes
 *   f     device output buffer holding at least (m*k) * (n*k) floats, where
 *         k = (int)sqrt(numberOfInterpolationsPerSquare)
 *   numberOfInterpolationsPerSquare
 *         number of output samples per source cell; only the integer part of
 *         its square root is used
 *
 * Launch layout: any 1D or 2D grid/block shape is valid. Each thread starts at
 * its global (x, y) index and advances by the total number of launched threads
 * along each axis, so every output sample is written exactly once. A
 * <<<1,1>>> launch therefore still walks the whole grid serially.
 */
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare) 
{
    // Samples per source cell along one axis (fractional part dropped)
    const int k = (int)sqrtf(numberOfInterpolationsPerSquare);
    const float kf = (float)k;

    // Extent of the output grid
    const int width  = m * k;
    const int height = n * k;

    // First sample handled by this thread and the step between its samples
    const int colStart  = blockIdx.x * blockDim.x + threadIdx.x;
    const int rowStart  = blockIdx.y * blockDim.y + threadIdx.y;
    const int colStride = gridDim.x * blockDim.x;
    const int rowStride = gridDim.y * blockDim.y;

    for (int row = rowStart; row < height; row += rowStride)
    {
        for (int col = colStart; col < width; col += colStride)
        {
            // +0.5f addresses texel centres in unnormalized coordinates;
            // integer indices keep the output offset exact for large grids
            f[(size_t)row * width + col] = tex2D (tex, col / kf + 0.5f, row / kf + 0.5f);
        }
    }
}

/*
 * Reads the map from "Map.txt" into Z, uploads it to pitched device memory,
 * binds it to the texture `tex` with linear filtering, runs `kernel` to
 * resample it, and writes the resampled grid to "Bilinear.txt" (one output
 * row per line, values separated by spaces).
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE when a file cannot be
 * opened or read, host memory cannot be allocated, or the texture binding
 * reports a non-zero offset. CUDA failures terminate the process through
 * CUDA_SAFE_CALL / CHECK_LAUNCH_ERROR.
 */
int main (void)
{
    // Start timer
    const clock_t tStart = clock();

    // Size of map, taken from Z itself: n rows, m columns (Z[row][column])
    const int n = (int)(sizeof(Z) / sizeof(Z[0]));
    const int m = (int)(sizeof(Z[0]) / sizeof(Z[0][0]));

    const float numberOfInterpolationsPerSquare = 100;

    // Samples per map cell along one axis, truncated the same way the kernel does it
    const int k = (int)sqrtf(numberOfInterpolationsPerSquare);

    // Output grid written by the kernel: (n*k) rows of (m*k) values
    const int outRows = n * k;
    const int outCols = m * k;
    const size_t numberOfElements = (size_t)outRows * outCols;
    const size_t outBytes = numberOfElements * sizeof(float);

    size_t pitch, tex_ofs;
    float *f = 0;
    float *r = 0;
    float *map_d = 0;

    // Open the input map and the output file (read by MATLAB)
    ifstream mapFile("Map.txt", ios_base::in);
    if (!mapFile) {
        fprintf (stderr, "Could not open Map.txt for reading.\n");
        return EXIT_FAILURE;
    }
    ofstream outFile("Bilinear.txt");
    if (!outFile) {
        fprintf (stderr, "Could not open Bilinear.txt for writing.\n");
        return EXIT_FAILURE;
    }

    // Store the map in a 2D array
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < m; j++)
        {
            if (!(mapFile >> Z[i][j])) {
                fprintf (stderr, "Map.txt does not contain %d x %d values.\n", n, m);
                return EXIT_FAILURE;
            }
        }
    }

    // Allocate memory on host and device.
    // A row of Z holds m floats and there are n rows, so the width is m and the height is n.
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d, &pitch, m*sizeof(*map_d), n));
    CUDA_SAFE_CALL(cudaMalloc((void**)&f, outBytes));
    r = (float*)malloc(outBytes);
    if (r == NULL) {
        fprintf (stderr, "Could not allocate %zu bytes of host memory.\n", outBytes);
        return EXIT_FAILURE;
    }

    // Copy map from host to device (source pitch is the byte size of one row of Z)
    CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, sizeof(Z[0]), m*sizeof(Z[0][0]), n, cudaMemcpyHostToDevice));

    // Set texture mode to bilinear interpolation
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;

    // Bind the map to texture: width m (x axis), height n (y axis), matching how the kernel samples it
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, m, n, pitch));

    // Checking for offset
    if (tex_ofs != 0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }

    // Launch Kernel with a single thread; the kernel walks the whole output grid itself.
    // CHECK_LAUNCH_ERROR also waits for the kernel to finish, so no extra synchronize is needed.
    kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
    CHECK_LAUNCH_ERROR();

    // Copy result from device to host
    CUDA_SAFE_CALL (cudaMemcpy(r, f, outBytes, cudaMemcpyDeviceToHost));

    // Write results to file: a line break between rows, a space after every value
    for (int row = 0; row < outRows; row++)
    {
        if (row > 0)
        {
            outFile << endl;
        }
        const float *line = r + (size_t)row * outCols;
        for (int col = 0; col < outCols; col++)
        {
            outFile << line[col] << " ";
        }
    }

    // Free memory
    CUDA_SAFE_CALL (cudaUnbindTexture (tex));
    CUDA_SAFE_CALL (cudaFree (map_d));
    CUDA_SAFE_CALL (cudaFree (f));
    free( r );

    // Print out execution time
    printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

    return EXIT_SUCCESS;
}
有人知道为什么多线程版本不能正常工作吗?

问候


在第二个内核中,`i` 和 `j` 的类型是 `int` 而不是 `float`,因此 `tex2D` 调用中的 `j/k` 和 `i/k` 会执行整数除法。将 `k` 声明为 `float` 即可避免整数除法。

最初,内核是通过以下配置启动的:

//Find number of blocks 
// NOTE: nthreads (1024) is only used to size the grid; the launch below uses
// blocksize (512) threads per block, so the two values do not agree
int nthreads = 1024;
int blocksize = 512; 
// Total number of output samples divided by nthreads, rounded up by ceil()
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads); 

// Launch Kernel 
// Both launch arguments are plain ints, i.e. a 1D grid of 1D blocks
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);
// 查找块数
int nthreads = 1024;
int blocksize = 512;
int nblocks = ceil((n*m*numberOfInterpolationsPerSquare) / nthreads);
// 启动内核
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);
上述代码的问题在于,它将启动一个由1D块组成的1D网格,但在内核内部,使用了2D索引。内核需要2D网格/块配置才能正常工作。从内核代码的外观来看,以下网格/块配置应该可以工作:

// Output samples per map cell along one axis
const float k = sqrt(numberOfInterpolationsPerSquare);

// Number of threads requested along each axis of the launch
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);

// 16 x 16 threads per block
const dim3 dimBlock(16, 16);

// Ceiling division: enough blocks to cover threads_x / threads_y even when
// they are not multiples of the block extent
const dim3 dimGrid((threads_x + dimBlock.x - 1) / dimBlock.x,
                   (threads_y + dimBlock.y - 1) / dimBlock.y);

kernel<<<dimGrid, dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);
float k = sqrt(numberOfInterpolationsPerSquare);
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);
const dim3 dimBlock(16,16);
dim3 dimGrid;
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);

如果有人想尝试这段代码,下面是地图:

评论:

- 你是如何启动多线程内核的?另外,在第二个内核中,`i` 和 `j` 是 `int` 而不是 `float`,因此 `tex2D` 中的 `j/k` 和 `i/k` 会执行整数除法。可以考虑将 `k` 声明为 `float`。
- 谢谢你的回复!我把 `i` 和 `j` 改成了 `float`,现在能得到一些数值了,但仍然有很多 -1。内核的启动方式如下:`//Find number of blocks int nthreads = 1024; int blocksize = 512; int nblocks = ceil((n*m*numberOfInterpolationsPerSquare) / nthreads); // Launch Kernel kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);`
- 你启动的是由一维线程块组成的一维网格,而内核中使用的是二维索引。应当启动二维网格,并且线程总数至少为 `(m*sqrt(numberOfInterpolationsPerSquare)) x (n*sqrt(numberOfInterpolationsPerSquare))`,这样才有意义!
- 现在我把内核启动改成了:`int blockSize = 512; int gridSize = n*sqrt(numberOfInterpolationsPerSquare); dim3 dimGrid(gridSize, gridSize); dim3 dimBlock(blockSize, blockSize); kernel<<<dimGrid, dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);`,结果得到一个错误:"invalid configuration argument"(配置参数无效)。
// Output samples per map cell along one axis
const float k = sqrt(numberOfInterpolationsPerSquare);

// Number of threads requested along each axis of the launch
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);

// 16 x 16 threads per block
const dim3 dimBlock(16, 16);

// Ceiling division: enough blocks to cover threads_x / threads_y even when
// they are not multiples of the block extent
const dim3 dimGrid((threads_x + dimBlock.x - 1) / dimBlock.x,
                   (threads_y + dimBlock.y - 1) / dimBlock.y);

kernel<<<dimGrid, dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);