CUDA: transfers between the CPU and the GPU


I have this:

    cudaMemcpy(gpu_output, d_output, kMemSize, cudaMemcpyDeviceToHost);
    cudaMemcpy( d_input, gpu_output, kMemSize, cudaMemcpyHostToDevice);
Supposedly I have to avoid these memcpys by making the input pointer point at the output. How can I do that?

Here is the full code:

__global__ void medianFilter1D_col(
    unsigned char *d_output, 
    unsigned char *d_input)
{
    int col, row;
    unsigned char temp;
    int idx, idx_south, idx_north, idx_west, idx_east, idx_north_west, idx_north_east, idx_south_east, idx_south_west;
    int numcols = WIDTH + 2;

    row = blockIdx.x * blockDim.x + threadIdx.x + 1;
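    // Each thread processes one interior row of the image; the +1 skips the halo row 0.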

    for (col = 1; col <= WIDTH; ++col)
    {
        unsigned char neighborhood[9];
        idx = row * numcols + col;

        idx_south = (row - 1) * numcols + col;
        idx_north = (row + 1) * numcols + col;
        idx_west = row * numcols + (col - 1);
        idx_east = row * numcols + (col + 1);
        idx_north_east = (row + 1) * numcols + (col + 1);
        idx_north_west = (row + 1) * numcols + (col - 1);
        idx_south_east = (row - 1) * numcols + (col + 1);
        idx_south_west = (row - 1) * numcols + (col - 1);

        neighborhood[0]= d_input[ idx_south_west ];
        neighborhood[1]= d_input[ idx_south ];
        neighborhood[2]= d_input[ idx_south_east ];
        neighborhood[3]= d_input[ idx_west ];
        neighborhood[4]= d_input[ idx ];
        neighborhood[5]= d_input[ idx_east ];
        neighborhood[6]= d_input[ idx_north_west ];
        neighborhood[7]= d_input[ idx_north ];
        neighborhood[8]= d_input[ idx_north_east ];
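        // Partial selection sort below: five passes are enough to leave the
        // median of the nine pixels in neighborhood[4].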

        for (unsigned int j=0; j<5; ++j)
        {
            int min=j;
            for (unsigned int i=j+1; i<9; ++i)
                if (neighborhood[i] < neighborhood[min])
                    min=i;

            temp=neighborhood[j];
            neighborhood[j]=neighborhood[min];
            neighborhood[min]=temp;
        }

        d_output[idx] = neighborhood[4];
    }
}

int main(int argc, char *argv[])
{
    int x, y;
    int i;
    int errors;

    double start_time_inc_data, end_time_inc_data;
    double cpu_start_time, cpu_end_time;

    unsigned char *d_input, *d_output, *d_edge, *tmp;

    unsigned char *input_image;
    unsigned char *output_image;
    int rows;
    int cols;

    // Allocate memory on the host to hold the image
    input_image = (unsigned char*)calloc(((HEIGHT * WIDTH) * 1), sizeof(unsigned char));
    // Read the image
    BMP Image;
    Image.ReadFromFile("lena_1024_noise.bmp");
    for( int i=0 ; i < Image.TellHeight() ; i++)
        for( int j=0 ; j < Image.TellWidth() ; j++)
            input_image[i*WIDTH+j]=Image(i,j)->Red;
    // Zero-initialize the CPU array to make sure the halo
    // has valid values
    for (y = 0; y < HEIGHT + 2; y++)
        for (x = 0; x < WIDTH + 2; x++)
            host_input[y][x] = 0;
    // Copy the image into the CPU array, inside the halo
    for (y = 0; y < HEIGHT; y++)
        for (x = 0; x < WIDTH; x++)
            host_input[y + 1][x + 1] = input_image[y*WIDTH + x];

    // Compute the memory needed to hold the image plus the halo
    // in GPU memory.
    const int kMemSize = (WIDTH+2) * (HEIGHT+2) * sizeof(unsigned char);

    // Allocate memory on the GPU
    cudaMalloc(&d_input, kMemSize);
    cudaMalloc(&d_output, kMemSize);

    // Copy all the arrays to GPU memory.
    // These transfers are counted in the measured execution time.
    start_time_inc_data = get_current_time();

    cudaMemcpy( d_input, host_input, kMemSize, cudaMemcpyHostToDevice);
    cudaMemcpy( d_output, host_input, kMemSize, cudaMemcpyHostToDevice);

    // Apply the median filter for a fixed number of iterations.
    for (i = 0; i < ITERATIONS; ++i) 
    {
        // Run the 1D kernel by rows
        dim3 blocksPerGrid(GRID_H, 1, 1);
        dim3 threadsPerBlock(BLOCK_H, 1, 1);
        //std::cout << "Grid size: (" << blocksPerGrid.x << ", " << blocksPerGrid.y << ", " << blocksPerGrid.z << ")\n";
        //std::cout << "Block size: (" << threadsPerBlock.x << ", " << threadsPerBlock.y << ", " << threadsPerBlock.z << ")\n";
        medianFilter1D_col<<<blocksPerGrid, threadsPerBlock>>>(d_output, d_input);

        // Run the 1D kernel by columns
        // TODO - compute the block and grid sizes needed to run the kernel correctly
        /*dim3 blocksPerGrid();
        dim3 threadsPerBlock();
        medianFilter1D_row<<<blocksPerGrid, threadsPerBlock>>>(d_output, d_input);*/

        // Run the 2D kernel
        // TODO - compute the block and grid sizes needed to run the kernel correctly
        /*dim3 blocksPerGrid(,);
        dim3 threadsPerBlock(,);
        medianFilter2D<<< blocksPerGrid, threadsPerBlock >>>(d_output, d_input);*/

        cudaThreadSynchronize();
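        // Note: cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the current equivalent.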

        // Copy the result back to CPU memory
        cudaMemcpy(gpu_output, d_output, kMemSize, cudaMemcpyDeviceToHost);
        // Copy the GPU result back into the input to process the next iteration
        cudaMemcpy( d_input, gpu_output, kMemSize, cudaMemcpyHostToDevice);

        // TODO: These memory copies can be avoided: comment out the transfers
        // above and swap the d_input and d_output pointers so that the output
        // of this iteration becomes the input of the next iteration of the
        // median filter.
    }
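    // At this point d_input holds the final result, because each iteration copied d_output back into it.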

    cudaMemcpy(gpu_output, d_input, kMemSize, cudaMemcpyDeviceToHost);
    end_time_inc_data = get_current_time();

    checkCUDAError("CUDA median filter: ");

    cpu_start_time = get_current_time();

    unsigned char temp;
    int idx, idx_south, idx_north, idx_west, idx_east, idx_north_west, idx_north_east, idx_south_east, idx_south_west;
    int numcols = WIDTH + 2;
    unsigned char neighborhood[9];

    for (i = 0; i < ITERATIONS; i++)
    {
        for (y = 0; y < HEIGHT; y++)
        {
            for (x = 0; x < WIDTH; x++) 
            {
                neighborhood[0]= host_input[ y+1 -1 ][ x+1 -1 ];
                neighborhood[1]= host_input[ y+1 -1 ][ x+1 ];
                neighborhood[2]= host_input[ y+1 -1][ x+1 +1 ];
                neighborhood[3]= host_input[ y+1 ][ x+1 -1 ];
                neighborhood[4]= host_input[ y+1 ][ x+1 ];
                neighborhood[5]= host_input[ y+1 ][ x+1 +1 ];
                neighborhood[6]= host_input[ y+1+1 ][ x+1 -1 ];
                neighborhood[7]= host_input[ y+1+1 ][ x+1 ];
                neighborhood[8]= host_input[ y+1+1 ][ x+1 +1];

                int j=0;

                for (j=0; j<5; ++j)
                {
                    // Find the minimum
                    int mini=j;
                    for (int l=j+1; l<9; ++l)
                    {
                            if (neighborhood[l] < neighborhood[mini])
                                    mini=l;
                    }

                    temp=neighborhood[j];
                    neighborhood[j]=neighborhood[mini];
                    neighborhood[mini]=temp;
                }

                host_output[y+1][x+1]=neighborhood[4];
            }
        }

        for (y = 0; y < HEIGHT; y++)
            for (x = 0; x < WIDTH; x++)
                host_input[y+1][x+1] = host_output[y+1][x+1];
    }

    cpu_end_time = get_current_time();


    errors = 0;
    for (y = 0; y < HEIGHT; y++)
    {
        for (x = 0; x < WIDTH; x++)
        {
            if ( host_input[y+1][x+1] != gpu_output[y+1][x+1])
            {
                errors++;
                printf("Error at %d,%d (CPU=%i, GPU=%i)\n", x, y, \
                    host_output[y+1][x+1], \
                    gpu_output[y+1][x+1]);
            }
        }
    }

    if (errors == 0)
        std::cout << "\n\n ***TEST PASSED*** \n\n\n";

    output_image = (unsigned char*)calloc(((WIDTH * HEIGHT) * 1), sizeof(unsigned char));

    for (y = 0; y < HEIGHT; y++)
        for (x = 0; x < WIDTH; x++)
            output_image[y*WIDTH+x] = gpu_output[y+1][x+1];

    cudaFree(d_input);
    cudaFree(d_output);

    printf("GPU execution time (including data transfers): %fs\n", \
        end_time_inc_data - start_time_inc_data);
    printf("CPU execution time                            : %fs\n", \
        cpu_end_time - cpu_start_time);

    for( int i=0 ; i < Image.TellHeight() ; i++)
    {
        for( int j=0 ; j < Image.TellWidth() ; j++)
        {
            Image(i,j)->Red = output_image[i*WIDTH+j];
            Image(i,j)->Green = output_image[i*WIDTH+j];
            Image(i,j)->Blue = output_image[i*WIDTH+j];
        }
    }

    // Write the filtered result to a new file
    Image.WriteToFile("lena_1024_median.bmp");

    std::cout << "Result written to lena_1024_median.bmp\n";

    getchar();
    return 0;
}

#if _WIN32
    void getCurrentTimeStamp(timeStamp& _time)
    {
            QueryPerformanceCounter(&_time);
    }

    timeStamp getCurrentTimeStamp()
    {
            timeStamp tmp;
            QueryPerformanceCounter(&tmp);
            return tmp;
    }

    double getTimeMili()
    {
            timeStamp start;
            timeStamp dwFreq;
            QueryPerformanceFrequency(&dwFreq);
            QueryPerformanceCounter(&start);
            return double(start.QuadPart) / double(dwFreq.QuadPart);
    }
#endif 

double get_current_time()
{
    #if _WIN32 
        return getTimeMili();
    #else
        static int start = 0, startu = 0;
        struct timeval tval;
        double result;

        if (gettimeofday(&tval, NULL) == -1)
            result = -1.0;
        else if(!start) {
            start = tval.tv_sec;
            startu = tval.tv_usec;
            result = 0.0;
        }
        else
            result = (double) (tval.tv_sec - start) + 1.0e-6*(tval.tv_usec - startu);
        return result;
    #endif
}
You probably have something like this:

kernel1<<<...>>>(..., d_output, ...);
cudaMemcpy(gpu_output, d_output, kMemSize, cudaMemcpyDeviceToHost);
cudaMemcpy( d_input, gpu_output, kMemSize, cudaMemcpyHostToDevice);
kernel2<<<...>>>(d_input, ...);
In that case, you can avoid those copy operations by doing:

kernel1<<<...>>>(..., d_output, ...);
kernel2<<<...>>>(d_output, ...);
This is really just making use of pointers in C; there is nothing CUDA-specific about it. You can do the same sort of thing with ordinary C functions and pointers.
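For instance, here is a minimal plain-C sketch of the same ping-pong idea (step(), bufA and bufB are invented purely for illustration):

#include <stdio.h>

// Hypothetical processing step: reads from `in`, writes its result into `out`.
static void step(const int *in, int *out, int n)
{
    for (int i = 0; i < n; ++i)
        out[i] = in[i] + 1;
}

int main(void)
{
    int bufA[4] = {0, 0, 0, 0};
    int bufB[4];
    int *in = bufA, *out = bufB;

    for (int iter = 0; iter < 3; ++iter) {
        step(in, out, 4);
        // Swap the pointers: this iteration's output becomes the next iteration's input.
        int *tmp = in;
        in = out;
        out = tmp;
    }

    // After the swap at the end of the last iteration, `in` points at the newest result.
    printf("%d %d %d %d\n", in[0], in[1], in[2], in[3]);   // prints: 3 3 3 3
    return 0;
}

No data is ever copied between the two buffers; only the pointers change, which is exactly what the device pointers d_input and d_output allow on the GPU side.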

EDIT: now that you have posted the full code, it could look something like this:

unsigned char *d_tmp = d_output;
d_output = d_input;
d_input = d_tmp;
for (i = 0; i < ITERATIONS; ++i) 
{
    // Run the 1D kernel by rows
    dim3 blocksPerGrid(GRID_H, 1, 1);
    dim3 threadsPerBlock(BLOCK_H, 1, 1);
    d_tmp = d_output;
    d_output = d_input;
    d_input = d_tmp;
    medianFilter1D_col<<<blocksPerGrid, threadsPerBlock>>>(d_output, d_input);
    cudaDeviceSynchronize();
}
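Note that with the swap placed before the launch, the last iteration's result ends up in the buffer that d_output points to, while the copy after the loop in the posted main() reads from d_input, which then holds the previous iteration's result. A small variant (just a sketch, assuming the rest of main() stays exactly as posted) swaps after the launch instead, so the existing cudaMemcpy(gpu_output, d_input, kMemSize, cudaMemcpyDeviceToHost) still picks up the final image:

unsigned char *d_tmp;
for (i = 0; i < ITERATIONS; ++i)
{
    // Run the 1D kernel by rows, reading from d_input and writing to d_output as before.
    dim3 blocksPerGrid(GRID_H, 1, 1);
    dim3 threadsPerBlock(BLOCK_H, 1, 1);
    medianFilter1D_col<<<blocksPerGrid, threadsPerBlock>>>(d_output, d_input);
    cudaDeviceSynchronize();

    // Swap after the launch: this iteration's output becomes the next iteration's input,
    // and after the final iteration the newest result is back in d_input.
    d_tmp = d_input;
    d_input = d_output;
    d_output = d_tmp;
}

Either placement of the swap works; what matters is that the buffer you read back after the loop is the one the last kernel launch wrote to.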
I think you need to provide more context and probably show the overall structure of your code. As it stands, I don't understand your question.