Handling matrices in CUDA: understanding the basic concepts


I am building a CUDA kernel to compute the numerical Jacobian of a function via finite differences; in the example I provide, it is the square function (each entry of the vector is squared). The host code allocates in linear memory, while I use 2D indexing inside the kernel.

My problem is that I have not found a way to sum over the diagonal of the matrices I cudaMalloc'ed. My attempt was to use the condition threadIdx.x == blockIdx.x for the diagonal, but it only evaluates to true when both are 0.

Here is the kernel. EDIT: following the suggestions in the comments, I posted the whole code as an answer (the main() is basically the same, while the kernel is different).

So I am trying to handle splitting the function call across blocks. Inside the kernel I used

if (threadIdx.x == blockIdx.x) { ... }

Why is this not correct? I ask because, when debugging and letting the code print the statement, it only evaluates to true when both are 0. As a result du[0] is the only numerical value and the matrix becomes nan. Note that this approach worked for the first code I built, where I launched the kernel with

jacobian_kernel <<< N, N >>> (...)

so that, whenever threadIdx.x == blockIdx.x, the element is on the diagonal. This approach no longer fits, however, because now I need to handle a larger N (possibly larger than 1024, the maximum number of threads per block).

What statement should I put there so that it works even when the matrix is split across blocks and threads?
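
A minimal sketch of one possible global diagonal test, assuming a 1D launch with one thread per entry of J and row-major storage (diagonal_sketch, row and col are illustrative names, not part of my code):

__global__ void diagonal_sketch(float* J, int n)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n * n)
    {
        const int row = tid / n;    // row index, assuming row-major storage of J
        const int col = tid % n;    // column index
        if (row == col)
        {
            // this thread owns the diagonal entry J[tid] == J[row * n + row]
        }
    }
}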


Let me know if I should share any other information.

Here is how I solved the problem, following the suggestions in the comments to the answer. The example is compilable if you put helper_cuda.h and helper_string.h in the same directory, or if you add a -I directive pointing to the CUDA samples' include path. The relevant changes are only in the kernel; there is a minor change in main() too, since I was requesting double the resources to execute the kernel, but the .y axis of the grid of thread blocks was not used at all, so it did not generate any error.
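
For reference, a compile line along these lines illustrates the -I directive (the samples path and the file name jacobian.cu are assumptions that depend on your installation):

nvcc -I/path/to/cuda-samples/common/inc -o jacobian jacobian.cu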

#include <stdio.h> 
#include <stdlib.h> 
#include <iostream>
#include <math.h>
#include <assert.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include "helper_string.h"
#include <fstream>

#ifndef MAX
    #define MAX(a,b) ((a > b) ? a : b)
#endif
#define REAL sizeof(float)
#define N       128
#define BLOCK_SIZE  128
#define NUM_BLOCKS  ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)

template <typename T>
inline void printmatrix( T mat, int rows, int cols);
template <typename T>
__global__ void jacobian_kernel ( const T * A, T * J, const T t0, const T tn, const T h, const T * u0, const T * un, const T * un_old);
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h = 1);

int main ()
{   
    float t0    = 0.; //float tn = 0.;
    float h     = 0.1;

    float* u0 = (float*)malloc(REAL*N); for(int i = 0; i < N; ++i){u0[i] = i+1;}
    float* un = (float*)malloc(REAL*N); memcpy(un, u0, REAL*N);
    float* un_old = (float*)malloc(REAL*N); memcpy(un_old, u0, REAL*N);
    float* J = (float*)malloc(REAL*N*N);
    float* A = (float*)malloc(REAL*N*N); host_heat_matrix(A);
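    // NOTE: host_heat_matrix() (used above) and matvec_dev() (called in the kernel below)
    // are assumed to be defined elsewhere in the full project; they are not shown in this listing.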

    float *d_u0;
    float *d_un;
    float *d_un_old;
    float *d_J;
    float *d_A;

    checkCudaErrors(cudaMalloc((void**)&d_u0,   REAL*N)); //printf("1: %p\n", d_u0);
    checkCudaErrors(cudaMalloc((void**)&d_un,   REAL*N)); //printf("2: %p\n", d_un);
    checkCudaErrors(cudaMalloc((void**)&d_un_old,   REAL*N)); //printf("3: %p\n", d_un_old);
    checkCudaErrors(cudaMalloc((void**)&d_J,    REAL*N*N)); //printf("4: %p\n", d_J);
    checkCudaErrors(cudaMalloc((void**)&d_A,    REAL*N*N)); //printf("4: %p\n", d_J);
    checkCudaErrors(cudaMemcpy(d_u0, u0,        REAL*N, cudaMemcpyHostToDevice)); assert(d_u0 != NULL);
    checkCudaErrors(cudaMemcpy(d_un, un,        REAL*N, cudaMemcpyHostToDevice)); assert(d_un != NULL);
    checkCudaErrors(cudaMemcpy(d_un_old, un_old,    REAL*N, cudaMemcpyHostToDevice)); assert(d_un_old != NULL);
    checkCudaErrors(cudaMemcpy(d_J, J,      REAL*N*N, cudaMemcpyHostToDevice)); assert(d_J != NULL);
    checkCudaErrors(cudaMemcpy(d_A, A, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_A != NULL);

    dim3 dimGrid(NUM_BLOCKS); std::cout << "NUM_BLOCKS \t" << dimGrid.x << "\n";
    dim3 dimBlock(BLOCK_SIZE); std::cout << "BLOCK_SIZE \t" << dimBlock.x << "\n";
    size_t shm_size = N*REAL; //std::cout << shm_size << "\n";

    //HERE IS A RELEVANT CHANGE OF THE MAIN, SINCE I WAS CALLING 
    //THE KERNEL WITH A 2D GRID BUT WITHOUT USING THE .y AXIS,
    //WHILE NOW THE GRID IS 1D
    jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (d_A, d_J, t0, t0, h, d_u0, d_un, d_un_old);

    checkCudaErrors(cudaMemcpy(J, d_J, REAL*N*N, cudaMemcpyDeviceToHost)); //printf("4: %p\n", d_J);

    printmatrix( J, N, N);

    checkCudaErrors(cudaDeviceReset());
    free(u0);
    free(un);
    free(un_old);
    free(J);
    free(A);

}

template <typename T>
__global__ void jacobian_kernel (
                            const T * A,
                            T * J,
                            const T t0, 
                            const T tn,
                            const T h,
                            const T * u0, 
                            const T * un, 
                            const T * un_old)
{
    T cgamma = 2 - sqrtf(2);
    const unsigned int t = threadIdx.x;
    const unsigned int b = blockIdx.x;
    const unsigned int tid = t + b * blockDim.x;
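    // tid is the globally unique 1D thread index across the grid: one thread per entry of J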
    /*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
    /*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ T sm_temp_du;
    T* temp_du = &sm_temp_du;

    //HERE IS A RELEVANT CHANGE (*)
    if ( t < BLOCK_SIZE && b < NUM_BLOCKS )
    {
        temp_sx[b][t] = un[t]; //printf("temp_sx[%d] = %f\n", t,(temp_sx[b][t]));
        temp_dx[b][t] = un[t];
        //printf("t = %d, b = %d, t + b * blockDim.x = %d \n",t, b, tid);

        //HERE IS A NOTE (**)
        if ( t == b )
        {
            //printf("t = %d, b = %d \n",t, b);
            if ( tn == t0 )
            {   
                *temp_du = u0[t]*0.001;

                temp_sx[b][t] += *temp_du;
                temp_dx[b][t] -= *temp_du;

                temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
                temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );

                temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
                temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );

            }

            else
            {
                *temp_du = MAX( un[t] - un_old[t], 10e-6 );
                temp_sx[b][t] += *temp_du;
                temp_dx[b][t] -= *temp_du;
            }
        ;
        }
//printf("du[%d] %f\n", tid, (*temp_du));
        __syncthreads();
        //printf("temp_sx[%d][%d] = %f\n", b, t, temp_sx[b][t]);
        //printf("temp_dx[%d][%d] = %f\n", b, t, temp_dx[b][t]);

        //d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
        //d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
        matvec_dev( tn, A, (temp_sx[b]), (temp_sx[b]), N, N, 1.f );
        matvec_dev( tn, A, (temp_dx[b]), (temp_dx[b]), N, N, 1.f );
        __syncthreads();
        //printf("temp_sx_later[%d][%d] = %f\n", b, t, (temp_sx[b][t]));
        //printf("temp_sx_later[%d][%d] - temp_dx_later[%d][%d] = %f\n", b,t,b,t, (temp_sx[b][t] - temp_dx[b][t]) / 2 * *temp_du);
        //if (t == b ) printf( "2du[%d]^-1 = %f\n",t, powf((2 * *temp_du), -1));
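        // central difference quotient: (f(u + du) - f(u - du)) / (2 * du)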
        J[tid] = (temp_sx[b][t] - temp_dx[b][t]) / (2 * *temp_du);
    }
}                           

template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h )
{
    __shared__ float temp_u;
    temp_u = u[threadIdx.x];
    res[threadIdx.x] = h*powf( (temp_u), 2);
}

template <typename T>
inline void printmatrix( T mat, int rows, int cols)
{
    std::ofstream matrix_out;
    matrix_out.open( "heat_matrix.txt", std::ofstream::out);
    for( int i = 0; i < rows; i++)
    {
        for( int j = 0;  j <cols; j++)
        {
            double next = mat[i + N*j];
            matrix_out << ( (next >= 0) ? " " : "") << next << " ";
        }
        matrix_out << "\n";
    }   
}
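
As a side note, the example function makes the result easy to sanity-check: for the square function f(u)_i = u_i^2, the analytic Jacobian is diagonal with entries 2*u_i. Below is a minimal host-only sketch of such a check (check_square_jacobian is a hypothetical helper), assuming the Jacobian was computed for d_func (rather than matvec_dev) and using the same J[i + N*j] layout as printmatrix(); it relies on the <stdio.h> and <math.h> headers already included above.

// host-side sanity check: compare the finite-difference Jacobian against diag(2*u_i)
void check_square_jacobian(const float* J, const float* u, int n)
{
    for (int i = 0; i < n; ++i)
    {
        float expected = 2.0f * u[i];    // analytic diagonal entry of the square function's Jacobian
        float got = J[i + n * i];        // diagonal entry, same layout as printmatrix()
        if (fabsf(got - expected) > 1e-2f * fabsf(expected) + 1e-3f)
            printf("mismatch at %d: got %f, expected %f\n", i, got, expected);
    }
}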
From the comments on the question: your MCVE does not match the code you posted in the question. Regarding the MCVE code (which launches a 1D grid of 1D blocks), the reason you only get one du[0] printout is the if (tid < N) statement. tid is a globally unique thread index, so the first block gets 16 threads with tid values 0..15, the second block gets 16 threads with tid values 16..31, and so on for the third and fourth blocks. Now, N is 8. Which of these threads has tid less than 8? Only those in the first block. And within the first block, the only thread for which t == b holds is thread 0.