Handling matrices in CUDA: understanding the basic concepts


I am building a CUDA kernel to compute the numerical Jacobian of a function via finite differences; in the example I provide, it is the square function (each entry of the vector is squared). The host code allocates in linear memory, while I use 2D indexing inside the kernel.

My problem is that I have not found a way to sum over the diagonal of the matrices I cudaMalloc'ed. My attempt was to use the condition threadIdx.x == blockIdx.x for the diagonal, but it only evaluates to true when both are 0.

Here is the kernel. EDIT: following the suggestions in the comments, I posted the whole code as an answer (the main() is basically the same, while the kernel is different).

So I am trying to handle splitting the function call across blocks. Inside the kernel I used

if (threadIdx.x == blockIdx.x) { ... }

Why is this not correct? I ask because, when debugging and letting the code print the statement, it only evaluates to true when both are 0. As a result du[0] is the only numerical value and the matrix becomes nan. Note that this approach worked for the first code I built, where I launched the kernel with

jacobian_kernel <<< N, N >>> (...)

so that, whenever threadIdx.x == blockIdx.x, the element is on the diagonal. This approach no longer fits, however, because now I need to handle a larger N (possibly larger than 1024, the maximum number of threads per block).

What statement should I put there so that it works even when the matrix is split across blocks and threads?
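
A minimal sketch of one possible global diagonal test, assuming a 1D launch with one thread per entry of J and row-major storage (diagonal_sketch, row and col are illustrative names, not part of my code):

__global__ void diagonal_sketch(float* J, int n)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n * n)
    {
        const int row = tid / n;    // row index, assuming row-major storage of J
        const int col = tid % n;    // column index
        if (row == col)
        {
            // this thread owns the diagonal entry J[tid] == J[row * n + row]
        }
    }
}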


Let me know if I should share any other information.

Here is how I solved the problem, following the suggestions in the comments to the answer. The example is compilable if you put helper_cuda.h and helper_string.h in the same directory, or if you add a -I directive pointing to the CUDA samples' include path. The relevant changes are only in the kernel; there is a minor change in main() too, since I was requesting double the resources to execute the kernel, but the .y axis of the grid of thread blocks was not used at all, so it did not generate any error.
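
For reference, a compile line along these lines illustrates the -I directive (the samples path and the file name jacobian.cu are assumptions that depend on your installation):

nvcc -I/path/to/cuda-samples/common/inc -o jacobian jacobian.cu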

#include <stdio.h> 
#include <stdlib.h> 
#include <iostream>
#include <math.h>
#include <assert.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include "helper_string.h"
#include <fstream>

#ifndef MAX
    #define MAX(a,b) ((a > b) ? a : b)
#endif
#define REAL sizeof(float)
#define N       128
#define BLOCK_SIZE  128
#define NUM_BLOCKS  ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)

template <typename T>
inline void printmatrix( T mat, int rows, int cols);
template <typename T>
__global__ void jacobian_kernel ( const T * A, T * J, const T t0, const T tn, const T h, const T * u0, const T * un, const T * un_old);
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h = 1);

int main ()
{   
    float t0    = 0.; //float tn = 0.;
    float h     = 0.1;

    float* u0 = (float*)malloc(REAL*N); for(int i = 0; i < N; ++i){u0[i] = i+1;}
    float* un = (float*)malloc(REAL*N); memcpy(un, u0, REAL*N);
    float* un_old = (float*)malloc(REAL*N); memcpy(un_old, u0, REAL*N);
    float* J = (float*)malloc(REAL*N*N);
    float* A = (float*)malloc(REAL*N*N); host_heat_matrix(A);
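    // NOTE: host_heat_matrix() (used above) and matvec_dev() (called in the kernel below)
    // are assumed to be defined elsewhere in the full project; they are not shown in this listing.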

    float *d_u0;
    float *d_un;
    float *d_un_old;
    float *d_J;
    float *d_A;

    checkCudaErrors(cudaMalloc((void**)&d_u0,   REAL*N)); //printf("1: %p\n", d_u0);
    checkCudaErrors(cudaMalloc((void**)&d_un,   REAL*N)); //printf("2: %p\n", d_un);
    checkCudaErrors(cudaMalloc((void**)&d_un_old,   REAL*N)); //printf("3: %p\n", d_un_old);
    checkCudaErrors(cudaMalloc((void**)&d_J,    REAL*N*N)); //printf("4: %p\n", d_J);
    checkCudaErrors(cudaMalloc((void**)&d_A,    REAL*N*N)); //printf("4: %p\n", d_J);
    checkCudaErrors(cudaMemcpy(d_u0, u0,        REAL*N, cudaMemcpyHostToDevice)); assert(d_u0 != NULL);
    checkCudaErrors(cudaMemcpy(d_un, un,        REAL*N, cudaMemcpyHostToDevice)); assert(d_un != NULL);
    checkCudaErrors(cudaMemcpy(d_un_old, un_old,    REAL*N, cudaMemcpyHostToDevice)); assert(d_un_old != NULL);
    checkCudaErrors(cudaMemcpy(d_J, J,      REAL*N*N, cudaMemcpyHostToDevice)); assert(d_J != NULL);
    checkCudaErrors(cudaMemcpy(d_A, A, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_A != NULL);

    dim3 dimGrid(NUM_BLOCKS); std::cout << "NUM_BLOCKS \t" << dimGrid.x << "\n";
    dim3 dimBlock(BLOCK_SIZE); std::cout << "BLOCK_SIZE \t" << dimBlock.x << "\n";
    size_t shm_size = N*REAL; //std::cout << shm_size << "\n";

    //HERE IS A RELEVANT CHANGE OF THE MAIN, SINCE I WAS CALLING 
    //THE KERNEL WITH A 2D GRID BUT WITHOUT USING THE .y AXIS,
    //WHILE NOW THE GRID IS 1D
    jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (d_A, d_J, t0, t0, h, d_u0, d_un, d_un_old);

    checkCudaErrors(cudaMemcpy(J, d_J, REAL*N*N, cudaMemcpyDeviceToHost)); //printf("4: %p\n", d_J);

    printmatrix( J, N, N);

    checkCudaErrors(cudaDeviceReset());
    free(u0);
    free(un);
    free(un_old);
    free(J);
    free(A);

}

template <typename T>
__global__ void jacobian_kernel (
                            const T * A,
                            T * J,
                            const T t0, 
                            const T tn,
                            const T h,
                            const T * u0, 
                            const T * un, 
                            const T * un_old)
{
    T cgamma = 2 - sqrtf(2);
    const unsigned int t = threadIdx.x;
    const unsigned int b = blockIdx.x;
    const unsigned int tid = t + b * blockDim.x;
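    // tid is the globally unique 1D thread index across the grid: one thread per entry of J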
    /*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
    /*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ T sm_temp_du;
    T* temp_du = &sm_temp_du;

    //HERE IS A RELEVANT CHANGE (*)
    if ( t < BLOCK_SIZE && b < NUM_BLOCKS )
    {
        temp_sx[b][t] = un[t]; //printf("temp_sx[%d] = %f\n", t,(temp_sx[b][t]));
        temp_dx[b][t] = un[t];
        //printf("t = %d, b = %d, t + b * blockDim.x = %d \n",t, b, tid);

        //HERE IS A NOTE (**)
        if ( t == b )
        {
            //printf("t = %d, b = %d \n",t, b);
            if ( tn == t0 )
            {   
                *temp_du = u0[t]*0.001;

                temp_sx[b][t] += *temp_du;
                temp_dx[b][t] -= *temp_du;

                temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
                temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );

                temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
                temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );

            }

            else
            {
                *temp_du = MAX( un[t] - un_old[t], 10e-6 );
                temp_sx[b][t] += *temp_du;
                temp_dx[b][t] -= *temp_du;
            }
        ;
        }
//printf("du[%d] %f\n", tid, (*temp_du));
        __syncthreads();
        //printf("temp_sx[%d][%d] = %f\n", b, t, temp_sx[b][t]);
        //printf("temp_dx[%d][%d] = %f\n", b, t, temp_dx[b][t]);

        //d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
        //d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
        matvec_dev( tn, A, (temp_sx[b]), (temp_sx[b]), N, N, 1.f );
        matvec_dev( tn, A, (temp_dx[b]), (temp_dx[b]), N, N, 1.f );
        __syncthreads();
        //printf("temp_sx_later[%d][%d] = %f\n", b, t, (temp_sx[b][t]));
        //printf("temp_sx_later[%d][%d] - temp_dx_later[%d][%d] = %f\n", b,t,b,t, (temp_sx[b][t] - temp_dx[b][t]) / 2 * *temp_du);
        //if (t == b ) printf( "2du[%d]^-1 = %f\n",t, powf((2 * *temp_du), -1));
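        // central difference quotient: (f(u + du) - f(u - du)) / (2 * du)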
        J[tid] = (temp_sx[b][t] - temp_dx[b][t]) / (2 * *temp_du);
    }
}                           

template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h )
{
    __shared__ float temp_u;
    temp_u = u[threadIdx.x];
    res[threadIdx.x] = h*powf( (temp_u), 2);
}

template <typename T>
inline void printmatrix( T mat, int rows, int cols)
{
    std::ofstream matrix_out;
    matrix_out.open( "heat_matrix.txt", std::ofstream::out);
    for( int i = 0; i < rows; i++)
    {
        for( int j = 0;  j <cols; j++)
        {
            double next = mat[i + N*j];
            matrix_out << ( (next >= 0) ? " " : "") << next << " ";
        }
        matrix_out << "\n";
    }   
}
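
As a side note, the example function makes the result easy to sanity-check: for the square function f(u)_i = u_i^2, the analytic Jacobian is diagonal with entries 2*u_i. Below is a minimal host-only sketch of such a check (check_square_jacobian is a hypothetical helper), assuming the Jacobian was computed for d_func (rather than matvec_dev) and using the same J[i + N*j] layout as printmatrix(); it relies on the <stdio.h> and <math.h> headers already included above.

// host-side sanity check: compare the finite-difference Jacobian against diag(2*u_i)
void check_square_jacobian(const float* J, const float* u, int n)
{
    for (int i = 0; i < n; ++i)
    {
        float expected = 2.0f * u[i];    // analytic diagonal entry of the square function's Jacobian
        float got = J[i + n * i];        // diagonal entry, same layout as printmatrix()
        if (fabsf(got - expected) > 1e-2f * fabsf(expected) + 1e-3f)
            printf("mismatch at %d: got %f, expected %f\n", i, got, expected);
    }
}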
From the comments on the question: your MCVE does not match the code you posted in the question. Regarding the MCVE code (which launches a 1D grid of 1D blocks), the reason you only get one du[0] printout is the if (tid < N) statement. tid is a globally unique thread index, so the first block gets 16 threads with tid values 0..15, the second block gets 16 threads with tid values 16..31, and so on for the third and fourth blocks. Now, N is 8. Which of these threads has tid less than 8? Only those in the first block. And within the first block, the only thread for which t == b holds is thread 0.