用CUDA求解二维扩散(热)方程
我正在学习CUDA,并尝试用它求解一些标准问题。作为示例,我用下面的代码求解二维扩散方程,但得到的结果与标准结果不一致,我不明白原因所在。
标签:cuda, nvidia, differential-equations
//kernel definition
//
// Performs one explicit finite-difference update of a 2D field:
//     A[c] += (old[W] + old[E] + old[S] + old[N] - 4*old[c]) / 40
// for every interior node c = (i, j); boundary nodes are left untouched.
//
// Parameters
//   A    : field being updated in place (read and written by this kernel).
//   old  : field values from the previous step (only read).
//   n_x  : number of nodes along x (index i, fastest-varying).
//   n_y  : number of nodes along y (index j).
//
// Both pointers are dereferenced on the device, so they must refer to
// device-accessible memory holding at least n_x * n_y doubles.
//
// Launch layout: a 2D grid of 2D blocks, one thread per node, where x maps to
// i and y maps to j. The grid may be larger than the domain; surplus threads
// exit without touching memory.
//
// Storage is row-major with row length n_x (index = i + n_x * j). This is
// identical to the previous `i + n_y * j` indexing whenever n_x == n_y, and is
// the only layout that stays inside an n_x * n_y buffer when they differ.
__global__ void diffusionSolver(double* A, double * old,int n_x,int n_y)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Explicit interior test. The former product test
    // `i*(n_x-i-1)*j*(n_y-j-1) != 0` was also true for threads with
    // i >= n_x or j >= n_y (and could overflow int), letting them write
    // outside the domain.
    if (i <= 0 || i >= n_x - 1 || j <= 0 || j >= n_y - 1)
        return;

    const int c = i + n_x * j;   // centre node

    // Five-point stencil, summed in the same order as before so the
    // floating-point result is unchanged.
    const double stencil = old[c - 1] + old[c + 1]
                         + old[c - n_x] + old[c + n_x]
                         - 4.0 * old[c];

    A[c] = A[c] + stencil / 40.0;
}
// Host driver: builds the initial field, runs `iterationMax` diffusion steps
// on the GPU and brings the final field back to the host.
//
// NOTE(review): phi, phi_old, dummy, n_x and n_y are not declared in this
// block; they are assumed to be file-scope symbols (double* buffers and
// integer grid sizes) - confirm against the rest of the file.
//
// Returns 0 on success and 1 if any allocation or CUDA call fails.
int main()
{
    if (n_x <= 0 || n_y <= 0)
        return 1;

    const size_t bytes = (size_t)n_x * (size_t)n_y * sizeof(double);
    const int iterationMax = 10;

    phi     = (double *) malloc(bytes);
    phi_old = (double *) malloc(bytes);
    dummy   = (double *) malloc(bytes);
    if (!phi || !phi_old || !dummy)
    {
        free(phi);     phi = 0;
        free(phi_old); phi_old = 0;
        free(dummy);   dummy = 0;
        return 1;
    }

    //phase initialization: -1 inside the band 0.4*n_x < i < 0.6*n_x, +1 elsewhere.
    // Row-major storage with row length n_x (same as before when n_x == n_y).
    for (int j = 0; j < n_y; j++)
    {
        for (int i = 0; i < n_x; i++)
        {
            const int idx = i + n_x * j;
            if ((.4*n_x-i)*(.6*n_x-i) < 0)
                phi[idx] = -1;
            else
                phi[idx] = 1;
            phi_old[idx] = phi[idx];
        }
    }

    // Both fields must live in device memory: the kernel dereferences them on
    // the GPU, so handing it the host pointer phi_old is invalid.
    double *dev_phi = 0;
    double *dev_phi_old = 0;
    cudaError_t err = cudaMalloc((void **) &dev_phi, bytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void **) &dev_phi_old, bytes);

    // Upload the initial state once, before iterating.
    if (err == cudaSuccess)
        err = cudaMemcpy(dev_phi, phi, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMemcpy(dev_phi_old, phi_old, bytes, cudaMemcpyHostToDevice);

    // One thread per node; ceil-division so the grid covers the whole domain
    // even when n_x / n_y are not multiples of the block size.
    const dim3 threadsPerBlock(16, 16);
    const dim3 numBlocks((n_x + threadsPerBlock.x - 1) / threadsPerBlock.x,
                         (n_y + threadsPerBlock.y - 1) / threadsPerBlock.y);

    //start iterating
    for (int z = 0; z < iterationMax && err == cudaSuccess; z++)
    {
        //call kernel
        diffusionSolver<<<numBlocks, threadsPerBlock>>>(dev_phi, dev_phi_old, n_x, n_y);
        err = cudaGetLastError();   // reports an invalid launch configuration

        //old values will be assigned new values - done on the device, so no
        // host round trip is needed between steps
        if (err == cudaSuccess)
            err = cudaMemcpy(dev_phi_old, dev_phi, bytes, cudaMemcpyDeviceToDevice);
    }

    //get updated arrays back on host (blocking copies, which also surface any
    // error raised while the kernels were running)
    if (err == cudaSuccess)
        err = cudaMemcpy(phi, dev_phi, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess)
        err = cudaMemcpy(phi_old, dev_phi_old, bytes, cudaMemcpyDeviceToHost);

    cudaFree(dev_phi);
    cudaFree(dev_phi_old);

    free(phi);     phi = 0;
    free(phi_old); phi_old = 0;
    free(dummy);   dummy = 0;

    return err == cudaSuccess ? 0 : 1;
}
//内核定义
__global__ void diffusionSolver(double* A, double * old,int n_x,int n_y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i*(n_x-i-1)*j*(n_y-j-1)!=0)
A[i+n_y*j] = A[i+n_y*j] + (old[i-1+n_y*j]+old[i+1+n_y*j]+
old[i+(j-1)*n_y]+old[i+(j+1)*n_y] -4*old[i+n_y*j])/40;
}
int main()
{
int i,j,M;
M=n_y;
phi = (double *) malloc( n_x*n_y* sizeof(double));
phi_old = (double *) malloc( n_x*n_y* sizeof(double));
dummy = (double *) malloc( n_x*n_y* sizeof(double));
int iterationMax=10;
//阶段初始化
for(j=0;j<n_y;j++) ……(代码在此处被截断)
此处:
您正在除以40(整数),这可能导致错误的扩散率,实际上甚至可能导致完全没有扩散。
但A是一个double数组。
请改为除以40.0,然后看看是否有效。
如果这是来自Jos Stam的求解器,除数应该是4.0而不是40。
还有一件事:
-4*old[i+n_y*j])/40;
这里是与4(整数)相乘。这同样会引起整数类型转换。
这:
减少了一些错误
祝您度过愉快的一天。
您犯的一个大错误是:phi_old被传递给内核并在内核中使用,但它是一个主机指针。
请使用cudaMalloc分配一个dev_phi_old,将其初始化为默认值,并在进入z循环之前先把它复制到GPU一次。
talonmies、brano和huseyin已经指出了代码中的一些错误。
扩散(热)方程是可以用CUDA求解的偏微分方程的经典例子之一。《CUDA by Example》一书的第7章中也有一个完整的例子。
作为给以后读者的参考,我在下面提供了一个完整的示例,包括CPU和GPU代码。我没有像talonmies建议的那样交换指针,而是把两次Jacobi迭代合并在同一个循环中。
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Utilities.cuh"
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
/***********************************/
/* JACOBI ITERATION FUNCTION - GPU */
/***********************************/
// One Jacobi sweep on the device: every interior node of T_new receives the
// average of its four neighbours in T_old. Boundary nodes of T_new are not
// written.
//
// Parameters
//   T_old : input field, NX * NY floats, row-major with row length NX (read only).
//   T_new : output field, same layout; must not alias T_old (__restrict__).
//   NX,NY : number of nodes along x (index i) and y (index j).
//
// Launch layout: 2D grid of 2D blocks, one thread per node, x -> i, y -> j.
// The grid may overshoot the domain; surplus threads exit immediately.
__global__ void Jacobi_Iterator_GPU(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x ;
    const int j = blockIdx.y * blockDim.y + threadIdx.y ;

    // --- Only update "interior" (not boundary) node points; bail out before
    //     any neighbour index is formed for threads outside the interior.
    if (i <= 0 || i >= NX-1 || j <= 0 || j >= NY-1) return;

    //                         N
    const int P = i + j*NX;    //  node (i,j)          |
    const int N = P + NX;      //  node (i,j+1)        |
    const int S = P - NX;      //  node (i,j-1)  W ---- P ---- E
    const int E = P + 1;       //  node (i+1,j)        |
    const int W = P - 1;       //  node (i-1,j)        |
    //                         S

    // 0.25f keeps the arithmetic in single precision. The former double
    // literal 0.25 promoted the product to double; scaling by a power of two
    // is exact, so the stored value is unchanged.
    T_new[P] = 0.25f * (T_old[E] + T_old[W] + T_old[N] + T_old[S]);
}
/***********************************/
/* JACOBI ITERATION FUNCTION - CPU */
/***********************************/
// Host reference for the Jacobi iteration.
//
// Each trip through the outer loop runs two sweeps over the interior nodes:
// first T -> T_new, then T_new -> T, so the latest field always ends up in T.
// The loop counter advances by 2, hence an odd MAX_ITER still runs a full
// pair of sweeps. Boundary nodes are never written.
//
//   T, T_new : NX * NY floats each, row-major with row length NX.
//   NX, NY   : number of nodes along x and y.
//   MAX_ITER : iteration budget, consumed two sweeps at a time.
void Jacobi_Iterator_CPU(float * __restrict T, float * __restrict T_new, const int NX, const int NY, const int MAX_ITER)
{
    for (int iter = 0; iter < MAX_ITER; iter += 2)
    {
        float *src = T;
        float *dst = T_new;

        // pass 0 reads T and writes T_new; pass 1 does the reverse.
        for (int pass = 0; pass < 2; ++pass)
        {
            // --- Only update "interior" (not boundary) node points
            for (int row = 1; row < NY - 1; ++row)
            {
                for (int col = 1; col < NX - 1; ++col)
                {
                    const int centre = col + NX * row;
                    const float east  = src[centre + 1];
                    const float west  = src[centre - 1];
                    const float north = src[centre + NX];
                    const float south = src[centre - NX];
                    dst[centre] = 0.25 * (east + west + north + south);
                }
            }

            // Exchange roles for the next pass.
            float *previous_src = src;
            src = dst;
            dst = previous_src;
        }
    }
}
/******************************/
/* TEMPERATURE INITIALIZATION */
/******************************/
// Writes 1 into the first element of each of the NY rows of h_T (row length
// NX), i.e. the i = 0 column. All other elements are left as they are.
void Initialize(float * __restrict h_T, const int NX, const int NY)
{
    // --- Set left wall to 1
    int row_start = 0;
    for (int row = 0; row < NY; ++row)
    {
        h_T[row_start] = 1.0;
        row_start += NX;
    }
}
/********/
/* MAIN */
/********/
// Runs the same Jacobi iteration on the host and on the device from identical
// initial data and prints the percentage RMS difference between the results.
// Returns 0 on success, 1 if a host allocation fails.
int main()
{
    const int NX = 256;         // --- Number of discretization points along the x axis
    const int NY = 256;         // --- Number of discretization points along the y axis
    const int MAX_ITER = 1;     // --- Iteration budget; both loops below advance it by 2
                                //     and run two sweeps per trip

    // --- CPU temperature distributions
    float *h_T            = (float *)calloc(NX * NY, sizeof(float));
    float *h_T_old        = (float *)calloc(NX * NY, sizeof(float));
    float *h_T_GPU_result = (float *)malloc(NX * NY * sizeof(float));
    if (h_T == NULL || h_T_old == NULL || h_T_GPU_result == NULL) {
        std::cerr << "Host memory allocation failed" << std::endl;
        free(h_T);
        free(h_T_old);
        free(h_T_GPU_result);
        return 1;
    }
    Initialize(h_T,     NX, NY);
    Initialize(h_T_old, NX, NY);

    // --- GPU temperature distribution
    float *d_T;     gpuErrchk(cudaMalloc((void**)&d_T,      NX * NY * sizeof(float)));
    float *d_T_old; gpuErrchk(cudaMalloc((void**)&d_T_old,  NX * NY * sizeof(float)));

    gpuErrchk(cudaMemcpy(d_T,     h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_T_old, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));

    // --- Grid size
    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 dimGrid (iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));

    // --- Jacobi iterations on the host
    Jacobi_Iterator_CPU(h_T, h_T_old, NX, NY, MAX_ITER);

    // --- Jacobi iterations on the device
    for (int k=0; k<MAX_ITER; k=k+2) {
        Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T,     d_T_old, NX, NY);   // --- Update d_T_old     starting from data stored in d_T
        gpuErrchk(cudaGetLastError());                                          // --- A launch returns no status; query it explicitly
        Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T_old, d_T    , NX, NY);   // --- Update d_T         starting from data stored in d_T_old
        gpuErrchk(cudaGetLastError());
    }

    // --- Copy result from device to host (blocking copy; also reports errors
    //     raised while the kernels were executing)
    gpuErrchk(cudaMemcpy(h_T_GPU_result, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));

    // --- Calculate percentage root mean square error between host and device results
    float sum = 0., sum_ref = 0.;
    for (int j=0; j<NY; j++)
        for (int i=0; i<NX; i++) {
            const float diff = h_T_GPU_result[j * NX + i] - h_T[j * NX + i];
            sum     = sum     + diff * diff;
            sum_ref = sum_ref + h_T[j * NX + i] * h_T[j * NX + i];
        }
    printf("Percentage root mean square error = %f\n", 100.*sqrt(sum / sum_ref));

    // --- Release host memory (h_T_old was previously leaked)
    free(h_T);
    free(h_T_old);
    free(h_T_GPU_result);

    // --- Release device memory
    gpuErrchk(cudaFree(d_T));
    gpuErrchk(cudaFree(d_T_old));

    return 0;
}
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Utilities.cuh"
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
/***********************************/
/*JACOBI迭代函数-GPU*/
/***********************************/
__global__ void Jacobi_Iterator_GPU(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i=blockIdx.x*blockDim.x+threadIdx.x;
const int j=blockIdx.y*blockDim.y+threadIdx.y;
//N
int P=i+j*NX;//节点(i,j)|
int N=i+(j+1)*NX;//节点(i,j+1)|
int S=i+(j-1)*NX;//节点(i,j-1)W----P----E
int E=(i+1)+j*NX;//节点(i+1,j)|
int W=(i-1)+j*NX;//节点(i-1,j)|
//
//---仅更新“内部”(非边界)节点点
if (i>0 && i<NX-1 && j>0 && j<NY-1) ……(代码在此处被截断)
评论:
- 谢谢tugrul的回复。我尝试使用40.0,但结果仍然不同(百分比误差约为40)。我使用40.0而不是4.0,是为了保证数值稳定。请看一下把数组复制到主机(以及反向复制)的过程是否正确,尤其是这一步需要执行多次。
- 正如你提到的,我的数组值根本没有改变!
- 我已经尝试了你的解决方案,但它也不起作用。你能建议一种方法,让我确认内核确实被调用了吗?
- double在GPU上的大小相同吗?也许在GPU上是128位,而在CPU上是64位。
- 你能告诉我你得到的结果和你期望的结果有什么不同吗?old[i-1+n_y*j]中的值的量级是多少?
- “旧值将被赋予新值”那一段是没有必要的。您可以执行设备到设备的memcpy,从而省去数据传输和主机端循环;或者更好的做法是只交换指针值。
- @talonmies,谢谢您的建议。我会交换指针值(看起来很简单)。
- 您在代码中的任何地方都没有说明n_x和n_y的值是什么,并且您的代码没有执行任何错误检查。每个CUDA API调用都会返回一个状态,您应该检查它们,以确保内核确实运行并且代码正确执行。
- 除了您指出的错误之外,我还必须在内核中添加额外的条件(if),原因是需要过滤掉会导致索引越界的线程。有时,对于给定的维度n_x和n_y,无法在X和Y方向上恰好启动所需数量的线程。
- 是否需要新、旧两个变量?
-4.0*old[i+n_y*j])/40.0;
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Utilities.cuh"
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
/***********************************/
/* JACOBI ITERATION FUNCTION - GPU */
/***********************************/
// One Jacobi sweep on the device: every interior node of T_new receives the
// average of its four neighbours in T_old. Boundary nodes of T_new are not
// written.
//
// Parameters
//   T_old : input field, NX * NY floats, row-major with row length NX (read only).
//   T_new : output field, same layout; must not alias T_old (__restrict__).
//   NX,NY : number of nodes along x (index i) and y (index j).
//
// Launch layout: 2D grid of 2D blocks, one thread per node, x -> i, y -> j.
// The grid may overshoot the domain; surplus threads exit immediately.
__global__ void Jacobi_Iterator_GPU(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x ;
    const int j = blockIdx.y * blockDim.y + threadIdx.y ;

    // --- Only update "interior" (not boundary) node points; bail out before
    //     any neighbour index is formed for threads outside the interior.
    if (i <= 0 || i >= NX-1 || j <= 0 || j >= NY-1) return;

    //                         N
    const int P = i + j*NX;    //  node (i,j)          |
    const int N = P + NX;      //  node (i,j+1)        |
    const int S = P - NX;      //  node (i,j-1)  W ---- P ---- E
    const int E = P + 1;       //  node (i+1,j)        |
    const int W = P - 1;       //  node (i-1,j)        |
    //                         S

    // 0.25f keeps the arithmetic in single precision. The former double
    // literal 0.25 promoted the product to double; scaling by a power of two
    // is exact, so the stored value is unchanged.
    T_new[P] = 0.25f * (T_old[E] + T_old[W] + T_old[N] + T_old[S]);
}
/***********************************/
/* JACOBI ITERATION FUNCTION - CPU */
/***********************************/
// Host reference for the Jacobi iteration.
//
// Each trip through the outer loop runs two sweeps over the interior nodes:
// first T -> T_new, then T_new -> T, so the latest field always ends up in T.
// The loop counter advances by 2, hence an odd MAX_ITER still runs a full
// pair of sweeps. Boundary nodes are never written.
//
//   T, T_new : NX * NY floats each, row-major with row length NX.
//   NX, NY   : number of nodes along x and y.
//   MAX_ITER : iteration budget, consumed two sweeps at a time.
void Jacobi_Iterator_CPU(float * __restrict T, float * __restrict T_new, const int NX, const int NY, const int MAX_ITER)
{
    for (int iter = 0; iter < MAX_ITER; iter += 2)
    {
        float *src = T;
        float *dst = T_new;

        // pass 0 reads T and writes T_new; pass 1 does the reverse.
        for (int pass = 0; pass < 2; ++pass)
        {
            // --- Only update "interior" (not boundary) node points
            for (int row = 1; row < NY - 1; ++row)
            {
                for (int col = 1; col < NX - 1; ++col)
                {
                    const int centre = col + NX * row;
                    const float east  = src[centre + 1];
                    const float west  = src[centre - 1];
                    const float north = src[centre + NX];
                    const float south = src[centre - NX];
                    dst[centre] = 0.25 * (east + west + north + south);
                }
            }

            // Exchange roles for the next pass.
            float *previous_src = src;
            src = dst;
            dst = previous_src;
        }
    }
}
/******************************/
/* TEMPERATURE INITIALIZATION */
/******************************/
// Writes 1 into the first element of each of the NY rows of h_T (row length
// NX), i.e. the i = 0 column. All other elements are left as they are.
void Initialize(float * __restrict h_T, const int NX, const int NY)
{
    // --- Set left wall to 1
    int row_start = 0;
    for (int row = 0; row < NY; ++row)
    {
        h_T[row_start] = 1.0;
        row_start += NX;
    }
}
/********/
/* MAIN */
/********/
// Runs the same Jacobi iteration on the host and on the device from identical
// initial data and prints the percentage RMS difference between the results.
// Returns 0 on success, 1 if a host allocation fails.
int main()
{
    const int NX = 256;         // --- Number of discretization points along the x axis
    const int NY = 256;         // --- Number of discretization points along the y axis
    const int MAX_ITER = 1;     // --- Iteration budget; both loops below advance it by 2
                                //     and run two sweeps per trip

    // --- CPU temperature distributions
    float *h_T            = (float *)calloc(NX * NY, sizeof(float));
    float *h_T_old        = (float *)calloc(NX * NY, sizeof(float));
    float *h_T_GPU_result = (float *)malloc(NX * NY * sizeof(float));
    if (h_T == NULL || h_T_old == NULL || h_T_GPU_result == NULL) {
        std::cerr << "Host memory allocation failed" << std::endl;
        free(h_T);
        free(h_T_old);
        free(h_T_GPU_result);
        return 1;
    }
    Initialize(h_T,     NX, NY);
    Initialize(h_T_old, NX, NY);

    // --- GPU temperature distribution
    float *d_T;     gpuErrchk(cudaMalloc((void**)&d_T,      NX * NY * sizeof(float)));
    float *d_T_old; gpuErrchk(cudaMalloc((void**)&d_T_old,  NX * NY * sizeof(float)));

    gpuErrchk(cudaMemcpy(d_T,     h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_T_old, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));

    // --- Grid size
    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 dimGrid (iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));

    // --- Jacobi iterations on the host
    Jacobi_Iterator_CPU(h_T, h_T_old, NX, NY, MAX_ITER);

    // --- Jacobi iterations on the device
    for (int k=0; k<MAX_ITER; k=k+2) {
        Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T,     d_T_old, NX, NY);   // --- Update d_T_old     starting from data stored in d_T
        gpuErrchk(cudaGetLastError());                                          // --- A launch returns no status; query it explicitly
        Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T_old, d_T    , NX, NY);   // --- Update d_T         starting from data stored in d_T_old
        gpuErrchk(cudaGetLastError());
    }

    // --- Copy result from device to host (blocking copy; also reports errors
    //     raised while the kernels were executing)
    gpuErrchk(cudaMemcpy(h_T_GPU_result, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));

    // --- Calculate percentage root mean square error between host and device results
    float sum = 0., sum_ref = 0.;
    for (int j=0; j<NY; j++)
        for (int i=0; i<NX; i++) {
            const float diff = h_T_GPU_result[j * NX + i] - h_T[j * NX + i];
            sum     = sum     + diff * diff;
            sum_ref = sum_ref + h_T[j * NX + i] * h_T[j * NX + i];
        }
    printf("Percentage root mean square error = %f\n", 100.*sqrt(sum / sum_ref));

    // --- Release host memory (h_T_old was previously leaked)
    free(h_T);
    free(h_T_old);
    free(h_T_GPU_result);

    // --- Release device memory
    gpuErrchk(cudaFree(d_T));
    gpuErrchk(cudaFree(d_T_old));

    return 0;
}