CUDA中的二分法
我试图在CUDA中实现二分法。此方法能够从应用程序()中近似特征值。我有一些关于如何做的问题。 这是我的密码:CUDA中的二分法,cuda,parallel-processing,kernel,gpu,bisection,Cuda,Parallel Processing,Kernel,Gpu,Bisection,我试图在CUDA中实现二分法。此方法能够从应用程序()中近似特征值。我有一些关于如何做的问题。 这是我的密码: #include <stdio.h> #include <stdlib.h> #include <math.h> #include <time.h> double f(double x) { //return ((5*sin(2*x))-(52*cos(2*x)))+50; return cos(x); } doubl
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
double f(double x)
{
//return ((5*sin(2*x))-(52*cos(2*x)))+50;
return cos(x);
}
double absoluto(double n){
if(n<0) n=n*-1;
return(n);
}
// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n)
{
int id = blockIdx.x*blockDim.x+threadIdx.x;
if (id < n)
c[id] = (a[id] + b[id])/2;
}
int main( int argc, char* argv[] )
{
int i=0;
double malla = 1.0;
double x1=0.0 , x2=10.0 , j=0.0;
int n = (int)x2/(int)malla;
double *host_a;
double *host_b;
double *host_c;
double *dev_a;
double *dev_b;
double *dev_c;
size_t bytes = n*sizeof(double);
host_a = (double*)malloc(bytes);
host_b = (double*)malloc(bytes);
host_c = (double*)malloc(bytes);
cudaMalloc(&dev_a, bytes);
cudaMalloc(&dev_b, bytes);
cudaMalloc(&dev_c, bytes);
// Initialize vectors on host
for( j = 0.0; j < n; j=j+1.0 ) {
if((f(x1)*f(x1+malla))>0){
x1 = x1 + malla;
i++;
}
else{
host_a[i] = x1;
host_b[i] = x1+malla;
x1 = x1 + malla;
i++;
}
}
int blockSize, gridSize;
blockSize = 1024;
gridSize = (int)ceil((float)n/blockSize);
i=0;
// Copy host vectors to device
cudaMemcpy( dev_a, host_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, host_b, bytes, cudaMemcpyHostToDevice);
// Execute the kernel
biseccion<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, n);
// Copy array back to host
cudaMemcpy( host_c, dev_c, bytes, cudaMemcpyDeviceToHost );
i=0;
for(j=0.0;j<n;j++){
printf("%f\n",host_c[i])
i++;
}
// Release device memory
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
// Release host memory
free(host_a);
free(host_b);
free(host_c);
return 0;
}
#包括
#包括
#包括
#包括
双f(双x)
{
//返回((5*sin(2*x))-(52*cos(2*x)))+50;
返回cos(x);
}
双绝对(双n){
如果(n0){
x1=x1+malla;
i++;
}
否则{
主机a[i]=x1;
主机_b[i]=x1+malla;
x1=x1+malla;
i++;
}
}
int blockSize,gridSize;
块大小=1024;
gridSize=(int)ceil((float)n/块大小);
i=0;
//将主机向量复制到设备
cudaMemcpy(dev_a、host_a、bytes、cudaMemcpyHostToDevice);
cudaMemcpy(dev_b、host_b、bytes、cudaMemcpyHostToDevice);
//执行内核
biseccion(dev_a、dev_b、dev_c、n);
//将阵列复制回主机
cudaMemcpy(主机c、开发c、字节、cudaMemcpyDeviceToHost);
i=0;
对于(j=0.0;j,您需要定义停止标准或收敛标准——何时停止近似?假设您的停止标准只是对分循环的多次迭代。我们可以将其作为参数传递给内核
然后我们可以像这样重新编写内核:
// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n, int loopcnt)
{
int id = blockIdx.x*blockDim.x+threadIdx.x;
int loops = 0;
if (id < n)
while (loops < loopcnt){
c[id] = (a[id] + b[id])/2;
if ((f(c[id]) * f(a[id])) < 0) b[id] = c[id];
else a[id] = c[id];
loops++;
}
}
1.570796
4.712389
7.853982
10.995574
14.137167
17.278760
20.420352
23.561945
26.703538
29.845130
请注意,上面的更改并没有特别优化。例如,我们存储在全局内存中的变量有大量的重复使用,如a[id]
、b[id]
、和c[id]
。我们可能会使用共享内存(甚至只是本地线程变量——数量不多),并且只在完成循环后将结果写回全局内存
我必须对您的代码进行一些其他更改,以使其以对我有意义的方式工作。以下是您代码的完整修改版本:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
__host__ __device__ double f(double x)
{
//return ((5*sin(2*x))-(52*cos(2*x)))+50;
return cos(x);
}
double absoluto(double n){
if(n<0) n=n*-1;
return(n);
}
// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n, int loopcnt)
{
int id = blockIdx.x*blockDim.x+threadIdx.x;
int loops = 0;
if (id < n)
while (loops < loopcnt){
c[id] = (a[id] + b[id])/2;
if ((f(c[id]) * f(a[id])) < 0) b[id] = c[id];
else a[id] = c[id];
loops++;
}
}
int main( int argc, char* argv[] )
{
int i=0;
int loops=1000; // this is the number of bisection iterations to run
double malla = 1.0;
double x1=0.0 , x2=10.0 , j=0.0;
int n = (int)x2/(int)malla;
double *host_a;
double *host_b;
double *host_c;
double *dev_a;
double *dev_b;
double *dev_c;
size_t bytes = n*sizeof(double);
host_a = (double*)malloc(bytes);
host_b = (double*)malloc(bytes);
host_c = (double*)malloc(bytes);
cudaMalloc(&dev_a, bytes);
cudaMalloc(&dev_b, bytes);
cudaMalloc(&dev_c, bytes);
// Initialize vectors on host
while( i < n) {
if((f(x1)*f(x1+malla))>0){
x1 = x1 + malla;
}
else{
host_a[i] = x1;
host_b[i] = x1+malla;
x1 = x1 + malla;
i++;
}
}
int blockSize, gridSize;
blockSize = 256;
gridSize = (int)ceil((float)n/blockSize);
i=0;
// Copy host vectors to device
cudaMemcpy( dev_a, host_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, host_b, bytes, cudaMemcpyHostToDevice);
// Execute the kernel
biseccion<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, n, loops);
// Copy array back to host
cudaMemcpy( host_c, dev_c, bytes, cudaMemcpyDeviceToHost );
i=0;
for(j=0.0;j<n;j++){
printf("%f\n",host_c[i]);
i++;
}
// Release device memory
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
// Release host memory
free(host_a);
free(host_b);
free(host_c);
return 0;
}
您会注意到,第一个结果是pi/2,每个后续结果都添加了pi,因此我认为这是cos(x)的前10个根的正确结果.您需要定义停止标准或收敛标准——何时停止近似?假设您的停止标准只是对分循环的多次迭代。我们可以将其作为参数传递给内核
然后我们可以像这样重新编写内核:
// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n, int loopcnt)
{
int id = blockIdx.x*blockDim.x+threadIdx.x;
int loops = 0;
if (id < n)
while (loops < loopcnt){
c[id] = (a[id] + b[id])/2;
if ((f(c[id]) * f(a[id])) < 0) b[id] = c[id];
else a[id] = c[id];
loops++;
}
}
1.570796
4.712389
7.853982
10.995574
14.137167
17.278760
20.420352
23.561945
26.703538
29.845130
请注意,上面的更改并没有特别优化。例如,我们存储在全局内存中的变量有大量的重复使用,如a[id]
、b[id]
、和c[id]
。我们可能会使用共享内存(甚至只是本地线程变量——数量不多),并且只在完成循环后将结果写回全局内存
我必须对您的代码进行一些其他更改,以使其以对我有意义的方式工作。以下是您代码的完整修改版本:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
__host__ __device__ double f(double x)
{
//return ((5*sin(2*x))-(52*cos(2*x)))+50;
return cos(x);
}
double absoluto(double n){
if(n<0) n=n*-1;
return(n);
}
// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n, int loopcnt)
{
int id = blockIdx.x*blockDim.x+threadIdx.x;
int loops = 0;
if (id < n)
while (loops < loopcnt){
c[id] = (a[id] + b[id])/2;
if ((f(c[id]) * f(a[id])) < 0) b[id] = c[id];
else a[id] = c[id];
loops++;
}
}
int main( int argc, char* argv[] )
{
int i=0;
int loops=1000; // this is the number of bisection iterations to run
double malla = 1.0;
double x1=0.0 , x2=10.0 , j=0.0;
int n = (int)x2/(int)malla;
double *host_a;
double *host_b;
double *host_c;
double *dev_a;
double *dev_b;
double *dev_c;
size_t bytes = n*sizeof(double);
host_a = (double*)malloc(bytes);
host_b = (double*)malloc(bytes);
host_c = (double*)malloc(bytes);
cudaMalloc(&dev_a, bytes);
cudaMalloc(&dev_b, bytes);
cudaMalloc(&dev_c, bytes);
// Initialize vectors on host
while( i < n) {
if((f(x1)*f(x1+malla))>0){
x1 = x1 + malla;
}
else{
host_a[i] = x1;
host_b[i] = x1+malla;
x1 = x1 + malla;
i++;
}
}
int blockSize, gridSize;
blockSize = 256;
gridSize = (int)ceil((float)n/blockSize);
i=0;
// Copy host vectors to device
cudaMemcpy( dev_a, host_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, host_b, bytes, cudaMemcpyHostToDevice);
// Execute the kernel
biseccion<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, n, loops);
// Copy array back to host
cudaMemcpy( host_c, dev_c, bytes, cudaMemcpyDeviceToHost );
i=0;
for(j=0.0;j<n;j++){
printf("%f\n",host_c[i]);
i++;
}
// Release device memory
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
// Release host memory
free(host_a);
free(host_b);
free(host_c);
return 0;
}
您会注意到,第一个结果是pi/2,每个后续结果都添加了pi,因此我认为这是cos(x)的前10个根的正确结果。Robert Crovella已经指出,您的问题是停止规则,它是根据迭代次数给出的
对于一个最小的更复杂的二等分,停止规则也可以与目标精度相关。下面我提供CUDA中的二分方法的一个版本,它可以从C++书中的数值配方中找到,也可以设置目标精度。
也许,通过调整
特征值CUDA SDK样本中利用的对分核,可以获得计算上更复杂的对分
新版本的方法似乎更准确。以下是一些结果:
No target accuracy
1.571289062500
4.453613281250
6.504882812500
10.546875000000
13.171386718750
Target accuracy
1.570796326795
4.712388980385
7.853981633975
10.995574287564
14.137166941154
Actual roots
1.570796326794897
4.712388980384690
7.853981633974483
10.995574287564276
14.137166941154069
通过上述书中提供的方法,可以再次实现更好的初始括号
这是密码
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <math_constants.h>
#define BLOCKSIZE 512
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/************************************/
/* FUNCTION TO SEARCH THE ROOTS FOR */
/************************************/
__host__ __device__ double f(double x)
{
//return ((5*sin(2*x))-(52*cos(2*x)))+50;
return cos(x);
}
/***************************************/
/* BISECTION KERNEL - ORIGINAL VERSION */
/***************************************/
__global__ void bisection(double *a, double *b, double *c, int N, int loopcnt)
{
int tid = blockIdx.x*blockDim.x+threadIdx.x;
int loops = 0;
if (tid < N)
while (loops < loopcnt){
c[tid] = (a[tid] + b[tid])/2;
if ((f(c[tid]) * f(a[tid])) < 0) b[tid] = c[tid];
else a[tid] = c[tid];
loops++;
}
}
/************************************************/
/* BISECTION KERNEL - NUMERICAL RECIPES VERSION */
/************************************************/
// --- Using bisection, return the root of a function func known to lie between x1 and x2.
// The root will be refined until its accuracy is xacc.
__global__ void bisection_NR(const double *d_x1, const double *d_x2, double *d_roots, const double xacc, const int loopcnt, const int N) {
// --- loopcnt is the maximum allowed number of bisections.
int tid = blockIdx.x*blockDim.x+threadIdx.x;
if (tid < N) {
double dx,xmid,rtb;
double f1=f(d_x1[tid]);
double fmid=f(d_x2[tid]);
if (f1*fmid >= 0.0) d_roots[tid] = CUDART_NAN;
rtb = f1 < 0.0 ? (dx=d_x2[tid]-d_x1[tid],d_x1[tid]) : (dx=d_x1[tid]-d_x2[tid],d_x2[tid]); // --- Orient the search so that f>0
for (int j=0;j<loopcnt;j++) { // --- lies at x+dx.
fmid=f(xmid=rtb+(dx *= 0.5)); // --- Bisection loop.
if (fmid <= 0.0) rtb=xmid;
if (abs(dx) < xacc || fmid == 0.0) { d_roots[tid]=rtb; return; }
}
d_roots[tid] = CUDART_NAN;
}
}
/*******/
/* INT */
/*******/
int main()
{
int loops=100000; // --- Number of bisection iterations to run
double x1=0.0, x2=10.0; // --- Minimum and maximum values of the search interval
double Deltax = 1.0; // --- Sampling step of the search interval
int N = (int)x2/(int)Deltax; // --- Number of search intervales
// --- Host-side memory allocations
double *host_a = (double*)malloc(N*sizeof(double));
double *host_b = (double*)malloc(N*sizeof(double));
double *host_c = (double*)malloc(N*sizeof(double));
// --- Device-side memory allocations
double *dev_a; gpuErrchk(cudaMalloc(&dev_a, N*sizeof(double)));
double *dev_b; gpuErrchk(cudaMalloc(&dev_b, N*sizeof(double)));
double *dev_c; gpuErrchk(cudaMalloc(&dev_c, N*sizeof(double)));
// --- Initialize vectors on host
int i=0;
while(i < N) {
if((f(x1)*f(x1+Deltax))>0) x1 = x1 + Deltax;
else {
host_a[i] = x1;
host_b[i] = x1+Deltax;
x1 = x1 + Deltax;
i++;
}
}
// --- Copy host vectors to device
gpuErrchk(cudaMemcpy(dev_a, host_a, N*sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(dev_b, host_b, N*sizeof(double), cudaMemcpyHostToDevice));
bisection<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(dev_a, dev_b, dev_c, loops, N);
gpuErrchk(cudaMemcpy(host_c, dev_c, N*sizeof(double), cudaMemcpyDeviceToHost));
for(i=0; i<N; i++) printf("%3.12f\n",host_c[i]);
printf("\n");
bisection_NR<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(dev_a, dev_b, dev_c, 2.5e-13, loops, N);
gpuErrchk(cudaMemcpy(host_c, dev_c, N*sizeof(double), cudaMemcpyDeviceToHost));
for(i=0; i<N; i++) printf("%3.12f\n",host_c[i]);
// --- Release device memory
gpuErrchk(cudaFree(dev_a));
gpuErrchk(cudaFree(dev_b));
gpuErrchk(cudaFree(dev_c));
// --- Release host memory
free(host_a);
free(host_b);
free(host_c);
return 0;
}
#包括
#包括
#包括
#包括
#定义块大小512
/*******************/
/*iDivUp函数*/
/*******************/
intidivup(inta,intb){返回((a%b)!=0)?(a/b+1):(a/b);}
/********************/
/*CUDA错误检查*/
/********************/
#定义gpuerchk(ans){gpuAssert((ans),_文件_,_行__)}
内联void gpuAssert(cudaError\u t代码,char*文件,int行,bool abort=true)
{
如果(代码!=cudaSuccess)
{
fprintf(标准,“GPUassert:%s%s%d\n”,cudaGetErrorString(代码)、文件、行);
如果(中止)退出(代码);
}
}
/************************************/
/*函数来搜索根*/
/************************************/
__主机设备双f(双x)
{
//返回((5*sin(2*x))-(52*cos(2*x)))+50;
返回cos(x);
}
/***************************************/
/*二分法内核-原始版本*/
/***************************************/
__全局无效二等分(双*a、双*b、双*c、整数N、整数环Cnt)
{
int tid=blockIdx.x*blockDim.x+threadIdx.x;
int循环=0;
如果(tid=0.0)d_根[tid]=CUDART_NAN;
rtb=f1<0.0?(dx=d_x2[tid]-d_x1[tid],d_x1[tid]):(dx=d_x1[tid]-d_x2[tid],d_x2[