Visual c++ 在.cpp文件中分离cuda主机代码
main.cppVisual c++ 在.cpp文件中分离cuda主机代码,visual-c++,cuda,Visual C++,Cuda,main.cpp #include<iostream> #include "cuda.h" using namespace std; void cuda_calculation(); int main() { cuda_calculation(); return 0; } #include <stdio.h> #include <cuda.h> #include "cu.h" void cuda_calculation() {
#include<iostream>
#include "cuda.h"
using namespace std;
void cuda_calculation();
// Host program entry point.
// Takes no command-line arguments and always exits with status 0.
int main()
{
    // All of the work is delegated to cuda_calculation(), which is only
    // declared (not defined) in this translation unit.
    cuda_calculation();

    // Report success to the operating system.
    return 0;
}
#include <stdio.h>
#include <cuda.h>
#include "cu.h"
// Squares the numbers 0..N-1 on the GPU and prints "index value" pairs.
//
// Steps: allocate a host and a device array of N floats, fill the host array
// with 0..N-1, copy it to the device, ask call() to launch the squaring
// kernel, copy the result back and print it to stdout.
//
// Takes no parameters and returns nothing. Any allocation or CUDA failure is
// reported on stderr; in that case no results are printed. Both buffers are
// released on every path.
void cuda_calculation()
{
    const int N = 10;                       // Number of elements in arrays
    const size_t size = N * sizeof(float);  // Size of each array in bytes
    float *a_h = NULL;                      // Host array
    float *a_d = NULL;                      // Device array
    cudaError_t err = cudaSuccess;

    // Allocate array on host
    a_h = (float *)malloc(size);
    if (a_h == NULL)
    {
        fprintf(stderr, "cuda_calculation: host allocation of %lu bytes failed\n",
                (unsigned long)size);
        return;
    }

    // Allocate array on device
    err = cudaMalloc((void **)&a_d, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cuda_calculation: cudaMalloc failed: %s\n",
                cudaGetErrorString(err));
        free(a_h);
        return;
    }

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    err = cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        // Do calculation on device. The grid size is rounded up so that a
        // partially filled last block still covers the tail of the array.
        const int block_size = 4;
        const int n_blocks = (N + block_size - 1) / block_size;

        // call() expects the device pointer itself (float*), not its address.
        call(n_blocks, block_size, a_d, N);

        // A kernel launch returns no status; a bad launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess)
    {
        // Retrieve result from device and store it in host array. This
        // blocking copy also surfaces errors raised while the kernel ran.
        err = cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess)
    {
        // Print results
        for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);
    }
    else
    {
        fprintf(stderr, "cuda_calculation: CUDA error: %s\n",
                cudaGetErrorString(err));
    }

    // Cleanup
    free(a_h);
    cudaFree(a_d);
}
#include<iostream>
#include "cud.h"
using namespace std;
// Host program entry point.
// Takes no command-line arguments and always exits with status 0.
int main()
{
    // All of the work is delegated to cuda_calculation(), which is only
    // declared (not defined) in this translation unit.
    cuda_calculation();

    // Report success to the operating system.
    return 0;
}
cuda.cpp
#include<iostream>
#include "cuda.h"
using namespace std;
void cuda_calculation();
// Host program entry point.
// Takes no command-line arguments and always exits with status 0.
int main()
{
    // All of the work is delegated to cuda_calculation(), which is only
    // declared (not defined) in this translation unit.
    cuda_calculation();

    // Report success to the operating system.
    return 0;
}
#include <stdio.h>
#include <cuda.h>
#include "cu.h"
// Squares the numbers 0..N-1 on the GPU and prints "index value" pairs.
//
// Steps: allocate a host and a device array of N floats, fill the host array
// with 0..N-1, copy it to the device, ask call() to launch the squaring
// kernel, copy the result back and print it to stdout.
//
// Takes no parameters and returns nothing. Any allocation or CUDA failure is
// reported on stderr; in that case no results are printed. Both buffers are
// released on every path.
void cuda_calculation()
{
    const int N = 10;                       // Number of elements in arrays
    const size_t size = N * sizeof(float);  // Size of each array in bytes
    float *a_h = NULL;                      // Host array
    float *a_d = NULL;                      // Device array
    cudaError_t err = cudaSuccess;

    // Allocate array on host
    a_h = (float *)malloc(size);
    if (a_h == NULL)
    {
        fprintf(stderr, "cuda_calculation: host allocation of %lu bytes failed\n",
                (unsigned long)size);
        return;
    }

    // Allocate array on device
    err = cudaMalloc((void **)&a_d, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cuda_calculation: cudaMalloc failed: %s\n",
                cudaGetErrorString(err));
        free(a_h);
        return;
    }

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    err = cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        // Do calculation on device. The grid size is rounded up so that a
        // partially filled last block still covers the tail of the array.
        const int block_size = 4;
        const int n_blocks = (N + block_size - 1) / block_size;

        // call() expects the device pointer itself (float*), not its address.
        call(n_blocks, block_size, a_d, N);

        // A kernel launch returns no status; a bad launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess)
    {
        // Retrieve result from device and store it in host array. This
        // blocking copy also surfaces errors raised while the kernel ran.
        err = cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess)
    {
        // Print results
        for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);
    }
    else
    {
        fprintf(stderr, "cuda_calculation: CUDA error: %s\n",
                cudaGetErrorString(err));
    }

    // Cleanup
    free(a_h);
    cudaFree(a_d);
}
#include<iostream>
#include "cud.h"
using namespace std;
// Host program entry point.
// Takes no command-line arguments and always exits with status 0.
int main()
{
    // All of the work is delegated to cuda_calculation(), which is only
    // declared (not defined) in this translation unit.
    cuda_calculation();

    // Report success to the operating system.
    return 0;
}
有一些错误:
`void cuda_calculation()` 需要通过头文件(cu.h)对 main.cpp 可见。
另外,请务必确保用 NVCC 编译你的 .cu 文件,而不是把它们当作标准 C++ 文件来编译。使用 CUDA 编译规则(默认作为 CUDA 工具包的一部分安装)可以简化这一过程。
经过长时间的尝试,我得到了正确的输出。
要在 .cpp 文件中使用 CUDA 标识符,我们不仅需要包含 cuda.h,还需要包含 cuda_runtime.h。
#include <stdio.h>
#include <cuda.h>
#include<cuda_runtime.h>
#include "cu.h"
#include "cud.h"
//void call(int , int ,float * , int );
// Squares the numbers 0..N-1 on the GPU and prints "index value" pairs.
//
// Steps: allocate a host and a device array of N floats, fill the host array
// with 0..N-1, copy it to the device, ask call() to launch the squaring
// kernel, copy the result back and print it to stdout.
//
// Takes no parameters and returns nothing. Any allocation or CUDA failure is
// reported on stderr; in that case no results are printed. Both buffers are
// released on every path.
void cuda_calculation()
{
    const int N = 10;                       // Number of elements in arrays
    const size_t size = N * sizeof(float);  // Size of each array in bytes
    float *a_h = NULL;                      // Host array
    float *a_d = NULL;                      // Device array
    cudaError_t err = cudaSuccess;

    // Allocate array on host
    a_h = (float *)malloc(size);
    if (a_h == NULL)
    {
        fprintf(stderr, "cuda_calculation: host allocation of %lu bytes failed\n",
                (unsigned long)size);
        return;
    }

    // Allocate array on device
    err = cudaMalloc((void **)&a_d, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cuda_calculation: cudaMalloc failed: %s\n",
                cudaGetErrorString(err));
        free(a_h);
        return;
    }

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    err = cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        // Do calculation on device. The grid size is rounded up so that a
        // partially filled last block still covers the tail of the array.
        const int block_size = 4;
        const int n_blocks = (N + block_size - 1) / block_size;

        // The kernel launch itself lives behind call(), so this function
        // contains no <<< >>> launch syntax.
        call(n_blocks, block_size, a_d, N);

        // A kernel launch returns no status; a bad launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess)
    {
        // Retrieve result from device and store it in host array. This
        // blocking copy also surfaces errors raised while the kernel ran.
        err = cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess)
    {
        // Print results
        for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);
    }
    else
    {
        fprintf(stderr, "cuda_calculation: CUDA error: %s\n",
                cudaGetErrorString(err));
    }

    // Cleanup
    free(a_h);
    cudaFree(a_d);
}
cuda.cpp
#include <stdio.h>
#include <cuda.h>
#include<cuda_runtime.h>
#include "cu.h"
#include "cud.h"
//void call(int , int ,float * , int );
// Squares the numbers 0..N-1 on the GPU and prints "index value" pairs.
//
// Steps: allocate a host and a device array of N floats, fill the host array
// with 0..N-1, copy it to the device, ask call() to launch the squaring
// kernel, copy the result back and print it to stdout.
//
// Takes no parameters and returns nothing. Any allocation or CUDA failure is
// reported on stderr; in that case no results are printed. Both buffers are
// released on every path.
void cuda_calculation()
{
    const int N = 10;                       // Number of elements in arrays
    const size_t size = N * sizeof(float);  // Size of each array in bytes
    float *a_h = NULL;                      // Host array
    float *a_d = NULL;                      // Device array
    cudaError_t err = cudaSuccess;

    // Allocate array on host
    a_h = (float *)malloc(size);
    if (a_h == NULL)
    {
        fprintf(stderr, "cuda_calculation: host allocation of %lu bytes failed\n",
                (unsigned long)size);
        return;
    }

    // Allocate array on device
    err = cudaMalloc((void **)&a_d, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cuda_calculation: cudaMalloc failed: %s\n",
                cudaGetErrorString(err));
        free(a_h);
        return;
    }

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    err = cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        // Do calculation on device. The grid size is rounded up so that a
        // partially filled last block still covers the tail of the array.
        const int block_size = 4;
        const int n_blocks = (N + block_size - 1) / block_size;

        // The kernel launch itself lives behind call(), so this function
        // contains no <<< >>> launch syntax.
        call(n_blocks, block_size, a_d, N);

        // A kernel launch returns no status; a bad launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess)
    {
        // Retrieve result from device and store it in host array. This
        // blocking copy also surfaces errors raised while the kernel ran.
        err = cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess)
    {
        // Print results
        for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);
    }
    else
    {
        fprintf(stderr, "cuda_calculation: CUDA error: %s\n",
                cudaGetErrorString(err));
    }

    // Cleanup
    free(a_h);
    cudaFree(a_d);
}
cu.h
// Host-callable wrapper that launches the square_array kernel.
// The visible caller (cuda_calculation) passes, in order: the number of
// blocks, the number of threads per block, the device array pointer and the
// element count.
void call(int , int ,float* , int );
// Runs the complete demo: allocates the arrays, invokes call() and prints
// the results. Takes no arguments and returns nothing.
void cuda_calculation();
// NOTE(review): this repeats the prototype of call() declared above. It is
// harmless to the compiler, but it may be a leftover from a second header
// (the sources also include "cud.h") - confirm which file it belongs to.
void call(int , int ,float* , int );
cu.cu
#include <stdio.h>
#include "cu.h"
#include <cuda.h>
// Device kernel: squares the elements of `a` in place, one element per thread.
//   a - array to modify (dereferenced on the device)
//   N - number of elements in `a`
// Each thread derives a single element index from its 1-D block and thread
// coordinates; threads whose index is not below N leave the array untouched.
__global__ void square_array(float *a, int N)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard: the grid may contain more threads than there are elements.
    if (gid >= N) return;

    const float value = a[gid];
    a[gid] = value * value;
}
//}
// Host-side launcher for the square_array kernel. It exists so that source
// files which do not contain the <<< >>> launch syntax can still start the
// kernel through an ordinary function call.
//   a - number of thread blocks in the 1-D grid
//   b - number of threads per block
//   c - device pointer to the array that is squared in place
//   d - number of elements in `c`
// Returns nothing. The launch reports no status here; a caller that needs to
// detect a bad launch configuration can query cudaGetLastError() afterwards.
void call(int a,int b,float* c,int d)
{
    // Launch with the configuration supplied by the caller instead of a fixed
    // 3 x 4 grid, so arrays longer than 12 elements are fully covered.
    square_array <<< a, b >>> (c, d);
}
#include <stdio.h>
#include "cu.h"
#include <cuda.h>
// Device kernel: squares the elements of `a` in place, one element per thread.
//   a - array to modify (dereferenced on the device)
//   N - number of elements in `a`
// Each thread derives a single element index from its 1-D block and thread
// coordinates; threads whose index is not below N leave the array untouched.
__global__ void square_array(float *a, int N)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard: the grid may contain more threads than there are elements.
    if (gid >= N) return;

    const float value = a[gid];
    a[gid] = value * value;
}
//}
// Host-side launcher for the square_array kernel. It exists so that source
// files which do not contain the <<< >>> launch syntax can still start the
// kernel through an ordinary function call.
//   a - number of thread blocks in the 1-D grid
//   b - number of threads per block
//   c - device pointer to the array that is squared in place
//   d - number of elements in `c`
// Returns nothing. The launch reports no status here; a caller that needs to
// detect a bad launch configuration can query cudaGetLastError() afterwards.
void call(int a,int b,float* c,int d)
{
    // Launch with the configuration supplied by the caller instead of a fixed
    // 3 x 4 grid, so arrays longer than 12 elements are fully covered.
    square_array <<< a, b >>> (c, d);
}
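#include <stdio.h>
#include "cu.h"
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) a[idx] = a[idx] * a[idx];
}
void call(int a,int b,float* c,int d)
{
square_array <<< 3,4 >>> (c,d);
}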
最近,我在 main.cpp 文件中包含了 "cuda.cpp",但同样的错误依然存在。
请确保用 NVCC 编译你的 .cu 文件,并贴出完整的命令行,以便我们帮你调试问题。
我是用 Visual Studio 运行代码的,全部代码已经贴在上面。
这里有两类文件:常规的 C++ 主机源文件(在 CPU 上运行)和 CUDA C++ 源文件(在 GPU 上运行)。前者照常用 Visual Studio 编译即可,后者则需要用 NVIDIA CUDA 工具包提供的名为 nvcc.exe 的工具来编译。总之,你需要打开 cu.cu 文件的属性,并确保它被设置为按 "CUDA C" 方式构建。如果想让事情简单一些,请先阅读有关 nvcc 以及如何在 Visual Studio 之外编译 CUDA 文件的资料。
我已经在 Visual Studio 中正确配置了 CUDA 5.0,把这段代码放在单个 .cu 文件中时,基本代码可以正常运行;但当我尝试把代码拆分到多个文件中管理时,就会出现错误。
你是否正确设置了生成自定义(Build Customization)?
是的,我设置了。现在问题已经解决,答案已单独贴出。