Visual c++ 在.cpp文件中分离cuda主机代码

Visual c++ 在.cpp文件中分离cuda主机代码,visual-c++,cuda,Visual C++,Cuda,main.cpp #include<iostream> #include "cuda.h" using namespace std; void cuda_calculation(); int main() { cuda_calculation(); return 0; } #include <stdio.h> #include <cuda.h> #include "cu.h" void cuda_calculation() {

main.cpp

#include<iostream>
#include "cuda.h"


using namespace std;
void cuda_calculation();


// Program entry point: runs cuda_calculation() once and exits with status 0.
int main()
{
    cuda_calculation();
    return 0;
}
#include <stdio.h>
#include <cuda.h>
#include "cu.h"




// Fills a 10-element host array with 0..9, copies it to a device array,
// asks the device to process it through call(), copies the array back and
// prints every element as "index value".
// None of the CUDA API return values are checked here.
void cuda_calculation()
{
  float *a_h, *a_d;  // Pointer to host & device arrays
  const int N = 10;  // Number of elements in arrays
  size_t size = N * sizeof(float);  // Size of each array in bytes
  a_h = (float *)malloc(size);        // Allocate array on host
  cudaMalloc((void **) &a_d, size);   // Allocate array on device
  // Initialize host array and copy it to CUDA device
  for (int i=0; i<N; i++) a_h[i] = (float)i;
  cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
  // Do calculation on device:
  int block_size = 4;
  // Round up so that n_blocks * block_size covers all N elements
  int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
  // NOTE(review): as posted, this line starts with `void`, so it is not a
  // plain function call, and it passes `&a_d` (a float**) where cu.h declares
  // the third parameter of call() as float*. The corrected listing further
  // down writes `call(n_blocks, block_size,a_d, N);` instead.
  void call(n_blocks, block_size,&a_d, N);
  /*square_array <<< n_blocks, block_size >>> (a_d, N);*/
  // Retrieve result from device and store it in host array
  cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
  // Print results
  for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
  // Cleanup
  free(a_h); cudaFree(a_d);
}
#include<iostream>
#include "cud.h"


using namespace std;



// Program entry point: runs cuda_calculation() once and exits with status 0.
int main()
{
    cuda_calculation();
    return 0;
}
cuda.cpp

#include<iostream>
#include "cuda.h"


using namespace std;
void cuda_calculation();


// Program entry point: runs cuda_calculation() once and exits with status 0.
int main()
{
    cuda_calculation();
    return 0;
}
#include <stdio.h>
#include <cuda.h>
#include "cu.h"




// Fills a 10-element host array with 0..9, copies it to a device array,
// asks the device to process it through call(), copies the array back and
// prints every element as "index value".
// None of the CUDA API return values are checked here.
void cuda_calculation()
{
  float *a_h, *a_d;  // Pointer to host & device arrays
  const int N = 10;  // Number of elements in arrays
  size_t size = N * sizeof(float);  // Size of each array in bytes
  a_h = (float *)malloc(size);        // Allocate array on host
  cudaMalloc((void **) &a_d, size);   // Allocate array on device
  // Initialize host array and copy it to CUDA device
  for (int i=0; i<N; i++) a_h[i] = (float)i;
  cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
  // Do calculation on device:
  int block_size = 4;
  // Round up so that n_blocks * block_size covers all N elements
  int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
  // NOTE(review): as posted, this line starts with `void`, so it is not a
  // plain function call, and it passes `&a_d` (a float**) where cu.h declares
  // the third parameter of call() as float*. The corrected listing further
  // down writes `call(n_blocks, block_size,a_d, N);` instead.
  void call(n_blocks, block_size,&a_d, N);
  /*square_array <<< n_blocks, block_size >>> (a_d, N);*/
  // Retrieve result from device and store it in host array
  cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
  // Print results
  for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
  // Cleanup
  free(a_h); cudaFree(a_d);
}
#include<iostream>
#include "cud.h"


using namespace std;



// Program entry point: runs cuda_calculation() once and exits with status 0.
int main()
{
    cuda_calculation();
    return 0;
}

有一些错误:
void cuda_calculation()需要通过头文件(cu.h)对main.cpp可见


另外,请务必使用 NVCC 来编译你的 .cu 文件,而不是把它当作标准 C++ 文件来编译。使用 CUDA 编译规则(默认作为 CUDA 工具包的一部分安装)可以简化这一过程。

经过长时间的试用,我得到了正确的输出

要在cpp文件中包括cuda标识符,我们不仅需要包括cuda.h,还需要包括cuda_runtime.h

#include <stdio.h>
#include <cuda.h>
#include<cuda_runtime.h>

#include "cu.h"
#include "cud.h"





//void call(int , int ,float * , int  );

void cuda_calculation()
{
  float *a_h, *a_d;  // Pointer to host & device arrays
  const int N = 10;  // Number of elements in arrays
  size_t size = N * sizeof(float);
  a_h = (float *)malloc(size);        // Allocate array on host
  cudaMalloc((void **) &a_d, size);   // Allocate array on device
  // Initialize host array and copy it to CUDA device
  for (int i=0; i<N; i++) a_h[i] = (float)i;
  cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
  // Do calculation on device:
  int block_size = 4;
  int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
   call(n_blocks, block_size,a_d, N);
  /*square_array <<< n_blocks, block_size >>> (a_d, N);*/
  // Retrieve result from device and store it in host array
  cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
  // Print results
  for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
  // Cleanup
  free(a_h); 
  cudaFree(a_d);
}
cuda.cpp

#include <stdio.h>
#include <cuda.h>
#include<cuda_runtime.h>

#include "cu.h"
#include "cud.h"





//void call(int , int ,float * , int  );

void cuda_calculation()
{
  float *a_h, *a_d;  // Pointer to host & device arrays
  const int N = 10;  // Number of elements in arrays
  size_t size = N * sizeof(float);
  a_h = (float *)malloc(size);        // Allocate array on host
  cudaMalloc((void **) &a_d, size);   // Allocate array on device
  // Initialize host array and copy it to CUDA device
  for (int i=0; i<N; i++) a_h[i] = (float)i;
  cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
  // Do calculation on device:
  int block_size = 4;
  int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
   call(n_blocks, block_size,a_d, N);
  /*square_array <<< n_blocks, block_size >>> (a_d, N);*/
  // Retrieve result from device and store it in host array
  cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
  // Print results
  for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
  // Cleanup
  free(a_h); 
  cudaFree(a_d);
}
cu.h

// Host-side wrapper implemented in cu.cu that launches the square_array
// kernel. cuda_calculation() passes, in order: n_blocks, block_size, the
// device array pointer, and the element count N.
void call(int , int ,float*  , int  );
// Runs the whole demo: fills an array, squares it on the device through
// call(), and prints the results.
void cuda_calculation();
// Repeated declaration of call(); identical to the one above.
void call(int , int ,float*  , int  );
cu.cu

#include <stdio.h>
#include "cu.h"
#include <cuda.h>

// Device kernel: each thread squares, in place, the one element of `a` whose
// index equals its global 1-D thread index. Threads whose index falls at or
// beyond N do nothing.
__global__ void square_array(float *a, int N)
{
  const int tid = threadIdx.x + blockDim.x * blockIdx.x;
  if (tid >= N) return;  // surplus thread in the last block

  const float value = a[tid];
  a[tid] = value * value;
}


//}


/*
 * Host-side launcher for the square_array kernel.
 *
 *   a - number of blocks in the 1-D grid
 *   b - number of threads per block
 *   c - pointer to the array that the kernel squares in place
 *       (the caller passes memory obtained from cudaMalloc)
 *   d - number of elements in c
 *
 * The launch configuration comes from a and b rather than fixed values, so
 * arrays of any length are fully processed as long as a * b >= d.
 * No error checking is done here; a caller can query cudaGetLastError()
 * after this function returns.
 */
void call(int a,int b,float* c,int d)
{
  if (d <= 0) return;  // nothing to square; avoids launching an empty grid
  square_array <<< a, b >>> (c, d);
}
#include <stdio.h>
#include "cu.h"
#include <cuda.h>

// Device kernel: each thread squares, in place, the one element of `a` whose
// index equals its global 1-D thread index. Threads whose index falls at or
// beyond N do nothing.
__global__ void square_array(float *a, int N)
{
  const int tid = threadIdx.x + blockDim.x * blockIdx.x;
  if (tid >= N) return;  // surplus thread in the last block

  const float value = a[tid];
  a[tid] = value * value;
}


//}


/*
 * Host-side launcher for the square_array kernel.
 *
 *   a - number of blocks in the 1-D grid
 *   b - number of threads per block
 *   c - pointer to the array that the kernel squares in place
 *       (the caller passes memory obtained from cudaMalloc)
 *   d - number of elements in c
 *
 * The launch configuration comes from a and b rather than fixed values, so
 * arrays of any length are fully processed as long as a * b >= d.
 * No error checking is done here; a caller can query cudaGetLastError()
 * after this function returns.
 */
void call(int a,int b,float* c,int d)
{
  if (d <= 0) return;  // nothing to square; avoids launching an empty grid
  square_array <<< a, b >>> (c, d);
}
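#include <stdio.h>
#include "cu.h"
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx<N) a[idx] = a[idx] * a[idx];
}

void call(int a,int b,float* c,int d)
{
square_array <<< 3,4 >>> (c,d);
}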

最近,我在 main.cpp 文件中包含了“cuda.cpp”,但同样的错误仍然存在。——请确保用 NVCC 编译你的 .cu 文件,并贴出完整的命令行,以便我们帮你调试问题。——我是用 Visual Studio 运行代码的,全部代码已经贴在上面。——这里有两种文件:常规的 C++ 主机源文件(在 CPU 上运行)和 CUDA C++ 源文件(在 GPU 上运行)。前者像你一直以来那样用 Visual Studio 编译;后者则需要用 NVIDIA CUDA 工具包提供的名为“nvcc.exe”的工具来编译。总之,你需要打开 cu.cu 文件的属性,并确保它使用“CUDA C”生成规则。如果想让事情简单一些,请先阅读有关 nvcc 以及如何在 Visual Studio 之外编译 CUDA 文件的资料。——我已在 Visual Studio 中正确配置了 CUDA 5.0;把代码放在单个 .cu 文件中时,基本代码可以正常运行,但当我尝试把代码拆分到多个文件中管理时,就会出现错误。——你是否正确设置了生成自定义(Build Customization)?——是的,我设置了;现在问题已经解决,答案如下。