C#:Cudafy 内核无法编译
标签:c#、cuda、cudafy.net
我刚开始使用 Cudafy,尝试编写一个函数:它获取当前线程的位置,并据此把一些 int 值保存到数组元素中。 我的代码: 因此,我无法计算这里的
threadY
。如果我在计算中使用多个变量,Cudafy 的翻译器类就会抛出错误(CudafyModule cm = CudafyTranslator.Cudafy();
这一行抛出 Cudafy.CudafyLanguageException)。
我做错了什么?
更新:
这是在GPU上运行内核的代码:
/// <summary>
/// Translates and loads the Cudafy module, launches the <c>GenerateRipples</c>
/// kernel on a 5x5 grid of 4x4-thread blocks, and copies the device buffer
/// back into a host array.
/// </summary>
/// <remarks>
/// The launch shape yields 5 * 5 * 4 * 4 = 400 threads, which equals the
/// 20 * 20 element count of the buffers allocated here.
/// </remarks>
public void RunTest2()
{
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    CudafyModule km = CudafyTranslator.Cudafy();
    gpu.LoadModule(km);

    try
    {
        int size = 20 * 20;
        int[] allPixels = new int[size];
        int[] dev_result = gpu.Allocate<int>(size);

        dim3 blocksInGrid = new dim3(5, 5);
        dim3 threadsPerBlock = new dim3(4, 4);

        gpu.Launch(blocksInGrid, threadsPerBlock).GenerateRipples(dev_result);
        gpu.CopyFromDevice(dev_result, allPixels);
    }
    finally
    {
        // Release the device allocation even when the launch or the copy-back
        // throws; previously FreeAll was skipped on any failure.
        gpu.FreeAll();
    }
}
public void RunTest2()
{
GPGPU gpu=CudafyHost.GetDevice(CudafyModes.Target,CudafyModes.DeviceId);
CudafyModule km=CudafyTranslator.Cudafy();
gpu.LoadModule(km);
int size = 20 * 20;
int[] allPixels = new int[size];
int[] dev_result = gpu.Allocate<int>(size);
dim3 blocksInGrid = new dim3(5, 5);
dim3 threadsPerBlock = new dim3(4, 4);
gpu.Launch(blocksInGrid, threadsPerBlock).GenerateRipples(dev_result);
gpu.CopyFromDevice(dev_result, allPixels);
gpu.FreeAll();
}
我们需要看到你是如何启动内核的;上面的代码本身应该可以正常运行。我写了一个可以正常运行的测试类,并在其中示范了如何设置内核的网格/块/线程维度。
如果想看更好的示例,请下载 Cudafy 源代码并编译 CudafyExamples 项目,看看其中是如何准备和使用 Cudafy 各项功能的。
注意:我发布这个类的第一个版本时一定是昏了头,忘了验证它是否会产生内存访问冲突。
下面的类已经修正,不再有访问冲突。
可以在以下位置找到好的示例(原帖中的两个链接在此处已丢失)。
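using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;
namespace FxKernelTest
{
public class FxKernTest
{
public GPGPU fxgpu;
public const int N = 1024 * 64;
public void ExeTestKernel()
{
GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
eArchitecture arch = gpu.GetArchitecture();
CudafyModule km = CudafyTranslator.Cudafy(arch);
gpu.LoadModule(km);
int[] host_results = new int[N];
// Either assign a new block of memory to hold results on device
var dev_results = gpu.Allocate<int>(N);
gpu.Set<int>(dev_results);
// Or fill your array with values first and then
for (int i = 0; i < N; i++) host_results[i] = i * 3;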
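ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'GenerateRipples' for 'sm_30'
ptxas info : Function properties for GenerateRipples
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 5 registers, 328 bytes cmem[0]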
我运行了上面的代码,它只是把 int[] 结果数组的每个元素都填成了 255,不过计算本身没有问题。好吧,我最初的索引写得并不怎么样——我以前从未把四个维度的网格/线程索引映射到一维数组。我添加了一个测试来确保索引正确。
/// <summary>
/// Runs the <c>GenerateRipples</c> kernel once: loads the translated module,
/// allocates a 20 * 20 int buffer on the device, launches 5x5 blocks of 4x4
/// threads, and copies the buffer back to the host.
/// </summary>
/// <remarks>
/// 5 * 5 blocks times 4 * 4 threads is 400 threads, the same as the number of
/// elements allocated.
/// </remarks>
public void RunTest2()
{
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    CudafyModule km = CudafyTranslator.Cudafy();
    gpu.LoadModule(km);

    try
    {
        int size = 20 * 20;
        int[] allPixels = new int[size];
        int[] dev_result = gpu.Allocate<int>(size);

        dim3 blocksInGrid = new dim3(5, 5);
        dim3 threadsPerBlock = new dim3(4, 4);

        gpu.Launch(blocksInGrid, threadsPerBlock).GenerateRipples(dev_result);
        gpu.CopyFromDevice(dev_result, allPixels);
    }
    finally
    {
        // Always give the device memory back; without the finally block an
        // exception during launch or copy-back would leak the allocation.
        gpu.FreeAll();
    }
}
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;
namespace FxKernelTest
{
    /// <summary>
    /// Self-checking example that launches a 2-D grid of 2-D blocks and has every
    /// thread write its flattened global index into a 1-D result array.
    /// </summary>
    public class FxKernTest
    {
        /// <summary>Not read or written by the code in this class.</summary>
        public GPGPU fxgpu;

        /// <summary>
        /// Number of result elements (1024 * 64 = 65536). This equals the number of
        /// threads launched by <see cref="ExeTestKernel"/>: 8 * 8 blocks of 64 * 16 threads.
        /// </summary>
        public const int N = 1024 * 64;

        /// <summary>
        /// Translates and loads the module, runs <see cref="GenerateRipples"/> over
        /// <see cref="N"/> elements, and verifies that element <c>i</c> holds <c>i</c>.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Any element of the copied-back array differs from its own index.
        /// </exception>
        public void ExeTestKernel()
        {
            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
            eArchitecture arch = gpu.GetArchitecture();
            CudafyModule km = CudafyTranslator.Cudafy(arch);
            gpu.LoadModule(km);

            try
            {
                int[] host_results = new int[N];

                // Allocate the device buffer and clear it with Set.
                var dev_results = gpu.Allocate<int>(N);
                gpu.Set<int>(dev_results);

                // Pre-fill the host array with values that differ from the expected
                // ones (i * 3 instead of i) so the check below cannot pass unless the
                // copy-back really overwrote it. The original listing noted that this
                // array could instead be uploaded with gpu.CopyToDevice(host_results).
                for (int i = 0; i < N; i++)
                {
                    host_results[i] = i * 3;
                }

                // 64 * 16 = 1024 threads per block (which is max for sm_30).
                dim3 threadsPerBlock = new dim3(64, 16);

                // 8 * 8 = 64 blocks per grid; 64 blocks * 1024 threads = 65536 kernel invocations.
                dim3 blocksPerGrid = new dim3(8, 8);

                // NOTE(review): a scalar launch configuration (e.g. 1024 threads, 1 block)
                // would presumably populate only the x dimension - confirm against the
                // Cudafy dim3 documentation before relying on it.
                gpu.Launch(blocksPerGrid, threadsPerBlock, "GenerateRipples", dev_results);
                gpu.CopyFromDevice(dev_results, host_results);

                // Every thread wrote its own flattened index, so element i must equal i.
                for (int index = 0; index < N; index++)
                {
                    if (host_results[index] != index)
                    {
                        throw new InvalidOperationException("Check your indexing math, genius!!!");
                    }
                }
            }
            finally
            {
                // Release device memory on success and on failure alike; the original
                // never freed the allocation made above.
                gpu.FreeAll();
            }
        }

        /// <summary>
        /// Kernel: computes this thread's flattened index across the 2-D grid of 2-D
        /// blocks and stores that index at the same position in <paramref name="results"/>.
        /// </summary>
        /// <param name="thread">Thread context supplying threadIdx, blockIdx, blockDim and gridDim.</param>
        /// <param name="results">Output array; positions at or beyond <see cref="N"/> are never written.</param>
        [Cudafy]
        public static void GenerateRipples(GThread thread, int[] results)
        {
            var blockSize = thread.blockDim.x * thread.blockDim.y;
            var offsetToGridY = blockSize * thread.gridDim.x;

            // Flatten (blockIdx.y, blockIdx.x, threadIdx.y, threadIdx.x) into one index.
            // The sizes quoted below are for the 8x8 / 64x16 launch in ExeTestKernel.
            var tid = thread.blockIdx.y * offsetToGridY +          // each grid row is 8192 in size
                      thread.blockIdx.x * blockSize +              // each block is 1024 in size
                      thread.threadIdx.y * thread.blockDim.x +     // each block row is 64 in size
                      thread.threadIdx.x;                          // index into the block row

            var threadPosInBlockX = thread.threadIdx.x;
            var threadPosInBlockY = thread.threadIdx.y;
            var blockPosInGridX = thread.blockIdx.x;
            var blockPosInGridY = thread.blockIdx.y;
            var blockSizeX = thread.blockDim.x;
            var blockSizeY = thread.blockDim.y;

            // The calculations from the question, kept to show that expressions over
            // several variables translate; their results are not used further.
            var threadX = blockSizeX * blockPosInGridX + threadPosInBlockX;
            var threadY = blockSizeY;
            threadY = blockSizeY * blockPosInGridY + threadPosInBlockY;

            // Hint from the original author: NSight for Visual Studio reports access
            // violations and where they occur.
            // Only write when the index is inside the array; an unguarded write past
            // the end would be an access violation.
            // (Per the original author, class constants are passed to kernels automatically.)
            if (tid < N)
            {
                results[tid] = tid;
            }
        }
    }
}