C# Cudafy内核不编译_C#_Cuda_Cudafy.net

C# Cudafy内核不编译

c# cuda

C# Cudafy内核不编译,c#,cuda,cudafy.net,C#,Cuda,Cudafy.net,使用Cudafy完成我的第一步，并尝试编写一个函数，该函数将获取其线程的位置，并在此基础上将一些int值保存到数组元素中。我的代码：因此，我无法计算这里的threadY。如果我在计算中使用多个变量，Cudafy translating类将抛出一个错误（CudafyModule cm=CudafyTranslator.Cudafy（）；抛出一个Cudafy.cudafylanaguageexception）我做错了什么更新：这是在GPU上运行内核的代码： public void RunT

使用Cudafy完成我的第一步，并尝试编写一个函数，该函数将获取其线程的位置，并在此基础上将一些int值保存到数组元素中。我的代码：

因此，我无法计算这里的

threadY

。如果我在计算中使用多个变量，CudafyTranslator 类就会抛出错误（

CudafyModule cm = CudafyTranslator.Cudafy();

抛出 Cudafy.CudafyLanguageException）。

我做错了什么

更新：这是在GPU上运行内核的代码：

/// <summary>
/// Translates and loads the Cudafy module, launches the <c>GenerateRipples</c>
/// kernel over a 5x5 grid of 4x4-thread blocks (400 threads, one per element
/// of the 20*20 result buffer) and copies the device buffer back to the host.
/// </summary>
/// <remarks>
/// Device memory is released in a <c>finally</c> block so that a failing
/// launch or copy does not leave the allocation behind on the GPU.
/// </remarks>
public void RunTest2()
{
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    CudafyModule km = CudafyTranslator.Cudafy();
    gpu.LoadModule(km);

    int size = 20 * 20;
    int[] allPixels = new int[size];

    try
    {
        int[] dev_result = gpu.Allocate<int>(size);

        // 5 * 5 blocks, each 4 * 4 threads => 400 threads, matching `size`.
        dim3 blocksInGrid = new dim3(5, 5);
        dim3 threadsPerBlock = new dim3(4, 4);

        gpu.Launch(blocksInGrid, threadsPerBlock).GenerateRipples(dev_result);
        gpu.CopyFromDevice(dev_result, allPixels);
    }
    finally
    {
        // Always give the device memory back, even when the launch or copy throws.
        gpu.FreeAll();
    }
}

public void RunTest2()
{
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    CudafyModule km = CudafyTranslator.Cudafy();
    gpu.LoadModule(km);
    int size = 20 * 20;
    int[] allPixels = new int[size];
    int[] dev_result = gpu.Allocate<int>(size);
    dim3 blocksInGrid = new dim3(5, 5);
    dim3 threadsPerBlock = new dim3(4, 4);
    gpu.Launch(blocksInGrid, threadsPerBlock).GenerateRipples(dev_result);
    gpu.CopyFromDevice(dev_result, allPixels);
    gpu.FreeAll();
}

我们需要了解您是如何启动内核的——上面的代码本身应该可以正常运行。我创建了一个运行良好的测试类，并给出了如何设置内核的网格/块/线程维度的示例。如果想看更好的示例，请下载 Cudafy 源代码并编译 CudafyExamples 项目，看看其中是如何准备和使用 Cudafy 的各项功能的。

**注意：**在我发布第一个版本的类之前，我一定是抽了什么好东西——我当时忘了验证它不会产生内存访问冲突。

修正了下面的类，没有冲突

在和上查找好的示例

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;
namespace FxKernelTest
{
    public class FxKernTest
    {
        public GPGPU fxgpu;
        public const int N = 1024 * 64;
        public void ExeTestKernel()
        {
            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
            eArchitecture arch = gpu.GetArchitecture();
            CudafyModule km = CudafyTranslator.Cudafy(arch);
            gpu.LoadModule(km);
            int[] host_results = new int[N];
            //或者分配一个新的内存块来保存设备上的结果
            var dev_results = gpu.Allocate<int>(N);
            gpu.Set<int>(dev_results);
            //或者先用值填充数组，然后
            for (int i = 0; i < N; i++) host_results[i] = i * 3;


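ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'GenerateRipples' for 'sm_30'
ptxas info : Function properties for GenerateRipples
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 5 registers, 328 bytes cmem[0]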
我运行了上面的代码，它只是把 int[] 结果的每个元素都填成了 255，但计算本身没有问题。好吧，我一开始的索引写得确实不怎么样——我以前从没把四个维度（网格 X/Y 和块内 X/Y）映射到一维数组过。我添加了一个测试来确保索引正确：
/// <summary>
/// Translates and loads the Cudafy module, launches the <c>GenerateRipples</c>
/// kernel over a 5x5 grid of 4x4-thread blocks (400 threads, one per element
/// of the 20*20 result buffer) and copies the device buffer back to the host.
/// </summary>
/// <remarks>
/// Device memory is released in a <c>finally</c> block so that a failing
/// launch or copy does not leave the allocation behind on the GPU.
/// </remarks>
public void RunTest2()
{
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    CudafyModule km = CudafyTranslator.Cudafy();
    gpu.LoadModule(km);

    int size = 20 * 20;
    int[] allPixels = new int[size];

    try
    {
        int[] dev_result = gpu.Allocate<int>(size);

        // 5 * 5 blocks, each 4 * 4 threads => 400 threads, matching `size`.
        dim3 blocksInGrid = new dim3(5, 5);
        dim3 threadsPerBlock = new dim3(4, 4);

        gpu.Launch(blocksInGrid, threadsPerBlock).GenerateRipples(dev_result);
        gpu.CopyFromDevice(dev_result, allPixels);
    }
    finally
    {
        // Always give the device memory back, even when the launch or copy throws.
        gpu.FreeAll();
    }
}

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;

using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;

namespace FxKernelTest
{
    /// <summary>
    /// Host-side harness and Cudafy kernel that together check the mapping from
    /// a 2D grid of 2D thread blocks onto a flat, one-dimensional result array.
    /// </summary>
    public class FxKernTest
    {
        public GPGPU fxgpu;

        /// <summary>
        /// Number of elements in the result buffer; also the number of threads
        /// launched by <see cref="ExeTestKernel"/> (8 * 8 blocks of 64 * 16 threads).
        /// </summary>
        public const int N = 1024 * 64;

        /// <summary>
        /// Translates and loads the module for the device's architecture, launches
        /// <see cref="GenerateRipples"/> with one thread per element and verifies
        /// that every element of the copied-back buffer equals its own index.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when an element of the result buffer does not hold its own index,
        /// i.e. the kernel's flat-index computation is wrong.
        /// </exception>
        public void ExeTestKernel()
        {
            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
            eArchitecture arch = gpu.GetArchitecture();
            CudafyModule km = CudafyTranslator.Cudafy(arch);

            gpu.LoadModule(km);

            int[] host_results = new int[N];

            try
            {
                // Allocate the result buffer on the device and clear it with Set
                // (presumably a zero-fill - confirm against the Cudafy docs).
                // Alternative: fill host_results on the host and upload it with
                // gpu.CopyToDevice(host_results) instead of Allocate + Set.
                var dev_results = gpu.Allocate<int>(N);
                gpu.Set<int>(dev_results);

                // 64*16 = 1024 threads per block (which is max for sm_30)
                dim3 threadsPerBlock = new dim3(64, 16);

                // 8*8 = 64 blocks per grid, 1024 threads per block = kernel launched 65536 times
                dim3 blocksPerGrid = new dim3(8, 8);

                // NOTE(review): launching with scalar sizes (e.g. 1024 threads, 1 block)
                // would make the launch one-dimensional, so only the .x components
                // would carry information in the kernel's index math.

                gpu.Launch(blocksPerGrid, threadsPerBlock, "GenerateRipples", dev_results);

                gpu.CopyFromDevice(dev_results, host_results);
            }
            finally
            {
                // Release the device allocation even when the launch or copy throws.
                gpu.FreeAll();
            }

            // Test our results: every thread must have written its own flat index.
            for (int index = 0; index < N; index++)
            {
                if (host_results[index] != index)
                {
                    throw new InvalidOperationException("Check your indexing math, genius!!!");
                }
            }
        }

        /// <summary>
        /// Kernel: computes the calling thread's flat index from its block and
        /// thread coordinates and stores that index into <paramref name="results"/>.
        /// </summary>
        /// <param name="thread">Cudafy thread context supplying blockIdx, threadIdx, blockDim and gridDim.</param>
        /// <param name="results">Output buffer; element <c>tid</c> receives <c>tid</c> when <c>tid &lt; N</c>.</param>
        [Cudafy]
        public static void GenerateRipples(GThread thread, int[] results)
        {
            // Threads in one block, and threads in one full row of blocks.
            var blockSize = thread.blockDim.x * thread.blockDim.y;
            var offsetToGridY = blockSize * thread.gridDim.x;

            // Flatten (blockIdx.y, blockIdx.x, threadIdx.y, threadIdx.x) into one index.
            // The sizes below are for the 8x8 grid of 64x16 blocks used by ExeTestKernel.
            var tid = thread.blockIdx.y * offsetToGridY +       // each Grid Y is 8192 in size
                      thread.blockIdx.x * blockSize +           // each Grid X is 1024 in size
                      thread.threadIdx.y * thread.blockDim.x +  // each Block Y is 64 in size
                      thread.threadIdx.x;                       // index into block

            var threadPosInBlockX = thread.threadIdx.x;
            var threadPosInBlockY = thread.threadIdx.y;

            var blockPosInGridX = thread.blockIdx.x;
            var blockPosInGridY = thread.blockIdx.y;

            var blockSizeX = thread.blockDim.x;
            var blockSizeY = thread.blockDim.y;

            // The 2D coordinates below are not used for the store; they are kept to
            // show that expressions combining several locals translate without error.
            var threadX = blockSizeX * blockPosInGridX + threadPosInBlockX;

            // Single-variable initialisation...
            var threadY = blockSizeY;

            // ...and the multi-variable computation both calculate just fine.
            threadY = blockSizeY * blockPosInGridY + threadPosInBlockY;

            // hint: use NSight for Visual Studio and look at the NSight output,
            // it reports access violations and tells you where...

            // Only write when the thread id is within the bounds of the array;
            // writing past the end causes an access violation.
            // (class constants are automatically passed to kernels)
            if (tid < N)
            {
                results[tid] = tid;
            }
        }
    }
}