jCUDA中的函数cuMemcpyHtoD出错_Cuda_Jcuda

jCUDA中的函数cuMemcpyHtoD出错

cuda

jCUDA中的函数cuMemcpyHtoD出错,cuda,jcuda,Cuda,Jcuda,我是java编程新手，尝试用jCUDA编写矩阵乘法程序在将数据从主机传输到设备（反之亦然）时，我使用： cuMemcpyHtoD(devMatrixA, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT); cuMemcpyHtoD(devMatrixB, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT); cuMemcpyDtoH(Pointer.to(hos

我是java编程新手，尝试用jCUDA编写矩阵乘法程序

在将数据从主机传输到设备（反之亦然）时，我使用：

cuMemcpyHtoD(devMatrixA, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);
cuMemcpyHtoD(devMatrixB, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);
cuMemcpyDtoH(Pointer.to(hostMatrixC), devMatrixC, numRows * numCols * Sizeof.FLOAT);

这里，devMatrixA、devMatrixB 和 devMatrixC 是要存储在设备内存中的矩阵；hostMatrixA、hostMatrixB 和 hostMatrixC 是存储在主机内存中的矩阵。

当我调用上述函数进行数据传输时，编译器会报如下错误：“Pointer 类型中的方法 to(byte[]) 不适用于参数 (float[][])”，并且 “Pointer.to(” 中的 “to” 被标上了红色下划线。我使用的是 Eclipse。完整代码如下。

请原谅我的java知识，如果我走错了方向，请提出建议

Package JCudaMatrixAddition;
import static jcuda.driver.JCudaDriver.*;

import java.io.*;

import jcuda.*;
import jcuda.driver.*;
import jcuda.Pointer;
import jcuda.Sizeof;


public class JCudaMatrixAddition {
    /**
     * Entry point of the sample: sets up a CUDA context through the JCuda
     * driver API, loads the "add" kernel from a PTX file, uploads two
     * 32x32 host matrices, launches the kernel once and prints a matrix.
     *
     * NOTE(review): this is the code exactly as posted by the question's
     * author and it does not compile as-is. The known problems are marked
     * inline with NOTE(review) comments; the code itself is left untouched
     * so that it still matches the error described in the question.
     *
     * @param args command line arguments (not used)
     * @throws IOException declared by the method; presumably raised by
     *         preparePtxFile, which is not visible in this excerpt
     */
    public static void main(String[] args) throws IOException 
    {
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);

        // Create the PTX file by calling the NVCC
        // NOTE(review): preparePtxFile is not defined in the visible part
        // of this class; it has to be provided elsewhere.
        String ptxFilename = preparePtxFile("JCudaMatrixAdditionKernel.cu");

        //Initialize the driver and create a context for the first device.
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet (device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        //Load PTX file
        CUmodule module = new CUmodule();
        cuModuleLoad(module,ptxFilename);

        //Obtain a function pointer to the Add function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "add");

        int numRows = 32;
        int numCols = 32;

        //Allocate and fill Host input Matrices:
        // NOTE(review): these are float[][] (arrays of row arrays), not one
        // flat float[] of numRows * numCols elements.
        float hostMatrixA[][] = new float[numRows][numCols];
        float hostMatrixB[][] = new float[numRows][numCols];
        float hostMatrixC[][] = new float[numRows][numCols];


        // Fill both input matrices with 1.0 in every cell.
        for(int i = 0; i<numRows; i++)

        {
            for(int j = 0; j<numCols; j++)
            {
                hostMatrixA[i][j] = (float) 1.0;
                hostMatrixB[i][j] = (float) 1.0;
            }
        }
        // Allocate the device input data, and copy the
        // host input data to the device
        CUdeviceptr devMatrixA = new CUdeviceptr();
        cuMemAlloc(devMatrixA, numRows * numCols * Sizeof.FLOAT);

        //This is the part where it gives me the error
        // NOTE(review): the reported compile error is raised here because
        // hostMatrixA is a float[][]; the host data would have to be held
        // in a one-dimensional float[] to be passed to Pointer.to.
        cuMemcpyHtoD(devMatrixA, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);

        CUdeviceptr devMatrixB = new CUdeviceptr();
        cuMemAlloc(devMatrixB, numRows * numCols * Sizeof.FLOAT);

        //This is the part where it gives me the error
        // NOTE(review): same float[][] problem as above. In addition, the
        // source is hostMatrixA although the target is devMatrixB, so
        // hostMatrixB is never uploaded; presumably a copy-paste slip.
        cuMemcpyHtoD(devMatrixB, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);

        //Allocate device matrix C to store output
        CUdeviceptr devMatrixC = new CUdeviceptr();
        cuMemAlloc(devMatrixC, numRows * numCols * Sizeof.FLOAT);

        // Set up the kernel parameters: A pointer to an array
        // of pointers which point to the actual values.
        // NOTE(review): numRows is passed for both of the first two
        // parameters; the second one presumably should be numCols. The
        // kernel signature is not shown here, so confirm against the .cu file.

        Pointer kernelParameters = Pointer.to(Pointer.to(new int[]{numRows}),
                                   Pointer.to(new int[]{numRows}), 
                                   Pointer.to(devMatrixA),
                                   Pointer.to(devMatrixB),
                                   Pointer.to(devMatrixC));

        //Kernel thread configuration
        int blockSize = 32;
        int gridSize = 1;

        // Launch with a grid of (gridSize, 1, 1) blocks and blocks of
        // (blockSize, 32, 1) threads, i.e. one 32x32 block for these values.
        cuLaunchKernel(function, 
                       gridSize, 1, 1,
                       blockSize, 32, 1,
                       0, null, kernelParameters, null);

        cuCtxSynchronize();
        // Allocate host output memory and copy the device output
        // to the host.

        //This is the part where it gives me the error
        // NOTE(review): same float[][] problem as for the uploads above.
        cuMemcpyDtoH(Pointer.to(hostMatrixC), devMatrixC, numRows * numCols * Sizeof.FLOAT);

        //verify the result
        // NOTE(review): the inner loop is bounded by numRows rather than
        // numCols (both are 32 here), and the loop prints hostMatrixB, an
        // input, instead of the downloaded result hostMatrixC.
        for (int i =0; i<numRows; i++)
        {
            for (int j =0; j<numRows; j++)
            {
                System.out.print("   "+ hostMatrixB[i][j]);
            }
            System.out.println("");
        }
        // Release the three device allocations. The context created above
        // is not destroyed anywhere in this method.
        cuMemFree(devMatrixA);
        cuMemFree(devMatrixB);
        cuMemFree(devMatrixC);

    }

package JCudaMatrixAddition;
import static jcuda.driver.JCudaDriver.*;
import java.io.*;
import jcuda.*;
import jcuda.driver.*;
import jcuda.Pointer;
import jcuda.Sizeof;
public class JCudaMatrixAddition {
public static void main(String[] args) throws IOException
{
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Create the PTX file by calling the NVCC
String ptxFilename = preparePtxFile("JCudaMatrixAdditionKernel.cu");
//Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet (device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
//Load PTX file
CUmodule module = new CUmodule();
cuModuleLoad(module,ptxFilename);
//Obtain a function pointer to the Add function
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "add");
int numRows = 32;
int numCols = 32;
//Allocate and fill Host input Matrices:
float hostMatrixA[][] = new float[numRows][numCols];
float hostMatrixB[][] = new float[numRows][numCols];
float hostMatrixC[][] = new float[numRows][numCols];
for (int i = 0; i < numRows; i++) …（此处代码被截断，其余部分与上文的完整代码相同）

您不能将 float[][] 数组从主机直接复制到设备。

当您创建一个 float[][] 数组时，它并不是一个大型的、连续的 float 值数组，而是一个“数组的数组”。想象一下，您甚至可以创建一个类似下面这样的数组：
float array[][] = new float[3][];
array[0] = new float[42];
array[1] = null;
array[2] = new float[1234];

这根本不是一个连续的内存块，因此，这样的数组不能复制到设备上
在CUDA中处理矩阵时（不仅在JCuda中，而且通常在CUDA中），它们通常表示为一维数组
float hostMatrixA[] = new float[numRows*numCols];

为了访问矩阵元素，必须计算适当的索引：
int row = ...;
int col = ...;
hostMatrix[col+row*numCols] = 123.0f; // Row-major

// Or
hostMatrix[row+col*numRows] = 123.0f; // Column-major

这两种索引写法的区别在于：一种采用行主序（row-major，同一行的元素在内存中相邻），另一种采用列主序（column-major，同一列的元素在内存中相邻）。有关详细信息，请参阅关于行主序与列主序（row- and column-major order）的资料。
一些旁注：
CUDA 矩阵库（如 CUBLAS）使用列主序，因此遵循相同的约定可能是一个好主意，尤其是当您以后想要使用 CUBLAS/JCublas 函数时。例如，CUBLAS 的 geam 函数（如 cublasSgeam）已经提供了执行矩阵加法的功能。
当您只想进行矩阵加法时，在使用CUDA/JCuda时不会看到加速
顺便说一句：从技术上讲，使用“二维数组”也是可行的，有示例展示了具体做法（原文此处的链接已缺失）。但这相当不方便，不推荐用于矩阵运算。