使用C#、AleaGPU和设备内存时，代码相同，行为不同_C#_.net_Wpf_Visual Studio_Aleagpu

使用C#、AleaGPU和设备内存时，代码相同，行为不同

c# .net wpf visual-studio

使用C#、AleaGPU和设备内存时，代码相同，行为不同,c#,.net,wpf,visual-studio,aleagpu,C#,.net,Wpf,Visual Studio,Aleagpu,我正在使用AleaGPU库来执行矩阵乘法和类似的操作，但我似乎无法理解为什么我的代码不能按预期工作 “不按预期工作”是指结果矩阵的第一行（或前几行）具有正确的值，其余行都用0填充，代码与我在下面的其他代码示例中使用的代码相同函数#1（不起作用）：由于某种原因，此函数不起作用，并且具有上述行为。听起来我混淆了索引，但我看不出下面三个示例的代码有什么不同，而且我没有收到任何类型的错误（AleaGPU在尝试访问无效数组位置时通常会抛出异常）公共静态双精度[，]乘法（[NotNull]此双精度[，]

我正在使用AleaGPU库来执行矩阵乘法和类似的操作，但我似乎无法理解为什么我的代码不能按预期工作

“不按预期工作”是指：结果矩阵中只有第一行（或前几行）的值是正确的，其余各行全部被 0 填充，尽管这段代码的逻辑与下面其他示例中使用的代码相同。

函数#1（不起作用）：由于某种原因，此函数不起作用，并且具有上述行为。听起来我混淆了索引，但我看不出下面三个示例的代码有什么不同，而且我没有收到任何类型的错误（AleaGPU在尝试访问无效数组位置时通常会抛出异常）

公共静态双精度[，]乘法（[NotNull]此双精度[，]m1[NotNull]双精度[，]m2）
{
//检查
如果（m1.GetLength（1）！=m2.GetLength（0））抛出新ArgumentOutOfRangeException（“无效矩阵大小”）；
//初始化参数和结果矩阵
inth=m1.GetLength（0）；
int w=m2.GetLength（1）；
int l=m1.GetLength（1）；
//并行执行乘法运算
使用（DeviceMemory2D m1\U设备=Gpu.Default.AllocateDevice（m1））
使用（DeviceMemory2D m2\U设备=Gpu.Default.AllocateDevice（m2））
使用（DeviceMemory2D mresult_device=Gpu.Default.AllocateDevice（h，w））
{
//指针设置
deviceptr
pm1=m1_device.Ptr，
pm2=m2_设备.Ptr，
pmresult=mresult_device.Ptr；
//局部包装函数
无效内核（intki）
{
//计算当前索引
int
i=ki/w，
j=ki%w；
//执行乘法运算
双和=0；
int im1=i*l；
对于（int k=0；k


我花了好几个小时来检查这段代码，试图检查每一行，但我真的看不出它有什么问题

这个版本可以正常工作，但我看不出它与第一个函数有什么区别：
/// <summary>
/// Multiplies two matrices on the default GPU, using array-typed device
/// allocations obtained from <c>Gpu.Default.Allocate</c>.
/// </summary>
/// <param name="m1">Left-hand matrix (h rows, l columns).</param>
/// <param name="m2">Right-hand matrix (l rows, w columns).</param>
/// <returns>The h x w product matrix, copied back to host memory.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The column count of <paramref name="m1"/> differs from the row count of <paramref name="m2"/>.
/// </exception>
public static double[,] MultiplyGpuManaged([NotNull] this double[,] m1, [NotNull] double[,] m2)
{
    // Checks (the two-argument constructor is used so the text ends up in Message
    // instead of being interpreted as the parameter name)
    if (m1.GetLength(1) != m2.GetLength(0))
    {
        throw new ArgumentOutOfRangeException(nameof(m2), "Invalid matrices sizes");
    }

    // Initialize the parameters
    int h = m1.GetLength(0);
    int w = m2.GetLength(1);
    int l = m1.GetLength(1);

    // Declared before the try block so that the finally block can release
    // exactly those buffers that were successfully allocated
    double[,]
        m1_gpu = null,
        m2_gpu = null,
        mresult_gpu = null;
    try
    {
        m1_gpu = Gpu.Default.Allocate(m1);
        m2_gpu = Gpu.Default.Allocate(m2);
        mresult_gpu = Gpu.Default.Allocate<double>(h, w);

        // Execute the multiplication in parallel, one iteration per result cell
        Gpu.Default.For(0, h * w, index =>
        {
            // Calculate the current indexes from the flattened cell index
            int
                i = index / w,
                j = index % w;

            // Perform the multiplication (dot product of row i and column j)
            double sum = 0;
            for (int k = 0; k < l; k++)
            {
                sum += m1_gpu[i, k] * m2_gpu[k, j];
            }
            mresult_gpu[i, j] = sum;
        });

        // Copy the result back to the host
        return Gpu.CopyToHost(mresult_gpu);
    }
    finally
    {
        // Free the device memory even when the kernel launch or the copy throws
        if (m1_gpu != null) Gpu.Free(m1_gpu);
        if (m2_gpu != null) Gpu.Free(m2_gpu);
        if (mresult_gpu != null) Gpu.Free(mresult_gpu);
    }
}

public static double[，]MultiplyGpuManaged（[NotNull]此double[，]m1[NotNull]double[，]m2）
{
//检查
如果（m1.GetLength（1）！=m2.GetLength（0））抛出新ArgumentOutOfRangeException（“无效矩阵大小”）；
//初始化参数和结果矩阵
inth=m1.GetLength（0）；
int w=m2.GetLength（1）；
int l=m1.GetLength（1）；
双[，]
m1gpu=gpu.Default.Allocate（m1），
m2\gpu=gpu.Default.Allocate（m2），
mresult_gpu=gpu.Default.Allocate（h，w）；
//并行执行乘法运算
（0，h*w，索引=>
{
//计算当前索引
int
i=指数/w，
j=指数%w；
//执行乘法运算
双和=0；
对于（int k=0；k

这个版本同样可以正常工作；我额外做了这个测试，用来检查我是否在第一个函数中把索引弄错了（显然索引没有问题）：
public static double[，]MultiplyOnCPU（[NotNull]此double[，]m1[NotNull]double[，]m2）
{
//检查
如果（m1.GetLength（1）！=m2.GetLength（0））抛出新ArgumentOutOfRangeException（“无效矩阵大小”）；
//初始化参数和结果矩阵
inth=m1.GetLength（0）；
int w=m2.GetLength（1）；
int l=m1.GetLength（1）；
双精度[，]结果=新的双精度[h，w]；
对于（0，h*w，索引=>
{
不安全的
{
固定（双*压力=结果，pm1=m1，pm2=m2）
{
//计算当前索引
int
i=指数/w，
j=指数%w；
//执行乘法运算
双和=0；
int im1=i*l；
对于（int k=0；k

我真的不明白我在第一种方法中遗漏了什么，我也不明白为什么它不起作用
提前感谢您的帮助
事实证明，这个问题是由 GPU 分配二维数组的方式引起的——它不像标准 .NET 数组那样使用单个连续的内存块，而是出于性能原因在每行末尾添加了一些填充（padding）。
寻址二维 GPU 数组的正确方法是使用间距（pitch），它表示每一行的有效宽度（列数 + 填充）。
下面是一个可运行的代码示例，它只是填充一个二维 GPU 数组并将其复制回主机：
// Fills a size x size device matrix so that cell (row, col) holds row * size + col,
// then copies it back to the host. Rows are addressed with the allocation's pitch
// (row stride in elements) rather than the column count, because, per the
// explanation accompanying this snippet, 2D device allocations may pad each row.
const int size = 10;
double[,] matrix_gpu;
using (DeviceMemory2D<double> deviceMatrix = Gpu.Default.AllocateDevice<double>(size, size))
{
    // Row stride reported by the allocation, expressed in elements
    int rowStride = deviceMatrix.PitchInElements.ToInt32();
    deviceptr<double> cells = deviceMatrix.Ptr;

    // One parallel iteration per row; each iteration writes that row's cells
    Gpu.Default.For(0, size, row =>
    {
        int rowStart = row * rowStride;
        for (int col = 0; col < size; col++)
        {
            cells[rowStart + col] = row * size + col;
        }
    });

    // Copy the device matrix back into a regular host array
    matrix_gpu = Gpu.Copy2DToHost(deviceMatrix);
}

const int size=10；
双[，]矩阵gpu；
使用（DeviceMemory2D MU gpu=gpu.Default.AllocateDevice（大小，大小））
{
deviceptr ptr=m_gpu.ptr；
int pitch=MGPU.PitchInElements.ToInt32（）；
对于（0，大小，i=>
{
对于（int j=0；j

/// <summary>
/// Multiplies two matrices on the CPU in parallel, addressing all three matrices
/// through flat (row-major) pointer offsets.
/// </summary>
/// <param name="m1">Left-hand matrix (h rows, l columns).</param>
/// <param name="m2">Right-hand matrix (l rows, w columns).</param>
/// <returns>A new h x w matrix holding the product.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The column count of <paramref name="m1"/> differs from the row count of <paramref name="m2"/>.
/// </exception>
public static double[,] MultiplyOnCPU([NotNull] this double[,] m1, [NotNull] double[,] m2)
{
    // Checks (the two-argument constructor is used so the text ends up in Message
    // instead of being interpreted as the parameter name)
    if (m1.GetLength(1) != m2.GetLength(0))
    {
        throw new ArgumentOutOfRangeException(nameof(m2), "Invalid matrices sizes");
    }

    // Initialize the parameters and the result matrix
    int h = m1.GetLength(0);
    int w = m2.GetLength(1);
    int l = m1.GetLength(1);
    double[,] result = new double[h, w];

    // One parallel iteration per result cell
    Parallel.For(0, h * w, index =>
    {
        unsafe
        {
            // The arrays are pinned inside the lambda: pointer locals cannot be
            // captured by an anonymous method, so they must be created here
            fixed (double* presult = result, pm1 = m1, pm2 = m2)
            {
                // Calculate the current indexes from the flattened cell index
                int
                    i = index / w,
                    j = index % w;

                // Perform the multiplication (dot product of row i and column j)
                double sum = 0;
                int im1 = i * l; // offset of the first element of row i in m1
                for (int k = 0; k < l; k++)
                {
                    sum += pm1[im1 + k] * pm2[k * w + j];
                }
                presult[i * w + j] = sum;
            }
        }
    });
    return result;
}

// Fills a size x size device matrix so that cell (row, col) holds row * size + col,
// then copies it back to the host. Rows are addressed with the allocation's pitch
// (row stride in elements) rather than the column count, because, per the
// explanation accompanying this snippet, 2D device allocations may pad each row.
const int size = 10;
double[,] matrix_gpu;
using (DeviceMemory2D<double> deviceMatrix = Gpu.Default.AllocateDevice<double>(size, size))
{
    // Row stride reported by the allocation, expressed in elements
    int rowStride = deviceMatrix.PitchInElements.ToInt32();
    deviceptr<double> cells = deviceMatrix.Ptr;

    // One parallel iteration per row; each iteration writes that row's cells
    Gpu.Default.For(0, size, row =>
    {
        int rowStart = row * rowStride;
        for (int col = 0; col < size; col++)
        {
            cells[rowStart + col] = row * size + col;
        }
    });

    // Copy the device matrix back into a regular host array
    matrix_gpu = Gpu.Copy2DToHost(deviceMatrix);
}