cuda-directx 12纹理2D(在1D阵列中)互操作

cuda-directx 12纹理2D(在1D阵列中)互操作,cuda,directx-12,Cuda,Directx 12,我正在尝试在cuda中更新directx12中使用的纹理。我可能会错过一些东西,但我没有关于它的提示 在图像的右上角区域有一个“始终黑色”区域 只有当所有像素的rgb值都相同时,我才能得到预期的结果(对第一个问题进行模化),否则我会得到意外的伪影,就像数组没有预期的结构一样 我错过了什么 以下是纹理的创建过程: { TextureWidth = m_width; TextureHeight = m_height; auto nPixels = TextureWidth *

我正在尝试在CUDA中更新一个供DirectX 12使用的纹理。我可能遗漏了什么,但目前毫无头绪。

  • 在图像的右上角区域有一个“始终黑色”区域
  • 只有当所有像素的RGB值都相同时,我才能得到预期的结果(撇开第一个问题不谈);否则就会出现意外的伪影,就好像数组并不具有我所预期的结构一样
  • 我错过了什么

    以下是纹理的创建过程:

    {
        // Creates a 2D float RGB texture in a shareable default heap, creates an SRV for it,
        // imports the same allocation into CUDA as external memory, maps it as a device
        // buffer and runs the update kernel once on m_streamToRun.
        TextureWidth = m_width;
        TextureHeight = m_height;
        // Byte size of the image when it is addressed as rows of 3 floats per pixel.
        // Computed in size_t so large textures cannot overflow a narrower integer type.
        const size_t floatCount = static_cast<size_t>(TextureWidth) * TextureHeight * 3;
        const size_t pixelBufferSize = sizeof(float) * floatCount;

        D3D12_RESOURCE_DESC textureDesc{};
        textureDesc.MipLevels = 1;
        textureDesc.Format = DXGI_FORMAT_R32G32B32_FLOAT;
        textureDesc.Width = TextureWidth;
        textureDesc.Height = TextureHeight;
        textureDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
        textureDesc.DepthOrArraySize = 1;
        textureDesc.SampleDesc.Count = 1;
        textureDesc.SampleDesc.Quality = 0;
        textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;

        // Use a named local: taking the address of a temporary is a non-standard
        // compiler extension and is rejected in conforming mode.
        const CD3DX12_HEAP_PROPERTIES defaultHeapProperties(D3D12_HEAP_TYPE_DEFAULT);
        ThrowIfFailed(m_device->CreateCommittedResource(&defaultHeapProperties, D3D12_HEAP_FLAG_SHARED,
            &textureDesc, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, nullptr, IID_PPV_ARGS(&m_textureBuffer)));
        NAME_D3D12_OBJECT(m_textureBuffer);

        // Describe and create a SRV for the texture.
        {
            D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc{};
            srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
            srvDesc.Format = textureDesc.Format;
            srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
            srvDesc.Texture2D.MipLevels = 1;
            m_device->CreateShaderResourceView(m_textureBuffer.Get(), &srvDesc, m_srvHeap->GetCPUDescriptorHandleForHeapStart());
            NAME_D3D12_OBJECT(m_srvHeap);
        }

        // Share m_textureBuffer with cuda
        {
            HANDLE sharedHandle{};
            WindowsSecurityAttributes windowsSecurityAttributes{};
            LPCWSTR name{};
            ThrowIfFailed(m_device->CreateSharedHandle(m_textureBuffer.Get(), &windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle));

            // Query the allocation size from the texture's own description. The resource
            // being imported is the texture, so a buffer description of the packed pixel
            // size does not describe its real footprint.
            const D3D12_RESOURCE_ALLOCATION_INFO d3d12ResourceAllocationInfo =
                m_device->GetResourceAllocationInfo(m_nodeMask, 1, &textureDesc);
            const auto actualSize = d3d12ResourceAllocationInfo.SizeInBytes;

            cudaExternalMemoryHandleDesc externalMemoryHandleDesc{};
            externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
            externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
            externalMemoryHandleDesc.size = actualSize;
            externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;

            checkCudaErrors(cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc));

            // CUDA does not take ownership of the NT handle. It holds its own reference
            // to the resource and must be closed explicitly once the import is done.
            CloseHandle(sharedHandle);

            // NOTE(review): the texture memory is mapped below as a plain linear buffer.
            // That only yields the expected image if the texture is stored row by row,
            // which a D3D12 texture is not guaranteed to be. Confirm before relying on it.
            cudaExternalMemoryBufferDesc externalMemoryBufferDesc{};
            externalMemoryBufferDesc.offset = 0;
            externalMemoryBufferDesc.size = pixelBufferSize;
            externalMemoryBufferDesc.flags = 0;

            checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&m_cudaDevVertptr, m_externalMemory, &externalMemoryBufferDesc));
            RunKernel(TextureWidth, TextureHeight, (float*)m_cudaDevVertptr, m_streamToRun, 1.0f);
            // Block until the kernel has finished so launch-time faults surface here.
            checkCudaErrors(cudaStreamSynchronize(m_streamToRun));
        }
    }
    
    这里是更新此纹理的cuda代码:

    // Integer division of a by b whose quotient is bumped by one whenever a remainder is left over.
    int iDivUp(int a, int b)
    {
        int quotient = a / b;
        if (a % b != 0)
        {
            ++quotient;
        }
        return quotient;
    }
    
    // Writes one RGB pixel per thread into `pixels`, addressed as rows of `width`
    // pixels with 3 consecutive floats (red, green, blue) per pixel.
    //
    // Launch layout: 2D grid of 2D blocks, one thread per pixel; threads whose
    // coordinates fall outside width x height write nothing.
    //
    // pixels - destination buffer, indexed as (y * width + x) * 3 + channel.
    // width  - image width in pixels.
    // height - image height in pixels.
    // time   - animation parameter; drives the red level of the odd column bands.
    __global__ void TextureKernel(float *pixels, unsigned int width, unsigned int height, float time)
    {
        const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
        const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

        // Nothing to do for threads that overshoot the image.
        if (row >= height || col >= width)
            return;

        const unsigned int base = (row * width + col) * 3;

        // Columns are grouped into bands 32 pixels wide: even bands are full red,
        // odd bands take a red level derived from sin(time).
        float red = 1.0f;
        if ((col / 32) % 2 != 0)
            red = __sinf(time) * 0.1f + 0.10f;

        pixels[base + 0] = red;  // RED
        pixels[base + 1] = 0;    // GREEN
        pixels[base + 2] = 0;    // BLUE
    }
    
    // Host-side launcher for TextureKernel: covers a meshWidth x meshHeight image with
    // 32 x 32 thread blocks on `streamToRun`, then reports any launch error through
    // getLastCudaError. The stream is not synchronized here.
    void RunKernel(size_t meshWidth, size_t meshHeight, float *texture_dev, cudaStream_t streamToRun, float animTime)
    {
        const int blockEdge = 32;
        const dim3 blockShape(blockEdge, blockEdge);

        // Round the block count up so partially covered edges still get a block.
        const int blocksAcross = iDivUp(meshWidth, blockEdge);
        const int blocksDown = iDivUp(meshHeight, blockEdge);
        const dim3 gridShape(blocksAcross, blocksDown);

        TextureKernel<<<gridShape, blockShape, 0, streamToRun>>>(texture_dev, meshWidth, meshHeight, animTime);
        getLastCudaError("TextureKernel execution failed.\n");
    }
    

    您假设具有三个float类型通道的2D纹理图像采用简单的行线性(row-linear)内存布局。正如您的结果所表明的,这一假设通常并不成立。

    纹理针对具有空间一致性的访问进行了优化。它们的内存布局旨在让n维纹理空间中相邻的数据在内存中也保持相邻;而对于任何多于一维的对象,简单的行主序(row-major)内存布局都做不到这一点。特定纹理图像的确切内存布局通常不是您可以假定已知或加以依赖的,它取决于您所使用的GPU(通常,数据会以分块(tiling)、填充(padding)等方式存储,以保持内容对齐)。


    正如您自己已经注意到的,您需要做的是把一个CUDA数组(CUDA数组是CUDA中与纹理图像相对应的概念)映射到来自D3D12的外部内存上。此CUDA数组的格式必须与在D3D12中创建的纹理的格式一致。之后,您就可以使用CUDA运行时API的纹理函数或曲面(surface)函数来访问此CUDA数组所表示的纹理图像……

    您假设具有三个float类型通道的2D纹理图像将具有简单的行线性内存布局。正如您的结果所表明的,这通常是不正确的

    纹理针对具有空间一致性的访问进行了优化。它们的内存布局旨在让n维纹理空间中相邻的数据在内存中也保持相邻;而对于任何多于一维的对象,简单的行主序(row-major)内存布局都做不到这一点。特定纹理图像的确切内存布局通常不是您可以假定已知或加以依赖的,它取决于您所使用的GPU(通常,数据会以分块(tiling)、填充(padding)等方式存储,以保持内容对齐)。


    正如您自己已经注意到的,您需要做的是把一个CUDA数组(CUDA数组是CUDA中与纹理图像相对应的概念)映射到来自D3D12的外部内存上。此CUDA数组的格式必须与在D3D12中创建的纹理的格式一致。之后,您就可以使用CUDA运行时API的纹理函数或曲面(surface)函数来访问此CUDA数组所表示的纹理图像……

    正确的做法是:先将纹理作为外部内存导入,再将其映射为mipmap数组,然后用该数组创建CUDA曲面(surface)对象,最后在CUDA内核中修改这个曲面。

    导入和映射是通过以下方式完成的:

    // Map the imported external memory (m_externalMemory) as a one-level CUDA
    // mipmapped array and wrap its only level in a surface object for kernel writes.
    cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
    // A depth of 0 in the extent describes a 2D array.
    cuExtmemMipDesc.extent = make_cudaExtent(texDesc.Width, texDesc.Height, 0);
    // Four 32-bit float channels per texel.
    // NOTE(review): this must match the format of the D3D12 texture; confirm against the texture description.
    cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
    cuExtmemMipDesc.numLevels = 1;
    // Required so the array can be bound to a surface object and written from kernels.
    cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore;

    cudaMipmappedArray_t cuMipArray{};
    checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));

    // Level 0 is the full-resolution image, and the only level requested above.
    cudaArray_t cuArray{};
    checkCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));

    cudaResourceDesc cuResDesc{};
    cuResDesc.resType = cudaResourceTypeArray;
    cuResDesc.res.array.array = cuArray;
    checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));
    // where cudaSurfaceObject_t cuSurface{};
    
    cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
    cuExtmemMipDesc.extent=make_cudaExtent(texDesc.Width,texDesc.Height,0);
    cuExtmemMipDesc.formatDesc=cudaCreateChannelDesc<float4>();
    cuExtmemMipDesc.numLevels=1;
    cuExtmemMipDesc.flags=cudaArraySurfaceLoadStore;
    cudaMipmappedArray_t cuMipArray{};
    checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray,m_externalMemory,&cuExtmemMipDesc));
    cudaArray_t cuArray{};
    checkCudaErrors(cudaGetMipmappedArrayLevel(&cuArray,cuMipArray,0));
    cudaResourceDesc cuResDesc{};
    cuResDesc.resType=cudaResourceTypeArray;
    cuResDesc.res.array.array=cuArray;
    checkCudaErrors(cudaCreateSurfaceObject(&cuSurface,&cuResDesc));
    //其中 cudaSurfaceObject_t cuSurface{};
    
    cuda部分如下所示:

    // Divides a by b and adds one to the quotient when the division leaves a remainder.
    int iDivUp(int a, int b)
    {
        const bool hasRemainder = (a % b) != 0;
        const int quotient = a / b;
        if (hasRemainder)
            return quotient + 1;
        return quotient;
    }
    
    // Paints a time-animated test pattern into a 2D surface, one float4 texel per thread.
    //
    // Launch layout: 2D grid of 2D blocks covering at least width x height threads;
    // threads outside the surface return immediately, so the grid may overshoot.
    //
    // surf   - surface object written through surf2Dwrite.
    // width  - surface width in texels.
    // height - surface height in texels.
    // time   - animation parameter fed to the cosine terms.
    __global__ void UpdateSurface(cudaSurfaceObject_t surf, unsigned int width, unsigned int height, float time)
    {
        const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (y >= height || x >= width) return;

        // Normalised texel coordinates.
        const float xVar = (float)x / (float)width;
        const float yVar = (float)y / (float)height;

        // Animated intensities. All literals are float so that no arithmetic is
        // silently promoted to double on the device.
        const float costx = __cosf(time) * 0.5f + xVar;
        const float costxMany = __cosf(y * time) * 0.5f + yVar;
        const float costyMany = __cosf((float)x / 100.0f * time) * 0.5f + xVar;
        const unsigned int margin = 1;

        float4 pixel{};
        if (y == 0) // paint the first row
            pixel = make_float4(costyMany * 0.3f, costyMany, costyMany * 0.4f, 1.0f);
        else if (y == height - 1) // paint the last row
            pixel = make_float4(costyMany * 0.6f, costyMany * 0.7f, costyMany, 1.0f);
        else if (x % 5 == 0) // paint a column of 1 pixel wide every 5 pixels
        {
            if (x > width / 2) // a certain color for the right half
                pixel = make_float4(0.1f, 0.5f, costx, 1.0f);
            else // another color for the left half
                pixel = make_float4(costx, 0.1f, 0.2f, 1.0f);
        }
        else if (x > width - margin - 1 || x <= margin) // border columns: x <= margin on the left, x > width - margin - 1 on the right
            pixel = make_float4(costxMany, costxMany * 0.9f, costxMany * 0.6f, 1.0f);
        else // all the rest of the texture
            pixel = make_float4(costx * 0.3f, costx * 0.4f, costx * 0.6f, 1.0f);

        // surf2Dwrite takes the x coordinate in bytes, hence the scaling by the texel size.
        surf2Dwrite(pixel, surf, (int)(x * sizeof(float4)), (int)y);
    }
    
    // Launches UpdateSurface over a textureW x textureH surface on `streamToRun` and
    // reports any launch error through getLastCudaError. The stream is not
    // synchronized here.
    //
    // textureW, textureH - surface size in texels.
    // surfaceObject      - surface the kernel writes to.
    // streamToRun        - stream the kernel is enqueued on.
    // animTime           - animation parameter forwarded to the kernel.
    void RunKernel(size_t textureW, size_t textureH, cudaSurfaceObject_t surfaceObject, cudaStream_t streamToRun, float animTime)
    {
        // A grid dimension of zero is an invalid launch configuration, so an empty
        // surface is treated as "nothing to draw".
        if (textureW == 0 || textureH == 0) return;

        // 16 x 16 = 256 threads per block: a multiple of the 32-thread warp size, so
        // full blocks leave no lanes idle. The kernel bounds-checks every thread,
        // which makes any block shape correct.
        constexpr int blockEdge = 16;
        const dim3 threads(blockEdge, blockEdge);
        const dim3 grid(iDivUp(static_cast<int>(textureW), blockEdge), iDivUp(static_cast<int>(textureH), blockEdge));
        UpdateSurface <<<grid, threads, 0, streamToRun >>> (surfaceObject, static_cast<unsigned int>(textureW), static_cast<unsigned int>(textureH), animTime);
        getLastCudaError("UpdateSurface execution failed.\n");
    }
    
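    int iDivUp(int a, int b) { return a % b != 0 ? a / b + 1 : a / b; }
    __global__ void UpdateSurface(cudaSurfaceObject_t surf, unsigned int width, unsigned int height, float time)
    {
        unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (y >= height | x >= width) return;
        auto xVar = (float)x / (float)width;
        auto yVar = (float)y / (float)height;
        auto cost = __cosf(time) * 0.5f + 0.5f;
        auto costx = __cosf(time) * 0.5f + xVar;
        auto costy = __cosf(time) * 0.5f + yVar;
        auto costxx = (__cosf(time) * 0.5f + 0.5f) * width;
        auto costyy = (__cosf(time) * 0.5f + 0.5f) * height;
        auto costxMany = __cosf(y * time) * 0.5f + yVar;
        auto costyMany = __cosf((float)x/100 * time) * 0.5f + xVar;
        auto margin = 1;
        float4 pixel{};
        if (y == 0) //绘制第一行
            pixel = make_float4(costyMany * 0.3, costyMany * 1, costyMany * 0.4, 1);
        else if (y == height - 1) //绘制最后一行
            pixel = make_float4(costyMany * 0.6, costyMany * 0.7, costyMany * 1, 1);
        else if (x % 5 == 0) //每5个像素绘制一个1像素宽的列
        {
            if (x > width / 2) //右半部分的特定颜色
                pixel = make_float4(0.1, 0.5, costx * 1, 1);
            else //左半部分的另一种颜色
                pixel = make_float4(costx * 1, 0.1, 0.2, 1);
        }
        else if (x > width - margin - 1 | x <= margin) //第一列和最后一列
            pixel = make_float4(costxMany, costxMany * 0.9, costxMany * 0.6, 1);
        else //纹理的其余部分
            pixel = make_float4(costx * 0.3, costx * 0.4, costx * 0.6, 1);
        surf2Dwrite(pixel, surf, x * 16, y);
    }
    void RunKernel(size_t textureW, size_t textureH, cudaSurfaceObject_t surfaceObject, cudaStream_t streamToRun, float animTime)
    {
        auto unit = 10;
        dim3 threads(unit, unit);
        dim3 grid(iDivUp(textureW, unit), iDivUp(textureH, unit));
        UpdateSurface <<<grid, threads, 0, streamToRun >>> (surfaceObject, textureW, textureH, animTime);
        getLastCudaError("UpdateSurface execution failed.\n");
    }

    正确的做法是:先将纹理作为外部内存导入,再将其映射为mipmap数组,然后用该数组创建CUDA曲面(surface)对象,最后在CUDA内核中修改这个曲面。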

    导入和映射是通过以下方式完成的:

    // Map the imported external memory (m_externalMemory) as a one-level CUDA
    // mipmapped array and wrap its only level in a surface object for kernel writes.
    cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
    // A depth of 0 in the extent describes a 2D array.
    cuExtmemMipDesc.extent = make_cudaExtent(texDesc.Width, texDesc.Height, 0);
    // Four 32-bit float channels per texel.
    // NOTE(review): this must match the format of the D3D12 texture; confirm against the texture description.
    cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
    cuExtmemMipDesc.numLevels = 1;
    // Required so the array can be bound to a surface object and written from kernels.
    cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore;

    cudaMipmappedArray_t cuMipArray{};
    checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));

    // Level 0 is the full-resolution image, and the only level requested above.
    cudaArray_t cuArray{};
    checkCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));

    cudaResourceDesc cuResDesc{};
    cuResDesc.resType = cudaResourceTypeArray;
    cuResDesc.res.array.array = cuArray;
    checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));
    // where cudaSurfaceObject_t cuSurface{};
    
    cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
    cuExtmemMipDesc.extent=make_cudaExtent(texDesc.Width,texDesc.Height,0);
    cuExtmemMipDesc.formatDesc=cudaCreateChannelDesc<float4>();
    cuExtmemMipDesc.numLevels=1;
    cuExtmemMipDesc.flags=cudaArraySurfaceLoadStore;
    cudaMipmappedArray_t cuMipArray{};
    检查CUDAERRORS(CUDAEXTERNALMEMORYGETMAPPEDMIPPEDRARRAY(&cuMipArray,m_外部内存,