cuda-directx 12纹理2D（在1D阵列中）互操作_Cuda_Directx 12

cuda-directx 12纹理2D（在1D阵列中）互操作

cuda

cuda-directx 12纹理2D（在1D阵列中）互操作,cuda,directx-12,Cuda,Directx 12,我正在尝试在cuda中更新directx12中使用的纹理。我可能会错过一些东西，但我没有关于它的提示在图像的右上角区域有一个“始终黑色”区域只有当所有像素的rgb值都相同时，我才能得到预期的结果（对第一个问题进行模化），否则我会得到意外的伪影，就像数组没有预期的结构一样我错过了什么以下是纹理的创建过程： { TextureWidth = m_width; TextureHeight = m_height; auto nPixels = TextureWidth *

我正在尝试用CUDA更新一个在DirectX 12中使用的纹理。我可能遗漏了什么，但目前找不到任何线索。

在图像的右上角区域有一个“始终黑色”区域

只有当所有像素的RGB值都相同时，我才能得到预期的结果（第一个问题除外）；否则会出现意外的伪影，就好像这个数组并不具有我所预期的内存结构一样

我遗漏了什么？

以下是纹理的创建过程：

// Creates an m_width x m_height three-channel float 2D texture in a shared default heap,
// creates a shader resource view for it, then imports the resource into CUDA as external
// memory, maps it as a flat device buffer and runs the CUDA kernel on it once.
{
    TextureWidth = m_width;
    TextureHeight = m_height;
    // Despite the name, nPixels is the number of floats: 3 channels per pixel.
    auto nPixels = TextureWidth * TextureHeight * 3;
    auto pixelBufferSize = sizeof(float)* nPixels;

    // textureDesc is value-initialized, so every field not set below (including Layout) stays zero.
    D3D12_RESOURCE_DESC textureDesc{};
    textureDesc.MipLevels = 1;
    textureDesc.Format = DXGI_FORMAT_R32G32B32_FLOAT;
    textureDesc.Width = TextureWidth;
    textureDesc.Height = TextureHeight;
    textureDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
    textureDesc.DepthOrArraySize = 1;
    textureDesc.SampleDesc.Count = 1;
    textureDesc.SampleDesc.Quality = 0;
    textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;

    // The resource is created with D3D12_HEAP_FLAG_SHARED so that a shared handle can be created for it below.
    ThrowIfFailed(m_device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), D3D12_HEAP_FLAG_SHARED,
        &textureDesc, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, nullptr, IID_PPV_ARGS(&m_textureBuffer)));
    NAME_D3D12_OBJECT(m_textureBuffer);

    // Describe and create a SRV for the texture.
    {
        D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc{};
        srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
        srvDesc.Format = textureDesc.Format;
        srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
        srvDesc.Texture2D.MipLevels = 1;
        // The view is written into the first descriptor slot of m_srvHeap.
        m_device->CreateShaderResourceView(m_textureBuffer.Get(), &srvDesc, m_srvHeap->GetCPUDescriptorHandleForHeapStart());
        NAME_D3D12_OBJECT(m_srvHeap);
    }

    // Share m_textureBuffer with cuda
    {
        HANDLE sharedHandle{};
        WindowsSecurityAttributes windowsSecurityAttributes{};
        LPCWSTR name{};
        // NOTE(review): sharedHandle is never closed inside this block — confirm whether
        // CloseHandle is expected once the import below has succeeded.
        ThrowIfFailed(m_device->CreateSharedHandle(m_textureBuffer.Get(), &windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle));

        // NOTE(review): the allocation size is queried for a *buffer* of pixelBufferSize bytes,
        // not for textureDesc; the texture's real allocation size may differ — confirm.
        D3D12_RESOURCE_ALLOCATION_INFO d3d12ResourceAllocationInfo;
        d3d12ResourceAllocationInfo = m_device->GetResourceAllocationInfo(m_nodeMask, 1, &CD3DX12_RESOURCE_DESC::Buffer(pixelBufferSize));
        auto actualSize = d3d12ResourceAllocationInfo.SizeInBytes;

        // Import the D3D12 resource behind sharedHandle as a dedicated CUDA external memory object.
        cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
        memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
        externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
        externalMemoryHandleDesc.size = actualSize;
        externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;

        checkCudaErrors(cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc));

        // Map the first pixelBufferSize bytes of the imported memory as a flat device buffer.
        cudaExternalMemoryBufferDesc externalMemoryBufferDesc;
        memset(&externalMemoryBufferDesc, 0, sizeof(externalMemoryBufferDesc));
        externalMemoryBufferDesc.offset = 0;
        externalMemoryBufferDesc.size = pixelBufferSize;
        externalMemoryBufferDesc.flags = 0;

        // NOTE(review): the mapped pointer is handed to RunKernel as a plain float array, while the
        // resource is a TEXTURE2D whose Layout field was left at zero; its in-memory layout is
        // presumably not row-linear — confirm (this matches the artifacts described in the question).
        checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&m_cudaDevVertptr, m_externalMemory, &externalMemoryBufferDesc));
        RunKernel(TextureWidth, TextureHeight, (float*)m_cudaDevVertptr, m_streamToRun, 1.0f);
        // Block the host until the kernel queued on m_streamToRun has finished.
        checkCudaErrors(cudaStreamSynchronize(m_streamToRun));
    }
}

这里是更新此纹理的cuda代码：

// Integer division of a by b, bumped up by one whenever there is a remainder.
// For positive operands this is ceil(a / b).
int iDivUp(int a, int b)
{
    return a / b + (a % b != 0 ? 1 : 0);
}

// Writes one RGB pixel per thread into `pixels`, addressed as a packed row-major array of
// three floats per pixel. Expects a 2D launch covering at least width x height threads;
// threads outside the image do nothing.
//   pixels - destination array, indexed as (y * width + x) * 3 + channel
//   width  - image width in pixels
//   height - image height in pixels
//   time   - animation parameter driving the red level of the odd stripes
__global__ void TextureKernel(float *pixels, unsigned int width, unsigned int height, float time)
{
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads in the tail of the grid fall outside the image.
    if (row >= height || col >= width)
        return;

    const unsigned int base = (row * width + col) * 3;

    // Vertical stripes 32 pixels wide: even stripes are full red, odd stripes follow sin(time).
    const float pulse = __sinf(time) * 0.1f + 0.10f;
    const bool evenStripe = ((col / 32) % 2) == 0;

    pixels[base + 0] = evenStripe ? 1.0f : pulse; // red
    pixels[base + 1] = 0;                         // green
    pixels[base + 2] = 0;                         // blue
}

// Launches TextureKernel on streamToRun with 32x32 thread blocks and enough blocks to
// cover a meshWidth x meshHeight image. The launch is asynchronous; only launch errors
// are checked here.
void RunKernel(size_t meshWidth, size_t meshHeight, float *texture_dev, cudaStream_t streamToRun, float animTime)
{
    const int tile = 32;
    const dim3 block(tile, tile);
    const dim3 blocks(iDivUp(meshWidth, tile), iDivUp(meshHeight, tile));

    TextureKernel<<<blocks, block, 0, streamToRun>>>(texture_dev, meshWidth, meshHeight, animTime);
    getLastCudaError("TextureKernel execution failed.\n");
}

您假设具有三个float类型通道的2D纹理图像具有简单的行线性内存布局。正如您的结果所表明的，这一假设通常并不成立

纹理经过优化以实现空间一致性访问。他们的内存布局设计为使在n维纹理空间中接近的东西在内存中保持接近。通过简单的行主内存布局，任何具有多个维度的对象都无法实现这一点。特定纹理图像的确切内存布局通常不是您可以假定知道或依赖的。它将取决于您使用的GPU（通常，数据将以某种方式存储，使用诸如平铺或填充的方式来保持内容对齐）

正如您自己已经注意到的，您需要做的是把一个CUDA数组（CUDA数组相当于CUDA中的纹理图像）映射到来自D3D12的外部数据上。这个CUDA数组的格式必须与在D3D12中创建的纹理的格式相匹配。之后，您就可以使用CUDA运行时API的纹理函数或surface函数来访问该CUDA数组所表示的纹理图像…

您假设具有三个float类型通道的2D纹理图像将具有简单的行线性内存布局。正如您的结果所表明的，这通常是不正确的

正如您自己已经注意到的，您需要做的是把一个CUDA数组（CUDA数组相当于CUDA中的纹理图像）映射到来自D3D12的外部数据上。这个CUDA数组的格式必须与在D3D12中创建的纹理的格式相匹配。之后，您就可以使用CUDA运行时API的纹理函数或surface函数来访问该CUDA数组所表示的纹理图像…

正确的做法是：先将纹理作为外部内存导入，再把它映射为mipmap数组，然后用该数组创建CUDA surface（曲面）对象，最后在CUDA内核中修改这个surface

导入和映射是通过以下方式完成的：

// Map the imported external memory (m_externalMemory) as a single-level CUDA mipmapped
// array, take level 0 as a plain CUDA array, and wrap it in a surface object that a
// kernel can write to.
cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
// A depth of 0 describes a 2D array.
cuExtmemMipDesc.extent = make_cudaExtent(texDesc.Width, texDesc.Height, 0);
// Four 32-bit float channels per texel.
// NOTE(review): presumably texDesc.Format must be the matching four-channel float
// format (DXGI_FORMAT_R32G32B32A32_FLOAT) — confirm against the texture creation code.
cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
cuExtmemMipDesc.numLevels = 1;
// Needed so the array can back a surface object (surface load/store access).
cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore;

cudaMipmappedArray_t cuMipArray{};
checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));

// The texture has a single mip level; fetch it as a cudaArray_t.
cudaArray_t cuArray{};
checkCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));

cudaResourceDesc cuResDesc{};
cuResDesc.resType = cudaResourceTypeArray;
cuResDesc.res.array.array = cuArray;
checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));
// where cudaSurfaceObject_t cuSurface{};

cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
cuExtmemMipDesc.extent = make_cudaExtent(texDesc.Width, texDesc.Height, 0);
cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
cuExtmemMipDesc.numLevels = 1;
cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore;
cudaMipmappedArray_t cuMipArray{};
checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));
cudaArray_t cuArray{};
checkCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));
cudaResourceDesc cuResDesc{};
cuResDesc.resType = cudaResourceTypeArray;
cuResDesc.res.array.array = cuArray;
checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));
// where cudaSurfaceObject_t cuSurface{};

cuda部分如下所示：

// Returns a / b, plus one when the division leaves a remainder.
// For positive operands this is ceil(a / b).
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0)
        return quotient;
    return quotient + 1;
}

// Paints an animated test pattern into a 2D surface of float4 texels, one thread per texel.
// Expects a 2D launch covering at least width x height threads; threads outside the
// surface return immediately.
//   surf   - surface object to write; written as float4, so it must be backed by a
//            four-channel 32-bit float array
//   width  - surface width in texels
//   height - surface height in texels
//   time   - animation parameter
__global__ void UpdateSurface(cudaSurfaceObject_t surf, unsigned int width, unsigned int height, float time)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (y >= height || x >= width) return;

    // Texel position normalised to [0, 1).
    const float xVar = (float)x / (float)width;
    const float yVar = (float)y / (float)height;

    // All literals are float so the arithmetic stays in single precision.
    const float costx = __cosf(time) * 0.5f + xVar;
    const float costxMany = __cosf(y * time) * 0.5f + yVar;
    const float costyMany = __cosf((float)x / 100.0f * time) * 0.5f + xVar;
    const unsigned int margin = 1;

    float4 pixel{};
    if (y == 0) // paint the first row
        pixel = make_float4(costyMany * 0.3f, costyMany, costyMany * 0.4f, 1.0f);
    else if (y == height - 1) // paint the last row
        pixel = make_float4(costyMany * 0.6f, costyMany * 0.7f, costyMany, 1.0f);
    else if (x % 5 == 0) // paint a column of 1 pixel wide every 5 pixels
    {
        if (x > width / 2) // a certain color for the right half
            pixel = make_float4(0.1f, 0.5f, costx, 1.0f);
        else // another color for the left half
            pixel = make_float4(costx, 0.1f, 0.2f, 1.0f);
    }
    else if (x > width - margin - 1 || x <= margin) // first and last columns (column 0 is already taken by the stripe case above)
        pixel = make_float4(costxMany, costxMany * 0.9f, costxMany * 0.6f, 1.0f);
    else // all the rest of the texture
        pixel = make_float4(costx * 0.3f, costx * 0.4f, costx * 0.6f, 1.0f);

    // surf2Dwrite takes the x coordinate in bytes, hence the scaling by the texel size.
    surf2Dwrite(pixel, surf, x * sizeof(float4), y);
}

// Launches UpdateSurface on streamToRun over a textureW x textureH surface.
//   textureW, textureH - surface size in texels; nothing is launched if either is 0
//   surfaceObject      - surface the kernel writes to
//   streamToRun        - stream the launch is queued on (asynchronous; the caller synchronises)
//   animTime           - animation parameter forwarded to the kernel
void RunKernel(size_t textureW, size_t textureH, cudaSurfaceObject_t surfaceObject, cudaStream_t streamToRun, float animTime)
{
    // A zero-sized grid is an invalid launch configuration, so there is nothing to do.
    if (textureW == 0 || textureH == 0)
        return;

    // 16x16 = 256 threads per block: a whole number of 32-thread warps.
    const unsigned int tile = 16;
    const dim3 threads(tile, tile);
    // Ceiling division done in size_t, so the sizes are not narrowed to int first.
    const dim3 grid(static_cast<unsigned int>((textureW + tile - 1) / tile),
                    static_cast<unsigned int>((textureH + tile - 1) / tile));

    UpdateSurface<<<grid, threads, 0, streamToRun>>>(surfaceObject,
                                                     static_cast<unsigned int>(textureW),
                                                     static_cast<unsigned int>(textureH),
                                                     animTime);
    getLastCudaError("UpdateSurface execution failed.\n");
}

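int iDivUp(int a, int b) { return a % b != 0 ? a / b + 1 : a / b; }
__global__ void UpdateSurface(cudaSurfaceObject_t surf, unsigned int width, unsigned int height, float time)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (y >= height | x >= width) return;
    auto xVar = (float)x / (float)width;
    auto yVar = (float)y / (float)height;
    auto cost = __cosf(time) * 0.5f + 0.5f;
    auto costx = __cosf(time) * 0.5f + xVar;
    auto costy = __cosf(time) * 0.5f + yVar;
    auto costxx = (__cosf(time) * 0.5f + 0.5f) * width;
    auto costyy = (__cosf(time) * 0.5f + 0.5f) * height;
    auto costxMany = __cosf(y * time) * 0.5f + yVar;
    auto costyMany = __cosf((float)x/100 * time) * 0.5f + xVar;
    auto margin = 1;
    float4 pixel{};
    if (y == 0) // paint the first row
        pixel = make_float4(costyMany * 0.3, costyMany * 1, costyMany * 0.4, 1);
    else if (y == height - 1) // paint the last row
        pixel = make_float4(costyMany * 0.6, costyMany * 0.7, costyMany * 1, 1);
    else if (x % 5 == 0) // paint a column of 1 pixel wide every 5 pixels
    {
        if (x > width / 2) // a certain color for the right half
            pixel = make_float4(0.1, 0.5, costx * 1, 1);
        else // another color for the left half
            pixel = make_float4(costx * 1, 0.1, 0.2, 1);
    }
    else if (x > width - margin - 1 | x <= margin) // first and last columns
        pixel = make_float4(costxMany, costxMany * 0.9, costxMany * 0.6, 1);
    else // all the rest of the texture
        pixel = make_float4(costx * 0.3, costx * 0.4, costx * 0.6, 1);
    surf2Dwrite(pixel, surf, x * 16, y);
}
void RunKernel(size_t textureW, size_t textureH, cudaSurfaceObject_t surfaceObject, cudaStream_t streamToRun, float animTime)
{
    auto unit = 10;
    dim3 threads(unit, unit);
    dim3 grid(iDivUp(textureW, unit), iDivUp(textureH, unit));
    UpdateSurface <<<grid, threads, 0, streamToRun >>> (surfaceObject, textureW, textureH, animTime);
    getLastCudaError("UpdateSurface execution failed.\n");
}

正确的做法是：先将纹理作为外部内存导入，再把它映射为mipmap数组，然后用该数组创建CUDA surface（曲面）对象，最后在CUDA内核中修改这个surface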
导入和映射是通过以下方式完成的：
// Map the imported external memory (m_externalMemory) as a single-level CUDA mipmapped
// array, take level 0 as a plain CUDA array, and wrap it in a surface object that a
// kernel can write to.
cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
// A depth of 0 describes a 2D array.
cuExtmemMipDesc.extent = make_cudaExtent(texDesc.Width, texDesc.Height, 0);
// Four 32-bit float channels per texel.
// NOTE(review): presumably texDesc.Format must be the matching four-channel float
// format (DXGI_FORMAT_R32G32B32A32_FLOAT) — confirm against the texture creation code.
cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
cuExtmemMipDesc.numLevels = 1;
// Needed so the array can back a surface object (surface load/store access).
cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore;

cudaMipmappedArray_t cuMipArray{};
checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));

// The texture has a single mip level; fetch it as a cudaArray_t.
cudaArray_t cuArray{};
checkCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));

cudaResourceDesc cuResDesc{};
cuResDesc.resType = cudaResourceTypeArray;
cuResDesc.res.array.array = cuArray;
checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));
// where cudaSurfaceObject_t cuSurface{};

cudaexternalemorymipmpappedarraydesc cuExtmemMipDesc{}；
cuExtmemMipDesc.extent=make_cudaExtent（texDesc.Width，texDesc.Height，0）；
cuExtmemMipDesc.formatDesc=cudaCreateChannelDesc（）；
cuExtmemMipDesc.numLevels=1；
cuExtmemMipDesc.flags=cudaArraySurfaceLoadStore；
cudaMipmappedArray_u t cuMipArray{}；
检查CUDAERRORS（CUDAEXTERNALMEMORYGETMAPPEDMIPPEDRARRAY（&cuMipArray，m_外部内存，