Cuda 从未对齐的uint8读取作为uint32数组重铸的数据-未获取所有值_Cuda_Alignment_Memory Alignment

Cuda 从未对齐的uint8读取作为uint32数组重铸的数据-未获取所有值

cuda

Cuda 从未对齐的uint8读取作为uint32数组重铸的数据-未获取所有值,cuda,alignment,memory-alignment,Cuda,Alignment,Memory Alignment,我正在尝试将uint8\u t数组转换为uint32\u t数组。然而，当我尝试这样做时，我似乎无法访问每一个连续的4字节假设我有一个8字节的uint8_t数组。我想作为一个uint32\t访问字节2->6 它们都得到相同的值*（（uint32_t*）和uint8Array[0]），*（（uint32_t*）和uint8Array[1]），*（（uint32_t*）和uint8Array[2]），*（（uint32_t*）和uint8Array[3]）而*（（uint32\u t*）&uin

我正在尝试将uint8_t数组转换为uint32_t数组来读取。然而，当我这样做时，似乎无法从任意字节偏移处读取连续的4个字节

假设我有一个8字节的uint8_t数组。我想把字节2->6作为一个uint32_t来访问

它们都得到相同的值

*((uint32_t*)&uint8Array[0])

，

*((uint32_t*)&uint8Array[1])

，

*((uint32_t*)&uint8Array[2])

，

*((uint32_t*)&uint8Array[3])

而

*((uint32_t*)&uint8Array[4])

按预期获取字节4->8

所以我似乎无法从任何地址访问4个连续字节

有什么方法可以做到这一点吗？

如果需要字节2..6，则必须组合多个对齐的加载以获得所需的内容

// Build the wanted 32-bit value from two naturally aligned 32-bit loads:
// the upper 16 bits of ptr[0] become the low half of the result and the
// lower 16 bits of ptr[1] become the high half.
// NOTE(review): this corresponds to bytes 2..5 of the buffer only under
// little-endian byte order -- confirm for the target platform.
uint32_t *ptr = ...;
uint32_t value = (ptr[0] >> 16) | (ptr[1] << 16);

uint32_t *ptr = ...;
uint32_t value = (ptr[0] >> 16) | (ptr[1] << 16);
如果需要字节2..6，则必须组合多个对齐的加载以获得所需的内容
// Build the wanted 32-bit value from two naturally aligned 32-bit loads:
// the upper 16 bits of ptr[0] become the low half of the result and the
// lower 16 bits of ptr[1] become the high half.
// NOTE(review): this corresponds to bytes 2..5 of the buffer only under
// little-endian byte order -- confirm for the target platform.
uint32_t *ptr = ...;
uint32_t value = (ptr[0] >> 16) | (ptr[1] << 16);

uint32_t *ptr = ...;
uint32_t value = (ptr[0] >> 16) | (ptr[1] << 16);
虽然CUDA中不允许未对齐的访问，但可以借助prmt PTX指令方便地在寄存器内模拟未对齐读取的效果。这可以通过少量内联PTX汇编来实现。如果您可以容忍读取越过数组末尾，代码就会变得非常简单：
// WARNING! Reads past ptr!
/*
 * Emulates an unaligned 32-bit load from `ptr` using two aligned loads.
 *
 * The address is rounded down to a multiple of 4, the 32-bit word at that
 * address and the word immediately after it are loaded, and `prmt.b32.f4e`
 * selects four bytes out of that pair, controlled by the low bits of the
 * original address.
 *
 * Both words are always loaded, even when `ptr` is already 4-byte aligned,
 * so up to 4 bytes beyond the word holding the last requested byte are
 * touched. The caller must make sure that memory is readable.
 *
 * ptr      address of the first byte to read (any alignment)
 * returns  the 32-bit value assembled by the PTX sequence below
 */
__device__ uint32_t read_unaligned(void* ptr)
{
    uint32_t result;
    // `volatile` plus the "memory" clobber tell the compiler that this
    // statement reads user memory through the pointer operand. Without them
    // the asm is treated as depending only on the pointer value and may be
    // moved across, or merged over, stores to the same buffer.
    asm volatile("{\n\t"
        "   .reg .b64    aligned_ptr;\n\t"
        "   .reg .b32    low, high, alignment;\n\t"
        "   and.b64      aligned_ptr, %1, 0xfffffffffffffffc;\n\t"  // clear the two low address bits
        "   ld.u32       low, [aligned_ptr];\n\t"                   // word containing the first byte
        "   ld.u32       high, [aligned_ptr+4];\n\t"                // following word
        "   cvt.u32.u64  alignment, %1;\n\t"                        // low 32 bits of the address
        "   prmt.b32.f4e %0, low, high, alignment;\n\t"             // byte select across the pair
        "}"
        : "=r"(result) : "l"(ptr) : "memory");
    return result;
}

为了确保越过数组末尾的访问不会造成危害，请将分配的字节数向上取整为4的倍数，然后再额外增加4个字节
上述设备代码与在允许未对齐访问的小端序（little-endian）主机上运行的以下代码具有相同的效果：
/*
 * Host-side reference for an unaligned 32-bit read.
 *
 * Reads the four bytes starting at `ptr` and assembles them in little-endian
 * order (first byte is the least significant). Going through individual
 * bytes avoids dereferencing a misaligned uint32_t pointer, which is
 * undefined behaviour in C++, and makes the result independent of the
 * host's native byte order.
 *
 * ptr      address of the first of four readable bytes (any alignment)
 * returns  bytes[0] | bytes[1] << 8 | bytes[2] << 16 | bytes[3] << 24
 */
__host__ uint32_t read_unaligned_host(void* ptr)
{
    const unsigned char* bytes = static_cast<const unsigned char*>(ptr);
    return  static_cast<uint32_t>(bytes[0])
         | (static_cast<uint32_t>(bytes[1]) << 8)
         | (static_cast<uint32_t>(bytes[2]) << 16)
         | (static_cast<uint32_t>(bytes[3]) << 24);
}

虽然CUDA中不允许未对齐的访问，但可以借助prmt PTX指令方便地在寄存器内模拟未对齐读取的效果。这可以通过少量内联PTX汇编来实现。如果您可以容忍读取越过数组末尾，则代码会变得非常简单：
// WARNING! Reads past ptr!
/*
 * Emulates an unaligned 32-bit load from `ptr` using two aligned loads.
 *
 * The address is rounded down to a multiple of 4, the 32-bit word at that
 * address and the word immediately after it are loaded, and `prmt.b32.f4e`
 * selects four bytes out of that pair, controlled by the low bits of the
 * original address.
 *
 * Both words are always loaded, even when `ptr` is already 4-byte aligned,
 * so up to 4 bytes beyond the word holding the last requested byte are
 * touched. The caller must make sure that memory is readable.
 *
 * ptr      address of the first byte to read (any alignment)
 * returns  the 32-bit value assembled by the PTX sequence below
 */
__device__ uint32_t read_unaligned(void* ptr)
{
    uint32_t result;
    // `volatile` plus the "memory" clobber tell the compiler that this
    // statement reads user memory through the pointer operand. Without them
    // the asm is treated as depending only on the pointer value and may be
    // moved across, or merged over, stores to the same buffer.
    asm volatile("{\n\t"
        "   .reg .b64    aligned_ptr;\n\t"
        "   .reg .b32    low, high, alignment;\n\t"
        "   and.b64      aligned_ptr, %1, 0xfffffffffffffffc;\n\t"  // clear the two low address bits
        "   ld.u32       low, [aligned_ptr];\n\t"                   // word containing the first byte
        "   ld.u32       high, [aligned_ptr+4];\n\t"                // following word
        "   cvt.u32.u64  alignment, %1;\n\t"                        // low 32 bits of the address
        "   prmt.b32.f4e %0, low, high, alignment;\n\t"             // byte select across the pair
        "}"
        : "=r"(result) : "l"(ptr) : "memory");
    return result;
}

为了确保越过数组末尾的访问不会造成危害，请将分配的字节数向上取整为4的倍数，然后再额外增加4个字节
上述设备代码与在允许未对齐访问的小端序（little-endian）主机上运行的以下代码具有相同的效果：
/*
 * Host-side reference for an unaligned 32-bit read.
 *
 * Reads the four bytes starting at `ptr` and assembles them in little-endian
 * order (first byte is the least significant). Going through individual
 * bytes avoids dereferencing a misaligned uint32_t pointer, which is
 * undefined behaviour in C++, and makes the result independent of the
 * host's native byte order.
 *
 * ptr      address of the first of four readable bytes (any alignment)
 * returns  bytes[0] | bytes[1] << 8 | bytes[2] << 16 | bytes[3] << 24
 */
__host__ uint32_t read_unaligned_host(void* ptr)
{
    const unsigned char* bytes = static_cast<const unsigned char*>(ptr);
    return  static_cast<uint32_t>(bytes[0])
         | (static_cast<uint32_t>(bytes[1]) << 8)
         | (static_cast<uint32_t>(bytes[2]) << 16)
         | (static_cast<uint32_t>(bytes[3]) << 24);
}

正如@DietrichEpp所建议的，您必须进行两次加载；并且正如@tera所建议的，您可以使用prmt
PTX指令，以很低的开销将这两次加载的结果组合起来，即使事先不知道未对齐的偏移量（即uint8Array
的初始地址是任意的）
我将提供一个基于@tera的解决方案，让您可以：
value = read_unaligned(&uint8Array[offset]);

安全且（相对）高效。此外，它将只有一条内联PTX汇编指令，如果需要，还将有一个“不安全”变体：
#include <cstdint>
#include <cuda_runtime_api.h>

/*
 * Thin wrapper around the PTX `prmt.b32.f4e` instruction.
 *
 * The three arguments are handed to the instruction unchanged as its source
 * operands, in the order given, and the instruction's 32-bit destination is
 * returned. In "f4e" (forward 4 extract) mode the PTX ISA describes the
 * result as four consecutive bytes taken from the 8-byte pair formed by the
 * two words, starting at the byte position given by the low bits of
 * `control_bits`.
 *
 * first_word    first source operand (lower-addressed word of the pair)
 * second_word   second source operand (the word that follows it)
 * control_bits  selector operand
 */
__device__ __forceinline__ uint32_t prmt_forward_4_extract(
    uint32_t first_word,
    uint32_t second_word, 
    uint32_t control_bits)
{
    uint32_t extracted;
    asm(
        "prmt.b32.f4e %0, %1, %2, %3;"
        : "=r"(extracted)
        : "r"(first_word),
          "r"(second_word),
          "r"(control_bits)
    );
    return extracted;
}

/*
 * Unconditional two-load variant of the unaligned read.
 *
 * Always loads the naturally aligned 32-bit word containing the first
 * requested byte AND the word right after it, then lets
 * prmt_forward_4_extract() pick the requested bytes. It may therefore read
 * past the aligned word that holds the last relevant byte; the caller must
 * guarantee that the following word is readable.
 *
 * ptr      address of the first byte to read (any alignment)
 * returns  the value produced by prmt_forward_4_extract() for the two words
 */
__device__ inline uint32_t read_unaligned_unsafe(const uint32_t* __restrict__ ptr)
{
    const uint64_t address = (uint64_t) ptr;

    // Dropping the two lowest address bits yields a 4-byte aligned base.
    const uint32_t* word_base = (const uint32_t*) (address & ~((uint64_t) 0x3));

    const uint32_t low_word  = word_base[0];
    const uint32_t high_word = word_base[1];

    // The low 32 bits of the original address are passed as the selector.
    return prmt_forward_4_extract(low_word, high_word, (uint32_t) address);
}

/*
 * Reads a 32-bit value starting at `ptr`, which need not be 4-byte aligned.
 *
 * If `ptr` is already 4-byte aligned the value is loaded directly, so no
 * memory beyond those four bytes is touched. Otherwise the work is delegated
 * to read_unaligned_unsafe(), which loads the two aligned words that the
 * requested bytes straddle.
 *
 * ptr      address of the first byte to read (any alignment)
 * returns  the 32-bit value located at `ptr`
 */
__device__ inline uint32_t read_unaligned(const uint32_t* __restrict__ ptr)
{
    // The inner parentheses are essential: `==` binds tighter than `&`, so
    // `addr & 0x3 == 0` would evaluate as `addr & (0x3 == 0)`, i.e. always 0,
    // and the aligned fast path would never be taken.
    const bool ptr_is_already_aligned = (((uint64_t)(ptr) & 0x3) == 0);
    if (ptr_is_already_aligned) { return *ptr; }
    return read_unaligned_unsafe(ptr);
}

#包括
#包括
__设备强制内联2前向4提取(
uint32第一个字，
uint32第二个字，
uint32（控制位）
{
uint32_t结果；
asm（“prmt.b32.f4e%0、%1、%2、%3
：“=r”（结果）
：“r”（第一个字），“r”（第二个字），“r”（控制位）；
返回结果；
}
/*
*这种不安全、速度更快的变体可能会读取超过32位自然对齐的数据
*包含最后一个相关字节的字
*/
__设备在线读未对齐不安全（常数uint32*\uuuuu限制\uuuuuu ptr）
{
/*
*清除地址底部的2位，使结果对齐
*用于读取32位（=4字节）值
*/
自动对齐ptr=（uint32_t*）（（uint64_t）ptr&~（uint64_t）0x3））；
自动第一个\u值=*对齐\u ptr；
自动秒_值=*（对齐_ptr+1）；
自动降低单词的位置=（uint32_t）（（uint64_t）（ptr））；
返回prmt_forward_4_extract（第一个_值、第二个_值、较低的_单词_的_ptr）；
}
__设备在线uint32读取未对齐（常数uint32*\uuuu限制\uuuuu ptr）
{
自动ptr_已对齐=（（uint64_t）（ptr）和0x3==0）；
如果（ptr_已_对齐）{return*ptr；}
返回读取未对齐不安全（ptr）；
}
正如@DietrichEpp所建议的，您必须进行两次加载；并且正如@tera所建议的，您可以使用prmt
PTX指令，以很低的开销将这两次加载的结果合并起来，即使事先不知道未对齐的偏移量（即uint8Array
的初始地址是任意的）
我将提供一个基于@tera的解决方案，让您可以：
value = read_unaligned(&uint8Array[offset]);

安全且（相对）高效。此外，它将只有一条内联PTX汇编指令，如果需要，还将有一个“不安全”变体：
#include <cstdint>
#include <cuda_runtime_api.h>

/*
 * Thin wrapper around the PTX `prmt.b32.f4e` instruction.
 *
 * The three arguments are handed to the instruction unchanged as its source
 * operands, in the order given, and the instruction's 32-bit destination is
 * returned. In "f4e" (forward 4 extract) mode the PTX ISA describes the
 * result as four consecutive bytes taken from the 8-byte pair formed by the
 * two words, starting at the byte position given by the low bits of
 * `control_bits`.
 *
 * first_word    first source operand (lower-addressed word of the pair)
 * second_word   second source operand (the word that follows it)
 * control_bits  selector operand
 */
__device__ __forceinline__ uint32_t prmt_forward_4_extract(
    uint32_t first_word,
    uint32_t second_word, 
    uint32_t control_bits)
{
    uint32_t extracted;
    asm(
        "prmt.b32.f4e %0, %1, %2, %3;"
        : "=r"(extracted)
        : "r"(first_word),
          "r"(second_word),
          "r"(control_bits)
    );
    return extracted;
}

/*
 * Unconditional two-load variant of the unaligned read.
 *
 * Always loads the naturally aligned 32-bit word containing the first
 * requested byte AND the word right after it, then lets
 * prmt_forward_4_extract() pick the requested bytes. It may therefore read
 * past the aligned word that holds the last relevant byte; the caller must
 * guarantee that the following word is readable.
 *
 * ptr      address of the first byte to read (any alignment)
 * returns  the value produced by prmt_forward_4_extract() for the two words
 */
__device__ inline uint32_t read_unaligned_unsafe(const uint32_t* __restrict__ ptr)
{
    const uint64_t address = (uint64_t) ptr;

    // Dropping the two lowest address bits yields a 4-byte aligned base.
    const uint32_t* word_base = (const uint32_t*) (address & ~((uint64_t) 0x3));

    const uint32_t low_word  = word_base[0];
    const uint32_t high_word = word_base[1];

    // The low 32 bits of the original address are passed as the selector.
    return prmt_forward_4_extract(low_word, high_word, (uint32_t) address);
}

/*
 * Reads a 32-bit value starting at `ptr`, which need not be 4-byte aligned.
 *
 * If `ptr` is already 4-byte aligned the value is loaded directly, so no
 * memory beyond those four bytes is touched. Otherwise the work is delegated
 * to read_unaligned_unsafe(), which loads the two aligned words that the
 * requested bytes straddle.
 *
 * ptr      address of the first byte to read (any alignment)
 * returns  the 32-bit value located at `ptr`
 */
__device__ inline uint32_t read_unaligned(const uint32_t* __restrict__ ptr)
{
    // The inner parentheses are essential: `==` binds tighter than `&`, so
    // `addr & 0x3 == 0` would evaluate as `addr & (0x3 == 0)`, i.e. always 0,
    // and the aligned fast path would never be taken.
    const bool ptr_is_already_aligned = (((uint64_t)(ptr) & 0x3) == 0);
    if (ptr_is_already_aligned) { return *ptr; }
    return read_unaligned_unsafe(ptr);
}

#包括
#包括
__设备强制内联2前向4提取(
uint32第一个字，
uint32第二个字，
uint32（控制位）
{
uint32_t结果；
asm（“prmt.b32.f4e%0、%1、%2、%3
：“=r”（结果）
：“r”（第一个字），“r”（第二个字），“r”（控制位）；
返回结果；
}
/*
*这种不安全、速度更快的变体可能会读取超过32位自然对齐的数据
*包含最后一个相关字节的字
*/
__设备在线读未对齐不安全（常数uint32*\uuuuu限制\uuuuuu ptr）
{
/*
*清除地址底部的2位，使结果对齐
*用于读取32位（=4字节）值
*/
自动对齐ptr=（uint32_t*）（（uint64_t）ptr&~（uint64_t）0x3））；
自动第一个\u值=*对齐\u ptr；
自动秒_值=*（对齐_ptr+1）；
自动降低单词的位置=（uint32_t）（（uint64_t）（ptr））；
返回prmt_forward_4_extract（第一个_值、第二个_值、较低的_单词_的_ptr）；
}
__设备在线uint32读取未对齐（常数uint32*\uuuu限制\uuuuu ptr）
{
自动ptr_已对齐=（（uint64_t）（ptr）和0x3==0）；
如果（ptr_已_对齐）{return*ptr；}
返回读取未对齐不安全（ptr）；
}
您不能这样做。您只能访问正确对齐的单词。（有些处理器可能会允许您这样做，但这是处理器额外的工作，有些处理器不会，C语言也不要求他们这样做。）如果您想以可移植的方式进行未对齐的访问，您必须