CUDA：无符号字符（unsigned char）上的原子操作_Cuda_Atomic

CUDA：无符号字符（unsigned char）上的原子操作

cuda

CUDA：无符号字符（unsigned char）上的原子操作,cuda,atomic,Cuda,Atomic,我是CUDA的初学者。我在全局内存中有一个无符号字符的像素缓冲区，可以由任何线程更新。因此，为了避免像素值出现异常，我想在线程尝试更新像素值时执行atomicExch。但是编程指南说这个函数只在32位或64位的字上工作，而我只想原子地交换一个8位字节。有办法做到这一点吗？谢谢。您可以使用互斥变量实现临界区。大概是 get_the_lock exch_data release 我最近遇到了这个问题。理论上，原子操作/乐观重试应该比锁/互斥锁更快，因此对其他数据类型使用原子操作的“黑客”解决方

我是CUDA的初学者。我在全局内存中有一个无符号字符（unsigned char）的像素缓冲区，任何线程都可能更新它。因此，为了避免像素值出现异常，我想在线程更新像素值时执行atomicExch。但是编程指南说这个函数只能作用于32位或64位的字，而我只想原子地交换一个8位字节。有办法做到这一点吗？

谢谢。

您可以使用互斥变量来实现临界区（critical section），大致如下：

get_the_lock
exch_data
release

我最近遇到了这个问题。理论上，原子操作/乐观重试应该比锁/互斥锁更快，因此在我看来，对其他数据类型借用原子操作的“取巧”方案比使用临界区更好。

下面是一些基于其他相关讨论帖整理的实现。

我已经测试了所有这些，并且我的测试似乎表明它们到目前为止运行良好

用于字符的atomicAdd版本1

/**
 * Atomically adds `val` to the char stored at `address`.
 *
 * The update is performed with a 32-bit atomicCAS retry loop on the
 * 4-byte-aligned word that contains the target byte; the other three bytes
 * of that word are written back unchanged.
 *
 * @param address  Location of the char to update.
 * @param val      Amount to add (only the low 8 bits of the sum are stored).
 * @return The value the char held immediately before the addition.
 */
__device__ static inline char atomicAdd(char* address, char val) {
    // Position (0..3) of the target byte inside its enclosing aligned word.
    const size_t byte_index = reinterpret_cast<size_t>(address) & 3;
    // Start of the aligned 32-bit word that overlaps `address`.
    unsigned int* const word_ptr = reinterpret_cast<unsigned int*>(address - byte_index);
    // __byte_perm selectors: digits 0-3 keep the matching byte of the first
    // argument, digit 4 drops in the lowest byte of the second argument.
    const unsigned int splice_table[] = {0x3214, 0x3240, 0x3410, 0x4210};
    const unsigned int splice = splice_table[byte_index];

    unsigned int observed = *word_ptr;
    unsigned int snapshot;
    do {
        snapshot = observed;
        // Move the target byte into the low byte and add val; only the low
        // byte of `sum` is spliced back, so the upper bytes are irrelevant.
        const unsigned int sum = __byte_perm(snapshot, 0, byte_index) + val;
        const unsigned int updated = __byte_perm(snapshot, sum, splice);
        // Publish only if nobody modified the word since we took the snapshot.
        observed = atomicCAS(word_ptr, snapshot, updated);
    } while (observed != snapshot);

    // Low byte of the permuted word is the target byte's previous value.
    return __byte_perm(observed, 0, byte_index);
}

/**
 * Atomic compare-and-swap on a single char.
 *
 * Stores `desired` at `address` only if the char currently stored there
 * equals `expected`. Implemented with the 32-bit atomicCAS on the
 * 4-byte-aligned word containing the char.
 *
 * The previous implementation compared the 32-bit CAS against the whole
 * observed word, so `desired` was written whenever the word was unchanged,
 * even if the target byte did not equal `expected`. Here the comparand word
 * carries `expected` in the target byte, so the hardware CAS can only
 * succeed when that byte really matches.
 *
 * @param address   Location of the char to update.
 * @param expected  Value the char must currently hold for the swap to occur.
 * @param desired   Value to store when the comparison succeeds.
 * @return The value observed at `address`; equal to `expected` if and only
 *         if the swap was performed.
 */
__device__ static inline char atomicCAS(char* address, char expected, char desired) {
    // Position (0..3) of the target byte inside its enclosing aligned word.
    const size_t long_address_modulo = reinterpret_cast<size_t>(address) & 3;
    // Start of the aligned 32-bit word that overlaps `address`.
    auto* base_address = reinterpret_cast<unsigned int*>(address - long_address_modulo);
    // __byte_perm selectors: digits 0-3 keep the matching byte of the first
    // argument, digit 4 drops in the lowest byte of the second argument.
    const unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
    const unsigned int sel = selectors[long_address_modulo];

    // Only the low byte of these is ever selected by __byte_perm.
    const unsigned int long_expected = static_cast<unsigned char>(expected);
    const unsigned int long_desired = static_cast<unsigned char>(desired);

    // Initial guess for the neighbouring bytes; validated by the CAS below.
    unsigned int long_old = *base_address;
    unsigned int comparand;
    char old;

    do {
        // Same neighbours as last observed, target byte forced to expected / desired.
        comparand = __byte_perm(long_old, long_expected, sel);
        const unsigned int replacement = __byte_perm(long_old, long_desired, sel);
        long_old = atomicCAS(base_address, comparand, replacement);
        old = static_cast<char>((long_old >> (long_address_modulo * 8)) & 0x000000ff);
        // Retry only when the target byte matched but a neighbouring byte
        // changed underneath us, which made the 32-bit CAS fail spuriously.
    } while (expected == old && long_old != comparand);

    return old;
}

用于字符的atomicCAS

/**
 * Atomically adds `val` to the char stored at `address`.
 *
 * The update is performed with a 32-bit atomicCAS retry loop on the
 * 4-byte-aligned word that contains the target byte; the other three bytes
 * of that word are written back unchanged.
 *
 * @param address  Location of the char to update.
 * @param val      Amount to add (only the low 8 bits of the sum are stored).
 * @return The value the char held immediately before the addition.
 */
__device__ static inline char atomicAdd(char* address, char val) {
    // Position (0..3) of the target byte inside its enclosing aligned word.
    const size_t byte_index = reinterpret_cast<size_t>(address) & 3;
    // Start of the aligned 32-bit word that overlaps `address`.
    unsigned int* const word_ptr = reinterpret_cast<unsigned int*>(address - byte_index);
    // __byte_perm selectors: digits 0-3 keep the matching byte of the first
    // argument, digit 4 drops in the lowest byte of the second argument.
    const unsigned int splice_table[] = {0x3214, 0x3240, 0x3410, 0x4210};
    const unsigned int splice = splice_table[byte_index];

    unsigned int observed = *word_ptr;
    unsigned int snapshot;
    do {
        snapshot = observed;
        // Move the target byte into the low byte and add val; only the low
        // byte of `sum` is spliced back, so the upper bytes are irrelevant.
        const unsigned int sum = __byte_perm(snapshot, 0, byte_index) + val;
        const unsigned int updated = __byte_perm(snapshot, sum, splice);
        // Publish only if nobody modified the word since we took the snapshot.
        observed = atomicCAS(word_ptr, snapshot, updated);
    } while (observed != snapshot);

    // Low byte of the permuted word is the target byte's previous value.
    return __byte_perm(observed, 0, byte_index);
}

/**
 * Atomic compare-and-swap on a single char.
 *
 * Stores `desired` at `address` only if the char currently stored there
 * equals `expected`. Implemented with the 32-bit atomicCAS on the
 * 4-byte-aligned word containing the char.
 *
 * The previous implementation compared the 32-bit CAS against the whole
 * observed word, so `desired` was written whenever the word was unchanged,
 * even if the target byte did not equal `expected`. Here the comparand word
 * carries `expected` in the target byte, so the hardware CAS can only
 * succeed when that byte really matches.
 *
 * @param address   Location of the char to update.
 * @param expected  Value the char must currently hold for the swap to occur.
 * @param desired   Value to store when the comparison succeeds.
 * @return The value observed at `address`; equal to `expected` if and only
 *         if the swap was performed.
 */
__device__ static inline char atomicCAS(char* address, char expected, char desired) {
    // Position (0..3) of the target byte inside its enclosing aligned word.
    const size_t long_address_modulo = reinterpret_cast<size_t>(address) & 3;
    // Start of the aligned 32-bit word that overlaps `address`.
    auto* base_address = reinterpret_cast<unsigned int*>(address - long_address_modulo);
    // __byte_perm selectors: digits 0-3 keep the matching byte of the first
    // argument, digit 4 drops in the lowest byte of the second argument.
    const unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
    const unsigned int sel = selectors[long_address_modulo];

    // Only the low byte of these is ever selected by __byte_perm.
    const unsigned int long_expected = static_cast<unsigned char>(expected);
    const unsigned int long_desired = static_cast<unsigned char>(desired);

    // Initial guess for the neighbouring bytes; validated by the CAS below.
    unsigned int long_old = *base_address;
    unsigned int comparand;
    char old;

    do {
        // Same neighbours as last observed, target byte forced to expected / desired.
        comparand = __byte_perm(long_old, long_expected, sel);
        const unsigned int replacement = __byte_perm(long_old, long_desired, sel);
        long_old = atomicCAS(base_address, comparand, replacement);
        old = static_cast<char>((long_old >> (long_address_modulo * 8)) & 0x000000ff);
        // Retry only when the target byte matched but a neighbouring byte
        // changed underneath us, which made the 32-bit CAS fail spuriously.
    } while (expected == old && long_old != comparand);

    return old;
}

用于字符的atomicAdd版本2（使用位移位而不是字节移位，因此必须处理溢出）

\uuuu设备\uuuu静态内联字符atomicAdd2（字符*地址，字符值）{
大小长地址模=（大小）地址&3；
自动*基地址=（无符号整数*）（（字符*）地址-长地址（模）；
无符号整数long_val=（无符号整数）val>24）；
}否则{
//表示长值内字符值的位
无符号整数掩码=0x000000ff>8*长地址（模）；
}
}

有关atomicMin的实现，请参阅相关讨论帖。

上面回答中的atomicCAS()实现存在错误。下面这个版本对我有效：

/**
 * Atomic compare-and-swap on a single 8-bit value.
 *
 * Stores `value` at `address` only if the byte currently stored there equals
 * `compare`, using the 32-bit ::atomicCAS on the 4-byte-aligned word that
 * contains the byte.
 *
 * @param address  Location of the byte to update.
 * @param compare  Value the byte must currently hold for the swap to occur.
 * @param value    Value to store when the comparison succeeds.
 * @return The byte value observed at `address`; equal to `compare` if and
 *         only if the swap was performed.
 */
__device__
static inline
uint8_t
atomicCAS( uint8_t * const address,
           uint8_t   const compare,
           uint8_t   const value )
{
    // Determine where in a 4-byte-aligned 32-bit range our address of 8 bits occurs.
    uint8_t    const     longAddressModulo = reinterpret_cast< size_t >( address ) & 0x3;
    // Determine the base address of the aligned 32-bit range that contains our address of 8 bits.
    uint32_t * const     baseAddress       = reinterpret_cast< uint32_t * >( address - longAddressModulo );
    uint32_t   constexpr byteSelection[]   = { 0x3214, 0x3240, 0x3410, 0x4210 }; // The byte position we work on is '4'.
    uint32_t   const     byteSelector      = byteSelection[ longAddressModulo ];
    uint32_t   const     longCompare       = compare;
    uint32_t   const     longValue         = value;
    // Initial guess for the three neighbouring bytes; validated by the CAS below.
    uint32_t             longOldValue      = * baseAddress;
    uint32_t             comparison;
    uint8_t              oldValue;

    do
    {
        // Select bytes from the last observed word plus compare/value to build the 32-bit operands.
        comparison                 = __byte_perm( longOldValue, longCompare, byteSelector );
        uint32_t const replacement = __byte_perm( longOldValue, longValue,   byteSelector );

        // Use 32-bit atomicCAS() to try and set the 8-bits we care about.
        longOldValue = ::atomicCAS( baseAddress, comparison, replacement );
        // Grab the 8-bit portion we care about from the old value at address.
        oldValue     = ( longOldValue >> ( 8 * longAddressModulo )) & 0xFF;
    }
    // Retry only when our byte matched but the 32-bit CAS still failed because
    // one of the other three bytes changed. The test must be against the
    // comparand actually handed to the CAS: comparing against the stale word
    // would loop again after a successful swap whenever the initial read had
    // seen a non-matching byte, and could then report the swap as failed.
    while ( compare == oldValue and comparison != longOldValue );

    return oldValue;
}

/**
 * Atomic compare-and-swap on a single 8-bit value.
 *
 * Stores `value` at `address` only if the byte currently stored there equals
 * `compare`, using the 32-bit ::atomicCAS on the 4-byte-aligned word that
 * contains the byte.
 *
 * @param address  Location of the byte to update.
 * @param compare  Value the byte must currently hold for the swap to occur.
 * @param value    Value to store when the comparison succeeds.
 * @return The byte value observed at `address`; equal to `compare` if and
 *         only if the swap was performed.
 */
__device__
static inline
uint8_t
atomicCAS( uint8_t * const address,
           uint8_t   const compare,
           uint8_t   const value )
{
    // Determine where in a 4-byte-aligned 32-bit range our address of 8 bits occurs.
    uint8_t    const     longAddressModulo = reinterpret_cast< size_t >( address ) & 0x3;
    // Determine the base address of the aligned 32-bit range that contains our address of 8 bits.
    uint32_t * const     baseAddress       = reinterpret_cast< uint32_t * >( address - longAddressModulo );
    uint32_t   constexpr byteSelection[]   = { 0x3214, 0x3240, 0x3410, 0x4210 }; // The byte position we work on is '4'.
    uint32_t   const     byteSelector      = byteSelection[ longAddressModulo ];
    uint32_t   const     longCompare       = compare;
    uint32_t   const     longValue         = value;
    // Initial guess for the three neighbouring bytes; validated by the CAS below.
    uint32_t             longOldValue      = * baseAddress;
    uint32_t             comparison;
    uint8_t              oldValue;

    do
    {
        // Select bytes from the last observed word plus compare/value to build the 32-bit operands.
        comparison                 = __byte_perm( longOldValue, longCompare, byteSelector );
        uint32_t const replacement = __byte_perm( longOldValue, longValue,   byteSelector );

        // Use 32-bit atomicCAS() to try and set the 8-bits we care about.
        longOldValue = ::atomicCAS( baseAddress, comparison, replacement );
        // Grab the 8-bit portion we care about from the old value at address.
        oldValue     = ( longOldValue >> ( 8 * longAddressModulo )) & 0xFF;
    }
    // Retry only when our byte matched but the 32-bit CAS still failed because
    // one of the other three bytes changed. The test is against the comparand
    // actually handed to the CAS, so a successful swap always ends the loop.
    while ( compare == oldValue and comparison != longOldValue );

    return oldValue;
}

更新的内容是什么？如果你只是想把一个数字置为0或1，可以使用atomicAnd/atomicOr。我会根据像素中“驻留”的对象数量来增加像素的值。因此，如果每个对象的增量为50，那么一个对象将生成一个RGBA为(50,50,50,50)的像素，两个对象的RGBA为(100,100,100,100)，依此类推，最大值为(255,255,255,255)。这使我能够根据“位于”该像素内的对象数量来改变像素的强度。顺便说一句，我发现了一个变通办法：由于uchar4占用的空间与int相同（虽然不能保证，但在我的体系结构上可行），我只需要获取uchar4的地址，将其转换为(int*)，然后使用整数版本的atomicExch。不过我仍然很想知道，是否可以只对单个字节执行原子操作……