C++ 具有释放-获取内存排序的半无锁spsc队列_C++_Concurrency_Atomic_Memory Barriers

C++ 具有释放-获取内存排序的半无锁spsc队列

c++ concurrency

C++ 具有释放-获取内存排序的半无锁spsc队列,c++,concurrency,atomic,memory-barriers,C++,Concurrency,Atomic,Memory Barriers,下面是一个半无锁spsc队列的简化实现。我有意将其简化为更易于阅读和重现死锁问题问题在于write\u pos\u和read\u pos\u的内存顺序与enqueuer\u in\u sleep\u和dequeuer\u in\u sleep\u有关，这在我在代码中评论的某些情况下可能导致死锁问题: 有什么解决方案可以解决此问题并保持write\u pos\u和read\u pos\u释放获取上操作的内存顺序？使用信号量、条件变量和c++20原子通知等待函数与使用seq_cst内存排序具有相

下面是一个半无锁spsc队列的简化实现。我有意将其简化为更易于阅读和重现死锁问题

问题在于

write\u pos\u

和

read\u pos\u

的内存顺序与

enqueuer\u in\u sleep\u和dequeuer\u in\u sleep\u
有关，这在我在代码中评论的某些情况下可能导致死锁
问题:
有什么解决方案可以解决此问题并保持write\u pos\u
和read\u pos\u
释放获取上操作的内存顺序？使用信号量、条件变量和c++20原子通知等待函数与使用seq_cst内存排序具有相同的性能降级
是否有任何可行的方法加载写入位置
和读取位置
，并确保它们包含具有释放内存顺序的最后存储值？（类似于添加某种延迟）
因为这个问题发生在边缘情况下，所以我在代码注释中提到的延迟解决方案可以保持发布获取顺序，并且队列的性能可以与无锁队列相媲美（接近每秒400米的排队/出队）。使用任何轻量级信号量都会将性能降低到每秒6000万的排队/出列。这种性能下降是因为信号量在每次release（）acquire（）调用中使用xchg，从而导致使用处理器的锁定协议吗
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
静态无效等待地址（std:：atomic*ptr，uint32\t val）
{
做
{
syscall（SYS_futex，reinterpret_cast（ptr），futex_WAIT_PRIVATE，val，NULL，0，0）；
}同时（ptr->load（std:：memory_order_acquire）=val）；/*检查是否存在虚假唤醒*/
}
静态无效唤醒地址（std:：atomic*ptr）
{
syscall（SYS_futex，reinterpret_cast（ptr），futex_WAKE_PRIVATE，1,0,0）；
}
模板
类spsc
{
alignas（64）std:：数组缓冲区{}；
alignas（64）std:：原子写入位置{}；
alignas（64）std:：原子读取位置{}；
alignas（64）std:：原子排队器在睡眠中；
alignas（64）std:：原子出列器在睡眠中；
公众：
无效排队（常量T和值）
{
常量自动写入位置=写入位置加载（标准：：内存顺序松弛）；
自动下一步写入位置=1+写入位置；
if（next_write_pos==缓冲区大小（））
下一步写入位置=0；
uint8_t重试次数=0；
while（next_write_pos==read_pos_.load（std:：memory_order_acquire））
{
如果（++重试次数>16次）
{
/*
假设在下一行中更改enqueuer_in_sleep_之前
线程进入dequeue（）并更新了read\u pos。当我们在睡眠中更改排队器时_
若要设置为true并到达等待地址（），则已更改读取位置并
我们应该立即从内核返回，但情况并非总是这样
因为其他线程使用释放内存顺序更新了读取位置，所以这是可能的
我们读取了它以前的值，并等待\u上的\u address（）在
队列中还有个项目。
如果我们将读取位置上的存储顺序从释放更改为顺序cst
这个问题会得到解决，但总体上会多次下降。
*/
在睡眠存储中排队（true，std:：memory\u order\u seq\u cst）；
/*
如果我们在此添加一些延迟：
std:：this_thread:：sleep_for（std:：chrono:：微秒{1}）；
当我们到达等待地址（）时，我们可以确保（在某种程度上）我们正在阅读
读取位置上的最后一个存储（相对于睡眠中更改排队器的最后一个存储）
以这种方式解决问题会保留发布获取顺序，但不会
这似乎是一个可行的解决办法。
*/
等待地址（&读取位置，下一个写入位置）；
在睡眠存储中排队（false，std:：memory\u order\u seq\u cst）；
打破
}
}
缓冲区\写入\位置】=值；
写入位置存储（下一个写入位置，标准：：内存、订单和发布）；
if（出列器在睡眠中加载（std:：内存顺序cst））
{
按地址唤醒（&write位置）；
}
}
无效出列（T和值）
{
自动常量读取位置=读取位置加载（标准：：内存顺序松弛）；
uint8_t重试次数=0；
while（read_pos==write_pos_.load（std:：memory_order_acquire））
{
如果（++重试次数>16次）
{
在睡眠存储中退出队列（true，std:：memory\u order\u seq\u cst）；
/*
同样的问题也可能发生在这里。
*/
等待地址（&写入位置，读取位置）；
在睡眠存储中退出队列（false，std:：memory\u order\u seq\u cst）；
打破
}
}
值=缓冲区\读取\位置]；
自动下一步读取位置=读取位置+1；
if（next_read_pos==缓冲区大小（））
下一个读取位置=0；
读取位置存储（下一个读取位置，标准：：内存、订单和释放）；
if（排队器处于睡眠状态加载（std:：内存顺序cst））
{
按地址唤醒（读取位置（&R）；
}
}
};
int main（）
{
自动尝试=0；
对于（；；）
{
constexpr uint64_t迭代次数=100'000；
spsc-spsc；
标准：螺纹t（[&]{
对于（uint64_t i=0；i#include <array>
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static void wait_on_address(std::atomic<uint32_t>* ptr, uint32_t val)
{
    do
    {
        syscall(SYS_futex, reinterpret_cast<uint32_t*>(ptr), FUTEX_WAIT_PRIVATE, val, NULL, 0, 0);
    } while (ptr->load(std::memory_order_acquire) == val); /* check for spurious wakeup */
}

static void wake_by_address(std::atomic<uint32_t>* ptr)
{
    syscall(SYS_futex, reinterpret_cast<uint32_t*>(ptr), FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
}

template<typename T, uint32_t S>
class spsc
{
    alignas(64) std::array<T, S> buffer_{};
    alignas(64) std::atomic<uint32_t> write_pos_{};
    alignas(64) std::atomic<uint32_t> read_pos_{};
    alignas(64) std::atomic<bool> enqueuer_in_sleep_{};
    alignas(64) std::atomic<bool> dequeuer_in_sleep_{};

  public:
    void enqueue(const T& value)
    {
        const auto write_pos = write_pos_.load(std::memory_order_relaxed);

        auto next_write_pos = 1 + write_pos;
        if (next_write_pos == buffer_.size())
            next_write_pos = 0;

        uint8_t retries = 0;
        while (next_write_pos == read_pos_.load(std::memory_order_acquire))
        {
            if (++retries > 16)
            {
                /*
                    suppose that, before we change enqueuer_in_sleep_ in the next line, the other
                    thread entered dequeue() and updated read_pos_. when we change enqueuer_in_sleep_
                    to true and reach to wait_on_address(), read_pos_ is already changed and
                    we should return immediately from the kernel, but that's not always the case
                    because other thread updated read_pos_ with release memory_order it is possible
                    we read its previous value and wait_on_address() would go to sleep while
                    there is/are item(s) left in the queue.

                    if we change memory_order of store on read_pos_ from release to seq_cst
                    the problem would be solved but overall profromance drops multiple times.
                */
                enqueuer_in_sleep_.store(true, std::memory_order_seq_cst);
                /*
                    if we add some delay here:
                    std::this_thread::sleep_for(std::chrono::microseconds{1});

                    when we reach to wait_on_address() we can make sure (to some degree) we are reading
                    last store on read_pos_ (last store relative to changing enqueuer_in_sleep_)
                    solving the problem in this way preserves release-acquire ordering but it doesn't
                    seem to be a viable solution.
                */
                wait_on_address(&read_pos_, next_write_pos);
                enqueuer_in_sleep_.store(false, std::memory_order_seq_cst);
                break;
            }
        }

        buffer_[write_pos] = value;

        write_pos_.store(next_write_pos, std::memory_order_release);

        if (dequeuer_in_sleep_.load(std::memory_order_seq_cst))
        {
            wake_by_address(&write_pos_);
        }
    }

    void dequeue(T& value)
    {
        auto const read_pos = read_pos_.load(std::memory_order_relaxed);

        uint8_t retries = 0;
        while (read_pos == write_pos_.load(std::memory_order_acquire))
        {
            if (++retries > 16)
            {
                dequeuer_in_sleep_.store(true, std::memory_order_seq_cst);
                /*
                    same problem can happen here.
                */
                wait_on_address(&write_pos_, read_pos);
                dequeuer_in_sleep_.store(false, std::memory_order_seq_cst);
                break;
            }
        }

        value = buffer_[read_pos];

        auto next_read_pos = read_pos + 1;
        if (next_read_pos == buffer_.size())
            next_read_pos = 0;

        read_pos_.store(next_read_pos, std::memory_order_release);

        if (enqueuer_in_sleep_.load(std::memory_order_seq_cst))
        {
            wake_by_address(&read_pos_);
        }
    }
};

int main()
{
    auto attempts = 0;
    for (;;)
    {
        constexpr uint64_t iterations = 100'000;
        spsc<uint64_t, 4> spsc;

        std::thread t([&] {
            for (uint64_t i = 0; i < iterations; i++)
            {
                uint64_t v;
                spsc.dequeue(v);
            }
        });

        for (uint64_t i = 0; i < iterations; i++)
        {
            spsc.enqueue(i);
        }

        t.join();

        std::cout << "attempts to deadlock: " << ++attempts << std::endl;
    };
}