C++ Boost Asio async_read sometimes hangs while reading, but not always


I am implementing a small distributed system of N machines. Each of them receives some data from a remote server and then propagates that data to the other N-1 machines. I am using Boost Asio async_read and async_write to implement this. I set up a test cluster of N=30 machines. With smaller data sets (75KB to 750KB received per machine) the program always works. But when I move to a slightly larger data set (7.5MB) I observe strange behaviour: at the beginning the reads and writes proceed as expected, but after a while some machines hang while the others finish, and the number of hanging machines varies from run to run. I printed messages in every handler and found that, on the machines that hang, async_read basically stops completing successfully after some point, so nothing can proceed afterwards. I checked the remote servers and they all finished writing. I have tried using a strand to control the execution order of the asynchronous reads and writes, and I have also tried using different io_services for reading and writing. Neither solved the problem. I am pretty desperate. Can anyone help me?

Below is the code of the class that does the reading and the propagation:

const int TRANS_TUPLE_SIZE=15;
const int TRANS_BUFFER_SIZE=5120/TRANS_TUPLE_SIZE*TRANS_TUPLE_SIZE;
class Asio_Trans_Broadcaster
{
private:
   char buffer[TRANS_BUFFER_SIZE];
   int node_id;
   int mpi_size;
   int mpi_rank;
   boost::asio::ip::tcp::socket* dbsocket;
   boost::asio::ip::tcp::socket** sender_sockets;
   int n_send;
   boost::mutex mutex;
   bool done;
public:
   Asio_Trans_Broadcaster(boost::asio::ip::tcp::socket* dbskt, boost::asio::ip::tcp::socket** senderskts,
        int msize, int mrank, int id)
{
    dbsocket=dbskt;
    count=0;
    node_id=id;
    mpi_size=mpi_rank=-1;
    sender_sockets=senderskts;
    mpi_size=msize;
    mpi_rank=mrank;
    n_send=-1;
    done=false;
}

static std::size_t completion_condition(const boost::system::error_code& error, std::size_t bytes_transferred)
{
    int remain=bytes_transferred%TRANS_TUPLE_SIZE;
    if(remain==0 && bytes_transferred>0)
        return 0;
    else
        return TRANS_BUFFER_SIZE-bytes_transferred;
}


void write_handler(const boost::system::error_code &ec, std::size_t bytes_transferred)
{
    int n=-1;
    mutex.lock();
    n_send--;
    n=n_send;
    mutex.unlock();
    fprintf(stdout, "~~~~~~ @%d, write_handler: %d bytes, copies_to_send: %d\n",
                                    node_id, bytes_transferred, n);
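    // Once every pending write for this batch has completed (n reaches 0)
    // and the terminating (negative-id) tuple has not been seen, read the next batch.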
    if(n==0 && !done)
        boost::asio::async_read(*dbsocket,
            boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
            Asio_Trans_Broadcaster::completion_condition, boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
            boost::asio::placeholders::error,
            boost::asio::placeholders::bytes_transferred));
}

void broadcast_handler(const boost::system::error_code &ec, std::size_t bytes_transferred)
{
    fprintf(stdout, "@%d, broadcast_handler: %d bytes, mpi_size:%d, mpi_rank: %d\n", node_id, bytes_transferred, mpi_size, mpi_rank);
    if (!ec)
    {
        int pos=0;
        while(pos<bytes_transferred && pos<TRANS_BUFFER_SIZE)
        {
            int id=-1;
            memcpy(&id, &buffer[pos], 4);
            if(id<0)
            {
                done=true;
                fprintf(stdout, "@%d, broadcast_handler: done!\n", mpi_rank);
                break;
            }

            pos+=TRANS_TUPLE_SIZE;
        }

        mutex.lock();
        n_send=mpi_size-1;
        mutex.unlock();
        for(int i=0; i<mpi_size; i++)
            if(i!=mpi_rank)
            {
                boost::asio::async_write(*sender_sockets[i], boost::asio::buffer(buffer, bytes_transferred),
                                boost::bind(&Asio_Trans_Broadcaster::write_handler, this,
                                boost::asio::placeholders::error,
                                boost::asio::placeholders::bytes_transferred));
            }
    }
    else
    {
        cerr<<mpi_rank<<" error: "<<ec.message()<<endl;
      delete this;
    }


}

void broadcast()
{
    boost::asio::async_read(*dbsocket,
            boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
            Asio_Trans_Broadcaster::completion_condition, boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
            boost::asio::placeholders::error,
            boost::asio::placeholders::bytes_transferred));
}
};
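
The question mentions trying a strand to control the order of the asynchronous reads and writes. For reference, here is a minimal sketch of that approach; it is not taken from the code above, and the names (start_read, on_read, sock, strand, buf) are placeholders:

// Minimal sketch only (placeholder names): wrap the completion handlers in an
// io_service::strand so handlers for the same connection never run concurrently.
#include <boost/asio.hpp>
#include <boost/bind.hpp>

void on_read(const boost::system::error_code& /*ec*/, std::size_t /*bytes_transferred*/)
{
    // ... process the received data, then call start_read() again to keep reading ...
}

void start_read(boost::asio::ip::tcp::socket& sock,
                boost::asio::io_service::strand& strand,
                char* buf, std::size_t len)
{
    boost::asio::async_read(sock,
        boost::asio::buffer(buf, len),
        strand.wrap(boost::bind(on_read,
            boost::asio::placeholders::error,
            boost::asio::placeholders::bytes_transferred)));
}
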
I have no idea what your code wants to achieve. There are too many missing bits.

Of course, if the task is to asynchronously send/receive traffic on network sockets, Asio is just the thing for that. It is hard to see what is special about your code.

I would suggest tackling the more obvious problems first:

  • There is (almost) no error handling (check your error_code-s!); a minimal error-checking handler is sketched right after this list
  • Unless you are on a funny platform, the format strings should use %lu for size_t
  • Why fiddle with raw arrays, of possibly bad sizes, when you could just use a std::vector?
  • Never assume the size of objects when you can use sizeof:

    memcpy(&id, &trans_buffer[pos], sizeof(id));
    
  • Come to think of it, the indexing of the buffer does not look safe anyway:

        while(pos < bytes_transferred && pos < TRANS_BUFFER_SIZE)
        {
            int id = -1;
            memcpy(&id, &buffer[pos], sizeof(id));
    
  • Instead of juggling raw io_service objects, work items and thread pools by hand, wrap them up (see the service_wrap struct in the starting point at the end), so you can simply write:

    service_wrap senders(NUM_THREADS);
    service_wrap receivers(1);

    Wow. Did you see that? No more chance of errors. If you fix one pool, you automatically fix the other. No more deleting the first pool, no more .reset()-ing the second work item. In short: no more messy code and less complexity.

  • Use exception-safe locking guards:

    int local_n_send = -1; // not clear naming
    {
        boost::lock_guard<boost::mutex> lk(mutex);
        n_send--;
        local_n_send = n_send;
    }

    so that the re-broadcast test outside the lock becomes:

    if(local_n_send == 0 && !done)
        broadcast();
    
  • I think there is still a race condition there: not a data race on the access to n_send itself, but the decision to re-broadcast may be wrong if n_send reaches zero just after the lock has been released. Since broadcast() only starts an asynchronous operation, you can simply do it while holding the lock and get rid of the race condition:

void write_handler(const error_code &ec, size_t bytes_transferred) {
    boost::lock_guard<boost::mutex> lk(mutex);

    if(!(done || --n_send))
        broadcast();
}
Whoop whoop. That's three lines of code now. Less code is less bugs.
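
For the first bullet above, here is a minimal sketch of what an error-checking handler could look like; the handler shape and the log messages are illustrative and not taken from the original code:

// Minimal sketch only: bail out on any error instead of silently continuing,
// and print size_t values portably.
#include <boost/system/error_code.hpp>
#include <cstddef>
#include <cstdio>

void write_handler(const boost::system::error_code& ec, std::size_t bytes_transferred)
{
    if (ec)
    {
        // A failed or closed socket should stop the read/write chain, not hang it.
        fprintf(stderr, "write failed: %s\n", ec.message().c_str());
        return;
    }
    fprintf(stdout, "wrote %lu bytes\n", (unsigned long)bytes_transferred);
}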

My guess is that if you diligently scrub the code like this, you will inevitably find your clues. Think of it like looking for a lost wedding ring: you wouldn't leave a mess lying around. Instead, you'd go from room to room and tidy everything up, throwing everything "out" first if need be.

If you can make this thing self-contained and/or reproducible, I'll even debug it further for you!

Cheers

Here's a starting point I put together while looking at the code:

#include <boost/asio.hpp>
#include <boost/thread.hpp>
#include <boost/array.hpp>
#include <boost/make_shared.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <iostream>

const/*expr*/ int TRANS_TUPLE_SIZE  = 15;
const/*expr*/ int TRANS_BUFFER_SIZE = 5120 / TRANS_TUPLE_SIZE * TRANS_TUPLE_SIZE;

namespace AsioTrans
{
    using boost::system::error_code;
    using namespace boost::asio;

    typedef ip::tcp::socket             socket_t;
    typedef boost::ptr_vector<socket_t> socket_list;

    class Broadcaster
    {
    private:
        boost::array<char, TRANS_BUFFER_SIZE> trans_buffer;

        int node_id;
        int mpi_rank;

        socket_t&    dbsocket;
        socket_list& sender_sockets;

        int n_send;
        boost::mutex mutex;
        bool done;
    public:
        Broadcaster(
            socket_t& dbskt,
            socket_list& senderskts,
            int mrank,
            int id) : 
                node_id(id),
                mpi_rank(mrank),
                dbsocket(dbskt),
                sender_sockets(senderskts),
                n_send(-1),
                done(false)
        {
            // count=0;
        }

        static size_t completion_condition(const error_code& error, size_t bytes_transferred)
        {
            // TODO FIXME handler error_code here
            int remain = bytes_transferred % TRANS_TUPLE_SIZE;

            if(bytes_transferred && !remain)
            {
                return 0;
            }
            else
            {
                return TRANS_BUFFER_SIZE - bytes_transferred;
            }
        }

        void write_handler(const error_code &ec, size_t bytes_transferred)
        {
            // TODO handle errors
            // TODO check bytes_transferred
            boost::lock_guard<boost::mutex> lk(mutex);

            if(!(done || --n_send))
                broadcast();
        }

        void broadcast_handler(const error_code &ec, size_t bytes_transferred)
        {
            fprintf(stdout, "@%d, broadcast_handler: %lu bytes, mpi_size:%lu, mpi_rank: %d\n", node_id, bytes_transferred, sender_sockets.size(), mpi_rank);

            if(!ec)
            {
                for(size_t pos = 0; (pos < bytes_transferred && pos < TRANS_BUFFER_SIZE); pos += TRANS_TUPLE_SIZE)
                {
                    int id = -1;
                    memcpy(&id, &trans_buffer[pos], sizeof(id));

                    if(id < 0)
                    {
                        done = true;
                        fprintf(stdout, "@%d, broadcast_handler: done!\n", mpi_rank);
                        break;
                    }
                }

                {
                    boost::lock_guard<boost::mutex> lk(mutex);
                    n_send = sender_sockets.size() - 1;
                }

                for(int i = 0; size_t(i) < sender_sockets.size(); i++)
                {
                    if(i != mpi_rank)
                    {
                        async_write(
                                sender_sockets[i], 
                                buffer(trans_buffer, bytes_transferred),
                                boost::bind(&Broadcaster::write_handler, this, placeholders::error, placeholders::bytes_transferred));
                    }
                }
            }
            else
            {
                std::cerr << mpi_rank << " error: " << ec.message() << std::endl;
                delete this;
            }
        }

        void broadcast()
        {
            async_read(
                    dbsocket,
                    buffer(trans_buffer),
                    Broadcaster::completion_condition, 
                    boost::bind(&Broadcaster::broadcast_handler, this,
                        placeholders::error,
                        placeholders::bytes_transferred));
        }
    };

    struct service_wrap {
        service_wrap(int threads) {
            while(threads--)
                _pool.create_thread(boost::bind(&io_service::run, boost::ref(_service)));
        }

        ~service_wrap() {
            _service.post(boost::bind(&service_wrap::stop, this));
            _pool.join_all();
        }

        io_service& service() { return _service; }

    private: // mind the initialization order!
        io_service                        _service;
        boost::optional<io_service::work> _work;
        boost::thread_group               _pool;

        void stop() { 
            _work = boost::none;
        }
    };

    extern void AsioConnectToRemote(int, int, io_service&, socket_t&, bool);
    extern void SetupAsioConnectionsWIthOthers(io_service&, socket_list&, std::string, int, bool);
}

int main()
{
    using namespace AsioTrans;

    // there's no use in increasing #threads unless there are blocking operations
    service_wrap senders(boost::thread::hardware_concurrency()); 
    service_wrap receivers(1);

    socket_t receiver_socket(receivers.service());
    AsioConnectToRemote(5000, 1, receivers.service(), receiver_socket, true);

    socket_list send_sockets(30);
    /*hadoopNodes =*/ SetupAsioConnectionsWIthOthers(senders.service(), send_sockets, "hostFileName", 3000, false);

    int mpi_rank = send_sockets.size();
    AsioTrans::Broadcaster db_receiver(receiver_socket, send_sockets, mpi_rank, mpi_rank);
    db_receiver.broadcast();
}