C Pthreads-线程安全队列实现的问题_C_Multithreading_Thread Safety_Pthreads_Threadpool

C Pthreads-线程安全队列实现的问题

c multithreading

C Pthreads-线程安全队列实现的问题,c,multithreading,thread-safety,pthreads,threadpool,C,Multithreading,Thread Safety,Pthreads,Threadpool,我是多线程新手，我正在尝试实现一个简单的线程安全任务队列，其中每个线程都可以从中提取工作，直到没有更多的任务剩下。任何线程都不会对任务进行排队出于测试目的，每个任务都只包含一个数字 static pthread_mutex_t task_mutex = PTHREAD_MUTEX_INITIALIZER; typedef struct Task{ int number; }Task; typedef struct Cell{

我是多线程新手，我正在尝试实现一个简单的线程安全任务队列，其中每个线程都可以从中提取工作，直到没有更多的任务剩下。任何线程都不会对任务进行排队

出于测试目的，每个任务都只包含一个数字

    static pthread_mutex_t task_mutex = PTHREAD_MUTEX_INITIALIZER;

    typedef struct Task{
       int number;
    }Task;


    typedef struct Cell{
        Task t;
        struct Cell* next;
    }Cell;


    typedef struct TQueue{
        struct Cell* head;
        struct Cell* tail;
    }TQueue;



   /*
    * empty -- report whether the queue holds no tasks.
    *
    * startQueue() makes head and tail refer to the same placeholder cell, so
    * the queue is empty exactly when the two pointers are equal.
    * Returns non-zero when empty, zero otherwise.  Takes no lock itself.
    */
   int empty(TQueue *queue){
      return queue->head == queue->tail;
   }


   /*
    * startQueue -- put a queue into its initial, empty state.
    *
    * Allocates one placeholder cell and points both head and tail at it,
    * which is the condition empty() tests for.  The placeholder's fields are
    * left uninitialised and the allocation result is not checked.
    */
   void startQueue(TQueue *queue){
        Cell *placeholder = malloc(sizeof(Cell));

        queue->tail = placeholder;
        queue->head = placeholder;
   }

   /*
    * enqueue -- append a copy of task C after the current tail cell.
    *
    * Takes no lock and does not check the allocation result.
    */
   void enqueue(TQueue *queue, Task C){
       Cell *fresh = malloc(sizeof(Cell));

       fresh->t = C;
       fresh->next = NULL;

       queue->tail->next = fresh;
       queue->tail = fresh;
   }


    /*
     * dequeue -- remove the oldest task from the queue, under task_mutex.
     *
     * Returns a heap-allocated copy of the task, which the caller owns and
     * must free(), or NULL when the queue is empty.  NULL is also returned
     * when the copy cannot be allocated; the queue is left untouched then.
     *
     * The previous version returned a pointer into the cell that had just
     * become the head of the queue.  That cell is freed by the next call to
     * dequeue(), so a worker could still be reading the task after another
     * thread had released its storage -- the source of the garbage numbers.
     */
    Task * dequeue(TQueue* queue){

       Task * t = NULL;

       pthread_mutex_lock( &task_mutex);

       if(!empty(queue)){

           t = malloc(sizeof *t);

           if(t != NULL){
               struct Cell* p = queue->head;

               /* head is a placeholder; the oldest task is in the cell after it */
               queue->head = p->next;
               *t = queue->head->t;
               free(p);
           }
       }

       pthread_mutex_unlock( &task_mutex);
       return t;
    }

    /*
     * work -- thread entry point: drain the queue, printing each task number.
     *
     * arg is the TQueue to pull from.  Every task handed out by dequeue() is
     * freed here once it has been printed.  Always returns NULL.
     */
    void * work( void* arg){

       TQueue* queue = (TQueue *)arg;
       Task* t;

       while((t = dequeue(queue)) != NULL){
           printf("%d ", t->number);
           free(t);
       }

       return NULL;
    }

对于一个简单的测试，我在main上运行了以下内容：

int main(){

    TQueue* queue = malloc(sizeof(TQueue));
    startQueue(queue);

    pthread_t threads[3];
    Task t[3];


    for(int i = 0; i < 3; i++){
        t[i].number = i + 1;
        enqueue(queue, t[i]);
    }

    for(int i = 0; i < 3; i++) pthread_create(&threads[i], NULL, work, (void*)queue);

    for(int i = 0; i < 3; i++) pthread_join(threads[i], NULL);

    return 0;
}

intmain（）{
TQueue*queue=malloc（sizeof（TQueue））；
startQueue（队列）；
pthread_t线程[3]；
任务t[3]；
对于（int i=0；i<3；i++）{
t[i]。编号=i+1；
排队（排队，t[i]）；
}
对于（inti=0；i<3；i++）pthread_create（&threads[i]，NULL，work，（void*）队列）；
对于（inti=0；i<3；i++）pthread_-join（threads[i]，NULL）；
返回0；
}

预期的输出是任何顺序的

1 2 3

，但有时它会打印一个带有奇怪数字的序列，如

1823219 2 3

。我没能发现任何竞态条件（race condition）或相关问题，因此非常感谢您的帮助。

我还发现了一些错误

我已经为你的代码加了注释。我综合了你的第一个帖子和第二个帖子中的信息。我修改了代码，并同时展示了修改前后的版本[请原谅我顺手做的代码风格清理]：

#include <stdio.h>
#include <pthread.h>
#include <malloc.h>

static pthread_mutex_t task_mutex = PTHREAD_MUTEX_INITIALIZER;

typedef struct Task {
    int number;
} Task;

// Cell -- one node of the singly linked task queue
typedef struct Cell {
// NOTE/BUG: this should be a pointer to the task. otherwise, dequeue gets
// messy
#if 0
    Task t;
#else
    Task *t;                            // the queued task (stored by pointer)
#endif
    struct Cell *next;                  // next cell toward the tail, NULL at the end
} Cell;

typedef struct TQueue {
    struct Cell *head;
    struct Cell *tail;
} TQueue;

// startQueue -- reset a queue to the empty state (head and tail both NULL)
void
startQueue(TQueue *queue)
{

#if 0
    queue->head = malloc(sizeof(Cell));
#else
    // an empty queue has no cells at all, so no placeholder cell is allocated
    queue->head = NULL;
#endif
    queue->tail = NULL;
}

// empty -- return non-zero when the queue has no cells (head is NULL)
// takes no lock itself; dequeue calls it while holding task_mutex
int
empty(TQueue *queue)
{

    // NOTE/BUG: dequeue never touches tail, so this test is incorrect
#if 0
    return (queue->head == queue->tail);
#else
    return (queue->head == NULL);
#endif
}

// enqueue -- append task t to the tail of the queue while holding task_mutex
// the queue stores the pointer only; the result of malloc is not checked
void
enqueue(TQueue *queue, Task *t)
{
    pthread_mutex_lock(&task_mutex);

    Cell *cell = malloc(sizeof(Cell));
    cell->t = t;
    cell->next = NULL;

    // link behind the current tail, or become the head of an empty queue
    if (queue->tail != NULL)
        queue->tail->next = cell;
    else
        queue->head = cell;
    queue->tail = cell;

    pthread_mutex_unlock(&task_mutex);
}

// dequeue -- detach the cell at the head of the queue and return its task
// returns NULL when the queue is empty
// the whole operation runs with task_mutex held; the cell is freed, the task
// it pointed to is not
Task *
dequeue(TQueue *queue)
{
    Task *t;

    pthread_mutex_lock(&task_mutex);

    if (empty(queue))
        t = NULL;

    else {
        Cell *p = queue->head;

        // removing the only cell leaves the queue with no tail either
        if (p == queue->tail)
            queue->tail = NULL;

        queue->head = p->next;

        // NOTE/BUG: this is setting t to the second element in the list,
        // not the first
        // NOTE/BUG: this is also undefined behavior, in original code (with
        // original struct definition), because what t points to _does_ get
        // freed before return
#if 0
        t = &queue->head->t;
#else
        t = p->t;
#endif

        free(p);
    }

    pthread_mutex_unlock(&task_mutex);

    return t;
}

// work -- thread entry point: pull tasks until dequeue returns NULL, printing
// each task number
// arg is the TQueue to drain
void *
work(void *arg)
{

    TQueue *queue = (TQueue *) arg;

    // NOTE/BUG: this gets orphaned on the first call to dequeue
#if 0
    Task *t = malloc(sizeof(Task));
#else
    Task *t;
#endif

    for (t = dequeue(queue); t != NULL; t = dequeue(queue))
        printf("%d ", t->number);

    // NOTE/BUG: this frees some cell allocated in main -- not what we want
    // NOTE(review): the loop above only exits once t is NULL, so the disabled
    // free(t) would have been passed NULL here
#if 0
    free(t);
#endif

    pthread_exit(NULL);
    return 0;
}

// main -- simple test: enqueue three numbered tasks, then start three worker
// threads and wait for them to finish

int
main()
{

    TQueue *queue = malloc(sizeof(TQueue));

    startQueue(queue);

    pthread_t threads[3];
    Task t[3];

    for (int i = 0; i < 3; i++) {
        t[i].number = i + 1;
        // the queue now stores a pointer, so pass the address of element i
        // (the disabled call passed the array itself, i.e. always &t[0])
#if 0
        enqueue(queue, t);
#else
        enqueue(queue, &t[i]);
#endif
    }

    for (int i = 0; i < 3; i++)
        pthread_create(&threads[i], NULL, work, (void *) queue);

    for (int i = 0; i < 3; i++)
        pthread_join(threads[i], NULL);

    return 0;
}

更新#2:

“另外，一个线程（即第一个线程）可以独占队列并在其他线程有机会运行之前耗尽所有条目。”在这种情况下，可以做什么？

有几件事

pthread_create

需要一点时间，这使得线程1可以在其他线程仍在创建时就开始运行。改进方法是先创建所有线程，让每个线程（在其线程控制块中）设置一个“我正在运行”标志。主线程等待所有线程都设置好该标志。然后，主线程设置一个全局 volatile 的“现在全部可以运行”标志，每个线程在进入其主循环之前都自旋等待这个标志。根据我的经验，它们都会在彼此相差几微秒（或更短）的时间内开始运行

我没有在下面更新的代码中实现这一点，因此您可以自己试验它[以及

nanosleep

]

总的来说，互斥锁是相当公平的[至少在linux下]，因为阻塞的线程将排队等待互斥锁。正如我在评论中提到的，也可以使用

nanosleep

，但这[在某种程度上]违背了目的，因为线程会变慢

线程饥饿（starvation）的解药是“公平”。正如我所提到的，有一种精心设计的、无等待（wait-free）的公平算法，即 Kogan/Petrank 算法。它确实有点复杂/高深，所以请自行斟酌、风险自负

但是，折衷方案可能是票证锁（ticket lock）：

我再次修改了该程序。它有池分配、票证与互斥锁以及延迟打印日志项的选项。它还交叉检查线程之间的结果，以确保它们没有重复的条目

当然，所有这些的关键是精确、高精度的日志记录（即，如果你不能测量它，你就不能调优它）

例如，有人会认为，在

dequeue 内部执行 free
比简单地将单元释放到可重用的池（类似于slab分配器）要慢，但是，性能提升没有预期的那么大。这可能是因为glibc的malloc/free
非常快（这就是他们所声称的）
这些不同的版本应该会让您了解如何构建自己的性能度量套件
不管怎样，代码如下：
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <stdatomic.h>
#include <malloc.h>
#include <errno.h>
#include <string.h>
#include <time.h>

int opt_p;                              // print thread output immediately
int opt_T;                              // number of threads
int opt_Q;                              // number of queue items
int opt_L;                              // use ticket lock
int opt_M;                              // use fast cell alloc/free

typedef unsigned char byte;
typedef unsigned int u32;

// sysfault -- print a printf-style message to stderr and terminate the
// process with exit status 1
// NOTE: written with the standard variadic form (... / __VA_ARGS__) instead
// of the GNU-only named form; the first argument must be the format string
#define sysfault(...) \
    do { \
        fprintf(stderr,__VA_ARGS__); \
        exit(1); \
    } while (0)

// lock control -- carries the state for both lock flavours; lock_init()
// selects which one is used
// NOTE: the ticket counters are C11 atomic objects because they are passed to
// the <stdatomic.h> generic functions (atomic_compare_exchange_strong,
// atomic_fetch_add), which are specified for atomic types only
typedef struct AnyLock {
    pthread_mutex_t mutex;              // standard mutex
    atomic_uint seqreq;                 // ticket lock request
    atomic_uint seqacq;                 // ticket lock grant
} AnyLock;

// work value -- the payload handed to a worker thread
typedef struct Task {
    union {
        struct Task *next;              // not referenced by the code shown here
        int number;                     // task number (main assigns i + 1)
    };
} Task;

// queue item -- one node of the singly linked queue
typedef struct Cell {
    struct Cell *next;                  // next cell in the queue (or in the free pool)
    Task *t;                            // the queued task (stored by pointer)
} Cell;

// queue control
typedef struct TQueue {
    struct Cell *head;                  // oldest cell; NULL when the queue is empty
    struct Cell *tail;                  // newest cell; NULL when the queue is empty
} TQueue;

// thread log entry -- one record per dequeued task
typedef struct Log {
    double tvbef;                       // tvgetf() value taken just before dequeue
    double tvaft;                       // tvgetf() value taken just after dequeue
    int number;                         // number of the task that was dequeued
} Log;

// BTVOFF -- byte index within a bit vector of bit number _off
#define BTVOFF(_off) \
    ((_off) >> 3)
// BTVMSK -- mask selecting bit number _off within its byte
#define BTVMSK(_off) \
    (1u << ((_off) & 0x07))

// BTVLEN -- number of bytes needed to hold _len bits (rounded up)
// NOTE: the whole expansion is parenthesized so that the macro can be used
// safely inside a larger expression
#define BTVLEN(_len) \
    (((_len) + 7) >> 3)

// thread control -- per-worker state; a pointer to it is the thread argument
typedef struct Thread {
    pthread_t tid;                      // thread id filled in by pthread_create
    int xid;                            // thread number used in the output (i + 1)
    TQueue *queue;                      // queue this worker pulls from
    Log *log;                           // log buffer (main allocates it only when opt_p is off)
    byte *bitv;                         // bit vector of the task numbers this worker dequeued
} Thread;

// btvset -- set bit number off in the bit vector
// returns the previous state of that bit: non-zero if it was already set
static inline byte
btvset(byte *bitv,long off)
{
    byte *slot = bitv + BTVOFF(off);
    u32 bit = BTVMSK(off);
    byte was = *slot & bit;

    *slot |= bit;

    return was;
}

AnyLock task_mutex;                     // held by enqueue/dequeue around queue updates
AnyLock print_mutex;                    // held by work while it prints its deferred log
double tvzero;                          // value subtracted from every tvgetf() result
Cell *cellpool;                         // free pool of cells
long bitvlen;                           // byte length of each per-thread bit vector

// BARRIER -- empty asm statement with a "memory" clobber (compiler barrier)
#define BARRIER \
    __asm__ __volatile__("" ::: "memory")

// virtual function pointers
// cellnew/cellfree are selected in main from opt_M; lock_acquire/lock_release
// are selected in lock_init from opt_L
Cell *(*cellnew)(void);
void (*cellfree)(Cell *);
void (*lock_acquire)(AnyLock *lock);
void (*lock_release)(AnyLock *lock);

double
tvgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_REALTIME,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    sec -= tvzero;

    return sec;
}

// xalloc -- allocate a zeroed array of cnt elements of siz bytes each
// does not return on failure: reports the errno text through sysfault
void *
xalloc(size_t cnt,size_t siz)
{
    void *mem = calloc(cnt,siz);

    if (! mem)
        sysfault("xalloc: calloc failure -- %s\n",strerror(errno));

    return mem;
}

// lock_wait_ticket -- spin until the grant counter equals our ticket newval
// NOTE: the counter is read directly on every pass; atomic_load is [probably]
// overkill here
void
lock_wait_ticket(AnyLock *lock,u32 newval)
{

    while (lock->seqacq != newval)
        ;
}

// lock_acquire_ticket -- take the next ticket, then wait until it is granted
void
lock_acquire_ticket(AnyLock *lock)
{
    u32 ticket;

    // atomically bump the request counter; our ticket is the value the
    // counter holds after the increment
    ticket = atomic_fetch_add(&lock->seqreq,1) + 1;

    lock_wait_ticket(lock,ticket);
}

// lock_release_ticket -- grant the lock to the holder of the next ticket
// NOTE: atomic_fetch_add is [probably] overkill, but leave it for now
void
lock_release_ticket(AnyLock *lock)
{

    (void) atomic_fetch_add(&lock->seqacq,1);
}

// lock_acquire_mutex -- acquire the lock through its pthread mutex
// the result of pthread_mutex_lock is not examined
void
lock_acquire_mutex(AnyLock *lock)
{
    pthread_mutex_t *mtx = &lock->mutex;

    pthread_mutex_lock(mtx);
}

// lock_release_mutex -- release the lock through its pthread mutex
// the result of pthread_mutex_unlock is not examined
void
lock_release_mutex(AnyLock *lock)
{
    pthread_mutex_t *mtx = &lock->mutex;

    pthread_mutex_unlock(mtx);
}

// lock_init -- initialize one lock and select the global acquire/release
// functions according to opt_L (1 = ticket lock, anything else = mutex)
void
lock_init(AnyLock *lock)
{

    if (opt_L == 1) {
        // lock_acquire_ticket hands out "request + 1", so the first ticket
        // is 1 and the grant counter starts there
        lock->seqreq = 0;
        lock->seqacq = 1;
        lock_acquire = lock_acquire_ticket;
        lock_release = lock_release_ticket;
        return;
    }

    pthread_mutex_init(&lock->mutex,NULL);
    lock_acquire = lock_acquire_mutex;
    lock_release = lock_release_mutex;
}

// startQueue -- reset a queue to the empty state (no head, no tail)
void
startQueue(TQueue *queue)
{

    queue->head = queue->tail = NULL;
}

// empty -- return 1 when the queue has no cells (head is NULL), else 0
int
empty(TQueue *queue)
{

    return ! queue->head;
}

// cellnew_pool -- allocate a queue entry
Cell *
cellnew_pool(void)
{
    int cnt;
    Cell *p;
    Cell *pool;

    while (1) {
        // try for quick allocation
        p = cellpool;

        // bug out if we got it
        if (p != NULL) {
            cellpool = p->next;
            break;
        }

        // go to the heap to replenish the pool
        cnt = 1000;
        p = xalloc(cnt,sizeof(Cell));

        // link up the entries
        pool = NULL;
        for (;  cnt > 0;  --cnt, ++p) {
            p->next = pool;
            pool = p;
        }

        // put this "online"
        cellpool = pool;
    }

    return p;
}

// cellfree_pool -- give a queue entry back by pushing it onto the free pool
// touches the shared cellpool without locking of its own
void
cellfree_pool(Cell *p)
{
    Cell *oldtop = cellpool;

    cellpool = p;
    p->next = oldtop;
}

// cellnew_std -- obtain one zeroed queue entry straight from the heap
Cell *
cellnew_std(void)
{

    return xalloc(1,sizeof(Cell));
}

// cellfree_std -- hand a queue entry back to the heap
void
cellfree_std(Cell *p)
{
    void *mem = p;

    free(mem);
}

// enqueue -- append task t to the tail of the queue while holding task_mutex
// the cell comes from whichever allocator cellnew points at
void
enqueue(TQueue *queue, Task *t)
{
    Cell *cell;

    lock_acquire(&task_mutex);

    cell = cellnew();
    cell->t = t;
    cell->next = NULL;

    // link behind the current tail, or become the head of an empty queue
    if (queue->tail != NULL)
        queue->tail->next = cell;
    else
        queue->head = cell;
    queue->tail = cell;

    lock_release(&task_mutex);
}

// dequeue -- detach the cell at the head of the queue and return its task
// returns NULL when the queue is empty
// runs with task_mutex held; the cell goes back through cellfree
Task *
dequeue(TQueue *queue)
{
    Task *task = NULL;
    Cell *front;

    lock_acquire(&task_mutex);

    front = queue->head;
    if (front != NULL) {
        // removing the only cell leaves the queue without a tail as well
        if (queue->tail == front)
            queue->tail = NULL;

        queue->head = front->next;
        task = front->t;

        cellfree(front);
    }

    lock_release(&task_mutex);

    return task;
}

// work -- thread entry point: pull tasks until the queue is empty
// arg is this worker's Thread control block
// every dequeue is timed; the result is either printed at once (opt_p) or
// stored in the thread's log and printed after the loop under print_mutex
// returns a null pointer
void *
work(void *arg)
{
    Thread *tskcur = arg;
    TQueue *queue = tskcur->queue;
    Task *t;
    Log *log;
    long cnt;
    int tprev;
    byte *bitv;
    double tvbeg;
    double tvbef;
    double tvaft;

    log = tskcur->log;
    bitv = tskcur->bitv;
    tvbeg = tvgetf();

    // tprev is the previously printed task number (used to show the gap)
    tprev = 0;
    while (1) {
        // the timestamps bracket the dequeue call only
        tvbef = tvgetf();
        t = dequeue(queue);
        tvaft = tvgetf();

        if (t == NULL)
            break;

        // abort if we get a double entry
        if (btvset(bitv,t->number))
            sysfault("work: duplicate\n");

        // immediate mode: print now and skip the log entirely
        if (opt_p) {
            printf("[%.9f/%.9f %5.5d] %d [%d]\n",
                tvbef,tvaft - tvbef,tskcur->xid,t->number,t->number - tprev);
            tprev = t->number;
            continue;
        }

        // deferred mode: record the entry and advance to the next log slot
        log->tvbef = tvbef;
        log->tvaft = tvaft;
        log->number = t->number;
        ++log;
    }

    if (! opt_p) {
        tvaft = tvgetf();

        // number of entries written, then rewind to the start of the log
        cnt = log - tskcur->log;
        log = tskcur->log;

        // keep this thread's whole report together in the output
        lock_acquire(&print_mutex);

        printf("\n");
        printf("THREAD=%5.5d START=%.9f STOP=%.9f ELAP=%.9f TOTAL=%ld\n",
            tskcur->xid,tvbeg,tvaft,tvaft - tvbeg,cnt);

        tprev = 0;
        for (;  cnt > 0;  --cnt, ++log) {
            printf("[%.9f/%.9f %5.5d] %d [%d]\n",
                log->tvbef,log->tvaft - log->tvbef,tskcur->xid,
                log->number,log->number - tprev);
            tprev = log->number;
        }

        lock_release(&print_mutex);
    }

    return (void *) 0;
}

// btvchk -- cross-check the bit vectors of two threads
// prints the pair being compared and aborts through sysfault if any task
// number is marked in both vectors
void
btvchk(Thread *tska,Thread *tskb)
{
    byte *lhs = tska->bitv;
    byte *rhs = tskb->bitv;

    printf("btvchk: %d ??? %d\n",tska->xid,tskb->xid);

    // a common bit means both threads dequeued the same task
    for (int pos = 0;  pos < bitvlen;  ++pos) {
        if (lhs[pos] & rhs[pos])
            sysfault("btvchk: duplicate\n");
    }
}

// For a simple test i runned this on main:

int
main(int argc,char **argv)
{
    char *cp;
    TQueue *queue;
    Task *t;
    Thread *tsk;

    --argc;
    ++argv;

    for (;  argc > 0;  --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;

        switch (cp[1]) {
        case 'p':  // print immediately
            opt_p = 1;
            break;

        case 'Q':  // number of queue items
            opt_Q = atoi(cp + 2);
            break;

        case 'T':  // number of threads
            opt_T = atoi(cp + 2);
            break;

        case 'L':
            opt_L = 1;
            break;

        case 'M':
            opt_M = 1;
            break;

        default:
            break;
        }
    }

    printf("p=%d -- thread log is %s\n",opt_p,opt_p ? "immediate" : "deferred");

    if (opt_T == 0)
        opt_T = 16;
    printf("T=%d (number of threads)\n",opt_T);

    if (opt_Q == 0)
        opt_Q = 1000000;
    printf("Q=%d (number of items to enqueue)\n",opt_Q);

    printf("L=%d -- lock is %s\n",opt_L,opt_L ? "ticket" : "mutex");
    printf("M=%d -- queue item allocation is %s\n",
        opt_M,opt_M ? "pooled" : "malloc/free");

    tvzero = tvgetf();

    lock_init(&task_mutex);
    lock_init(&print_mutex);

    // select queue item allocation strategy
    switch (opt_M) {
    case 1:
        cellnew = cellnew_pool;
        cellfree = cellfree_pool;
        break;

    default:
        cellnew = cellnew_std;
        cellfree = cellfree_std;
        break;
    }

    queue = xalloc(1,sizeof(TQueue));
    startQueue(queue);

    Thread threads[opt_T];

    // get byte length of bit vectors
    bitvlen = BTVLEN(opt_Q + 1);

    // allocate per-thread log buffers
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        if (! opt_p)
            tsk->log = xalloc(opt_Q,sizeof(Log));
        tsk->bitv = xalloc(bitvlen,sizeof(byte));
    }

    // allocate "work to do"
    t = xalloc(opt_Q,sizeof(Task));

    // add to master queue
    for (int i = 0; i < opt_Q; i++) {
        t[i].number = i + 1;
        enqueue(queue, &t[i]);
    }

    // fire up the threads
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        tsk->xid = i + 1;
        tsk->queue = queue;
        pthread_create(&tsk->tid, NULL, work, tsk);
    }

    // wait for threads to complete
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        pthread_join(tsk->tid, NULL);
    }

    // wait for threads to complete
    for (int i = 0; i < opt_T; i++) {
        for (int j = i + 1; j < opt_T; j++)
            btvchk(&threads[i],&threads[j]);
    }

    printf("TOTAL: %.9f\n",tvgetf());

    free(t);

    return 0;
}

#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
int opt_p；//立即打印线程输出
int opt_T；//线程数
int opt_Q；//队列项目数
int opt_L；//使用票据锁
int opt_M；//使用快速单元alloc/free
typedef无符号字符字节；
typedef无符号整数u32；
#定义系统故障（_fmt…）\
做{\
fprintf（标准，fmt）\
出口（1）\
}而（0）
//锁定控制
typedef结构AnyLock{
pthread\u mutex\u t mutex；//标准mutex
volatile u32 seqreq；//票据锁定请求
volatile u32 seqacq；//票据锁授予
}任意锁；
//工作价值
类型定义结构任务{
联合{
结构任务*下一步；
整数；
};
}任务；
//队列项目
类型定义结构单元{
结构单元*下一步；
任务*t；
}细胞；
//队列控制
类型定义结构TQueue{
结构单元*头部；
结构单元*尾部；
}特奎厄；
//线程日志条目
类型定义结构日志{
双tvbef；
双tvaft；
整数；
}日志；
#定义BTVOFF（_off）\
（（_off）>>3）
#定义BTVMSK（_off）\
（1u>3
//线程控制
typedef结构线程{
pthread_t tid；
int-xid；
队列；
Log*Log；
字节*bitv；
}螺纹；
静态内联字节
btvset（字节*位，长关）
{
u32 msk；
字节椭圆形；
bitv+=BTVOFF（关闭）；
msk=BTVMSK（关闭）；
椭圆形=*比特电视和msk；
*bitv |=msk；
返回椭圆形；
}
任意锁任务\u互斥体；
任意锁打印互斥；
双tvzero；
Cell*cellpool；//可用的单元格池
长比夫伦；
#定义屏障\
__asm\uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu
//虚拟函数指针
单元格*（*新单元格）（无效）；
无效（*无单元格）（单元格*）；
无效（*lock_acquire）（AnyLock*lock）；
无效（*锁定和释放）（任意锁定*锁定）；
双重的
tvgetf（无效）
{
结构timespects；
双秒；
时钟获取时间（时钟实时，&ts）；
sec=ts.tv\u nsec；
sec/=1e9；
秒+=ts.tv_秒；
sec-=tvzero；
返回秒；
}
空虚*
xalloc（尺寸cnt，尺寸siz）
{
无效*ptr；
ptr=calloc（cnt，siz）；
如果（ptr==NULL）
sysfault（“xalloc:calloc失败--%s\n”，strerror（errno））；
返回ptr；
}
无效的
锁定等待票（任意锁定*锁定，u32新值）
{
u32 oldval；
//等我们的票上来
//注：原子
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <stdatomic.h>
#include <malloc.h>
#include <errno.h>
#include <string.h>
#include <time.h>

int opt_p;                              // print thread output immediately
int opt_T;                              // number of threads
int opt_Q;                              // number of queue items
int opt_L;                              // use ticket lock
int opt_M;                              // use fast cell alloc/free

typedef unsigned char byte;
typedef unsigned int u32;

// sysfault -- print a printf-style message to stderr and terminate the
// process with exit status 1
// NOTE: written with the standard variadic form (... / __VA_ARGS__) instead
// of the GNU-only named form; the first argument must be the format string
#define sysfault(...) \
    do { \
        fprintf(stderr,__VA_ARGS__); \
        exit(1); \
    } while (0)

// lock control -- carries the state for both lock flavours; lock_init()
// selects which one is used
// NOTE: the ticket counters are C11 atomic objects because they are passed to
// the <stdatomic.h> generic functions (atomic_compare_exchange_strong,
// atomic_fetch_add), which are specified for atomic types only
typedef struct AnyLock {
    pthread_mutex_t mutex;              // standard mutex
    atomic_uint seqreq;                 // ticket lock request
    atomic_uint seqacq;                 // ticket lock grant
} AnyLock;

// work value -- the payload handed to a worker thread
typedef struct Task {
    union {
        struct Task *next;              // not referenced by the code shown here
        int number;                     // task number (main assigns i + 1)
    };
} Task;

// queue item -- one node of the singly linked queue
typedef struct Cell {
    struct Cell *next;                  // next cell in the queue (or in the free pool)
    Task *t;                            // the queued task (stored by pointer)
} Cell;

// queue control
typedef struct TQueue {
    struct Cell *head;                  // oldest cell; NULL when the queue is empty
    struct Cell *tail;                  // newest cell; NULL when the queue is empty
} TQueue;

// thread log entry -- one record per dequeued task
typedef struct Log {
    double tvbef;                       // tvgetf() value taken just before dequeue
    double tvaft;                       // tvgetf() value taken just after dequeue
    int number;                         // number of the task that was dequeued
} Log;

// BTVOFF -- byte index within a bit vector of bit number _off
#define BTVOFF(_off) \
    ((_off) >> 3)
// BTVMSK -- mask selecting bit number _off within its byte
#define BTVMSK(_off) \
    (1u << ((_off) & 0x07))

// BTVLEN -- number of bytes needed to hold _len bits (rounded up)
// NOTE: the whole expansion is parenthesized so that the macro can be used
// safely inside a larger expression
#define BTVLEN(_len) \
    (((_len) + 7) >> 3)

// thread control -- per-worker state; a pointer to it is the thread argument
typedef struct Thread {
    pthread_t tid;                      // thread id filled in by pthread_create
    int xid;                            // thread number used in the output (i + 1)
    TQueue *queue;                      // queue this worker pulls from
    Log *log;                           // log buffer (main allocates it only when opt_p is off)
    byte *bitv;                         // bit vector of the task numbers this worker dequeued
} Thread;

// btvset -- set bit number off in the bit vector
// returns the previous state of that bit: non-zero if it was already set
static inline byte
btvset(byte *bitv,long off)
{
    byte *slot = bitv + BTVOFF(off);
    u32 bit = BTVMSK(off);
    byte was = *slot & bit;

    *slot |= bit;

    return was;
}

AnyLock task_mutex;                     // held by enqueue/dequeue around queue updates
AnyLock print_mutex;                    // held by work while it prints its deferred log
double tvzero;                          // value subtracted from every tvgetf() result
Cell *cellpool;                         // free pool of cells
long bitvlen;                           // byte length of each per-thread bit vector

// BARRIER -- empty asm statement with a "memory" clobber (compiler barrier)
#define BARRIER \
    __asm__ __volatile__("" ::: "memory")

// virtual function pointers
// cellnew/cellfree are selected in main from opt_M; lock_acquire/lock_release
// are selected in lock_init from opt_L
Cell *(*cellnew)(void);
void (*cellfree)(Cell *);
void (*lock_acquire)(AnyLock *lock);
void (*lock_release)(AnyLock *lock);

double
tvgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_REALTIME,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    sec -= tvzero;

    return sec;
}

// xalloc -- allocate a zeroed array of cnt elements of siz bytes each
// does not return on failure: reports the errno text through sysfault
void *
xalloc(size_t cnt,size_t siz)
{
    void *mem = calloc(cnt,siz);

    if (! mem)
        sysfault("xalloc: calloc failure -- %s\n",strerror(errno));

    return mem;
}

// lock_wait_ticket -- spin until the grant counter equals our ticket newval
// NOTE: the counter is read directly on every pass; atomic_load is [probably]
// overkill here
void
lock_wait_ticket(AnyLock *lock,u32 newval)
{

    while (lock->seqacq != newval)
        ;
}

// lock_acquire_ticket -- take the next ticket, then wait until it is granted
void
lock_acquire_ticket(AnyLock *lock)
{
    u32 ticket;

    // atomically bump the request counter; our ticket is the value the
    // counter holds after the increment
    ticket = atomic_fetch_add(&lock->seqreq,1) + 1;

    lock_wait_ticket(lock,ticket);
}

// lock_release_ticket -- grant the lock to the holder of the next ticket
// NOTE: atomic_fetch_add is [probably] overkill, but leave it for now
void
lock_release_ticket(AnyLock *lock)
{

    (void) atomic_fetch_add(&lock->seqacq,1);
}

// lock_acquire_mutex -- acquire the lock through its pthread mutex
// the result of pthread_mutex_lock is not examined
void
lock_acquire_mutex(AnyLock *lock)
{
    pthread_mutex_t *mtx = &lock->mutex;

    pthread_mutex_lock(mtx);
}

// lock_release_mutex -- release the lock through its pthread mutex
// the result of pthread_mutex_unlock is not examined
void
lock_release_mutex(AnyLock *lock)
{
    pthread_mutex_t *mtx = &lock->mutex;

    pthread_mutex_unlock(mtx);
}

// lock_init -- initialize one lock and select the global acquire/release
// functions according to opt_L (1 = ticket lock, anything else = mutex)
void
lock_init(AnyLock *lock)
{

    if (opt_L == 1) {
        // lock_acquire_ticket hands out "request + 1", so the first ticket
        // is 1 and the grant counter starts there
        lock->seqreq = 0;
        lock->seqacq = 1;
        lock_acquire = lock_acquire_ticket;
        lock_release = lock_release_ticket;
        return;
    }

    pthread_mutex_init(&lock->mutex,NULL);
    lock_acquire = lock_acquire_mutex;
    lock_release = lock_release_mutex;
}

// startQueue -- reset a queue to the empty state (no head, no tail)
void
startQueue(TQueue *queue)
{

    queue->head = queue->tail = NULL;
}

// empty -- return 1 when the queue has no cells (head is NULL), else 0
int
empty(TQueue *queue)
{

    return ! queue->head;
}

// cellnew_pool -- allocate a queue entry
Cell *
cellnew_pool(void)
{
    int cnt;
    Cell *p;
    Cell *pool;

    while (1) {
        // try for quick allocation
        p = cellpool;

        // bug out if we got it
        if (p != NULL) {
            cellpool = p->next;
            break;
        }

        // go to the heap to replenish the pool
        cnt = 1000;
        p = xalloc(cnt,sizeof(Cell));

        // link up the entries
        pool = NULL;
        for (;  cnt > 0;  --cnt, ++p) {
            p->next = pool;
            pool = p;
        }

        // put this "online"
        cellpool = pool;
    }

    return p;
}

// cellfree_pool -- give a queue entry back by pushing it onto the free pool
// touches the shared cellpool without locking of its own
void
cellfree_pool(Cell *p)
{
    Cell *oldtop = cellpool;

    cellpool = p;
    p->next = oldtop;
}

// cellnew_std -- obtain one zeroed queue entry straight from the heap
Cell *
cellnew_std(void)
{

    return xalloc(1,sizeof(Cell));
}

// cellfree_std -- hand a queue entry back to the heap
void
cellfree_std(Cell *p)
{
    void *mem = p;

    free(mem);
}

// enqueue -- append task t to the tail of the queue while holding task_mutex
// the cell comes from whichever allocator cellnew points at
void
enqueue(TQueue *queue, Task *t)
{
    Cell *cell;

    lock_acquire(&task_mutex);

    cell = cellnew();
    cell->t = t;
    cell->next = NULL;

    // link behind the current tail, or become the head of an empty queue
    if (queue->tail != NULL)
        queue->tail->next = cell;
    else
        queue->head = cell;
    queue->tail = cell;

    lock_release(&task_mutex);
}

// dequeue -- detach the cell at the head of the queue and return its task
// returns NULL when the queue is empty
// runs with task_mutex held; the cell goes back through cellfree
Task *
dequeue(TQueue *queue)
{
    Task *task = NULL;
    Cell *front;

    lock_acquire(&task_mutex);

    front = queue->head;
    if (front != NULL) {
        // removing the only cell leaves the queue without a tail as well
        if (queue->tail == front)
            queue->tail = NULL;

        queue->head = front->next;
        task = front->t;

        cellfree(front);
    }

    lock_release(&task_mutex);

    return task;
}

// work -- thread entry point: pull tasks until the queue is empty
// arg is this worker's Thread control block
// every dequeue is timed; the result is either printed at once (opt_p) or
// stored in the thread's log and printed after the loop under print_mutex
// returns a null pointer
void *
work(void *arg)
{
    Thread *tskcur = arg;
    TQueue *queue = tskcur->queue;
    Task *t;
    Log *log;
    long cnt;
    int tprev;
    byte *bitv;
    double tvbeg;
    double tvbef;
    double tvaft;

    log = tskcur->log;
    bitv = tskcur->bitv;
    tvbeg = tvgetf();

    // tprev is the previously printed task number (used to show the gap)
    tprev = 0;
    while (1) {
        // the timestamps bracket the dequeue call only
        tvbef = tvgetf();
        t = dequeue(queue);
        tvaft = tvgetf();

        if (t == NULL)
            break;

        // abort if we get a double entry
        if (btvset(bitv,t->number))
            sysfault("work: duplicate\n");

        // immediate mode: print now and skip the log entirely
        if (opt_p) {
            printf("[%.9f/%.9f %5.5d] %d [%d]\n",
                tvbef,tvaft - tvbef,tskcur->xid,t->number,t->number - tprev);
            tprev = t->number;
            continue;
        }

        // deferred mode: record the entry and advance to the next log slot
        log->tvbef = tvbef;
        log->tvaft = tvaft;
        log->number = t->number;
        ++log;
    }

    if (! opt_p) {
        tvaft = tvgetf();

        // number of entries written, then rewind to the start of the log
        cnt = log - tskcur->log;
        log = tskcur->log;

        // keep this thread's whole report together in the output
        lock_acquire(&print_mutex);

        printf("\n");
        printf("THREAD=%5.5d START=%.9f STOP=%.9f ELAP=%.9f TOTAL=%ld\n",
            tskcur->xid,tvbeg,tvaft,tvaft - tvbeg,cnt);

        tprev = 0;
        for (;  cnt > 0;  --cnt, ++log) {
            printf("[%.9f/%.9f %5.5d] %d [%d]\n",
                log->tvbef,log->tvaft - log->tvbef,tskcur->xid,
                log->number,log->number - tprev);
            tprev = log->number;
        }

        lock_release(&print_mutex);
    }

    return (void *) 0;
}

// btvchk -- cross-check the bit vectors of two threads
// prints the pair being compared and aborts through sysfault if any task
// number is marked in both vectors
void
btvchk(Thread *tska,Thread *tskb)
{
    byte *lhs = tska->bitv;
    byte *rhs = tskb->bitv;

    printf("btvchk: %d ??? %d\n",tska->xid,tskb->xid);

    // a common bit means both threads dequeued the same task
    for (int pos = 0;  pos < bitvlen;  ++pos) {
        if (lhs[pos] & rhs[pos])
            sysfault("btvchk: duplicate\n");
    }
}

// For a simple test i runned this on main:

int
main(int argc,char **argv)
{
    char *cp;
    TQueue *queue;
    Task *t;
    Thread *tsk;

    --argc;
    ++argv;

    for (;  argc > 0;  --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;

        switch (cp[1]) {
        case 'p':  // print immediately
            opt_p = 1;
            break;

        case 'Q':  // number of queue items
            opt_Q = atoi(cp + 2);
            break;

        case 'T':  // number of threads
            opt_T = atoi(cp + 2);
            break;

        case 'L':
            opt_L = 1;
            break;

        case 'M':
            opt_M = 1;
            break;

        default:
            break;
        }
    }

    printf("p=%d -- thread log is %s\n",opt_p,opt_p ? "immediate" : "deferred");

    if (opt_T == 0)
        opt_T = 16;
    printf("T=%d (number of threads)\n",opt_T);

    if (opt_Q == 0)
        opt_Q = 1000000;
    printf("Q=%d (number of items to enqueue)\n",opt_Q);

    printf("L=%d -- lock is %s\n",opt_L,opt_L ? "ticket" : "mutex");
    printf("M=%d -- queue item allocation is %s\n",
        opt_M,opt_M ? "pooled" : "malloc/free");

    tvzero = tvgetf();

    lock_init(&task_mutex);
    lock_init(&print_mutex);

    // select queue item allocation strategy
    switch (opt_M) {
    case 1:
        cellnew = cellnew_pool;
        cellfree = cellfree_pool;
        break;

    default:
        cellnew = cellnew_std;
        cellfree = cellfree_std;
        break;
    }

    queue = xalloc(1,sizeof(TQueue));
    startQueue(queue);

    Thread threads[opt_T];

    // get byte length of bit vectors
    bitvlen = BTVLEN(opt_Q + 1);

    // allocate per-thread log buffers
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        if (! opt_p)
            tsk->log = xalloc(opt_Q,sizeof(Log));
        tsk->bitv = xalloc(bitvlen,sizeof(byte));
    }

    // allocate "work to do"
    t = xalloc(opt_Q,sizeof(Task));

    // add to master queue
    for (int i = 0; i < opt_Q; i++) {
        t[i].number = i + 1;
        enqueue(queue, &t[i]);
    }

    // fire up the threads
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        tsk->xid = i + 1;
        tsk->queue = queue;
        pthread_create(&tsk->tid, NULL, work, tsk);
    }

    // wait for threads to complete
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        pthread_join(tsk->tid, NULL);
    }

    // wait for threads to complete
    for (int i = 0; i < opt_T; i++) {
        for (int j = i + 1; j < opt_T; j++)
            btvchk(&threads[i],&threads[j]);
    }

    printf("TOTAL: %.9f\n",tvgetf());

    free(t);

    return 0;
}