c multithreading thread-safety pthreads threadpool

C Pthreads - issues with thread-safe queue implementation

I'm new to multithreading and im trying to implement a simple thread safe queue of tasks where each thread can pull work from until there's no more tasks left. No queuing of tasks will be made by any of the threads.

For testing purposeses every Task holds just a number.

    static pthread_mutex_t task_mutex = PTHREAD_MUTEX_INITIALIZER;

    typedef struct Task{
       int number;
    }Task;


    typedef struct Cell{
        Task t;
        struct Cell* next;
    }Cell;


    typedef struct TQueue{
        struct Cell* head;
        struct Cell* tail;
    }TQueue;



   int empty(TQueue *Queue) 
      return queue->head == queue->tail;


   void startQueue(TQueue *queue){

        queue->head = malloc(sizeof(Cell));
        queue->tail = queue->head;
   }

   void enqueue(TQueue *queue, Task C){

       queue->tail->next = malloc(sizeof(Cell));
       queue->tail = queue->tail->next;
       queue->tail->t = C;
       queue->tail->next = NULL; 
   }


    Task * dequeue(TQueue* queue){

       pthread_mutex_lock( &task_mutex);
       Task * t;

       if(empty(queue)) t = NULL;

       else{

           struct Cell* p = queue->head;
           queue->head = queue->head->next;
           t = &queue->head->t;
           free(p);
       }

       pthread_mutex_unlock( &task_mutex);
       return t;
    }

    void * work( void* arg){

       TQueue* queue = (TQueue *)arg;
       Task* t = malloc(sizeof(Task));

       for(t = dequeue(queue); t != NULL; t = dequeue(queue))
           printf("%d ", t->number);

       free(t);
       pthread_exit(NULL);
       return 0;
    }

For a simple test i runned this on main:

int main(){

    TQueue* queue = malloc(sizeof(TQueue));
    startQueue(queue);

    pthread_t threads[3];
    Task t[3];


    for(int i = 0; i < 3; i++){
        t[i].number = i + 1;
        enqueue(queue, t[i]);
    }

    for(int i = 0; i < 3; i++) pthread_create(&threads[i], NULL, work, (void*)queue);

    for(int i = 0; i < 3; i++) pthread_join(threads[i], NULL);

    return 0;
}

The expected output was 1 2 3 in any order, but sometimes it prints a sequence with a strange number in it like 1823219 2 3. I have not been able to detect any race conditions or related problems, so i appreciate any help.

Solution

I found a few more bugs.

I've annotated your code. I took a bit from your first posting and your second. I've fixed the code, showing before and after [please pardon the gratuitous style cleanup]:

#include <stdio.h>
#include <pthread.h>
#include <malloc.h>

static pthread_mutex_t task_mutex = PTHREAD_MUTEX_INITIALIZER;

typedef struct Task {
    int number;
} Task;

typedef struct Cell {
// NOTE/BUG: this should be a pointer to the task. otherwise, dequeue gets
// messy
#if 0
    Task t;
#else
    Task *t;
#endif
    struct Cell *next;
} Cell;

typedef struct TQueue {
    struct Cell *head;
    struct Cell *tail;
} TQueue;

void
startQueue(TQueue *queue)
{

#if 0
    queue->head = malloc(sizeof(Cell));
#else
    queue->head = NULL;
#endif
    queue->tail = NULL;
}

int
empty(TQueue *queue)
{

    // NOTE/BUG: dequeue never touches tail, so this test is incorrect
#if 0
    return (queue->head == queue->tail);
#else
    return (queue->head == NULL);
#endif
}

void
enqueue(TQueue *queue, Task *t)
{
    Cell *p;

    pthread_mutex_lock(&task_mutex);

    p = malloc(sizeof(Cell));
    p->next = NULL;
    p->t = t;

    if (queue->tail == NULL) {
        queue->tail = p;
        queue->head = p;
    }
    else {
        queue->tail->next = p;
        queue->tail = p;
    }

    pthread_mutex_unlock(&task_mutex);
}

Task *
dequeue(TQueue *queue)
{
    Task *t;

    pthread_mutex_lock(&task_mutex);

    if (empty(queue))
        t = NULL;

    else {
        Cell *p = queue->head;

        if (p == queue->tail)
            queue->tail = NULL;

        queue->head = p->next;

        // NOTE/BUG: this is setting t to the second element in the list,
        // not the first
        // NOTE/BUG: this is also undefined behavior, in original code (with
        // original struct definition), because what t points to _does_ get
        // freed before return
#if 0
        t = &queue->head->t;
#else
        t = p->t;
#endif

        free(p);
    }

    pthread_mutex_unlock(&task_mutex);

    return t;
}

void *
work(void *arg)
{

    TQueue *queue = (TQueue *) arg;

    // NOTE/BUG: this gets orphaned on the first call to dequeue
#if 0
    Task *t = malloc(sizeof(Task));
#else
    Task *t;
#endif

    for (t = dequeue(queue); t != NULL; t = dequeue(queue))
        printf("%d ", t->number);

    // NOTE/BUG: this frees some cell allocated in main -- not what we want
#if 0
    free(t);
#endif

    pthread_exit(NULL);
    return 0;
}

// For a simple test i runned this on main:

int
main()
{

    TQueue *queue = malloc(sizeof(TQueue));

    startQueue(queue);

    pthread_t threads[3];
    Task t[3];

    for (int i = 0; i < 3; i++) {
        t[i].number = i + 1;
#if 0
        enqueue(queue, t);
#else
        enqueue(queue, &t[i]);
#endif
    }

    for (int i = 0; i < 3; i++)
        pthread_create(&threads[i], NULL, work, (void *) queue);

    for (int i = 0; i < 3; i++)
        pthread_join(threads[i], NULL);

    return 0;
}

UPDATE:

Are the threads executing the tasks concurrently ? I've been testing the cpu usage with htop and i can only max the usage of a single core out of four.

A few things to keep in mind. htop probably won't show much on programs that have such a short running time. Even with 10,000 queue entries this program executes in 20ms.

It's better to have the program itself print the information [see below]. Note that printf does thread locking on stdin so it may contribute to the "serial" nature of the program. It also contributes a significant amount to the execution time of the program (i.e. the printf is much slower than the dequeue)

Also, one thread (i.e. the first one) could monopolize the queue and drain all entries before the others have a chance to run.

The OS may [is at liberty to] schedule all threads on a single core. It may then "migrate" them later (e.g. within a second or so).

I've enhanced the program to include some timing information in the output print that may help show more of what you'd like to see. Also, I've added command line options to control the number of threads and number of items queued. This is similar to what I do for some of my own programs. Divert the program output to a log file and examine it. Play around with the options on multiple runs

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <malloc.h>
#include <time.h>

int opt_n;                              // suppress thread output
int opt_T;                              // number of threads
int opt_Q;                              // number of queue items

static pthread_mutex_t task_mutex = PTHREAD_MUTEX_INITIALIZER;

double tvzero;

typedef struct Task {
    int number;
} Task;

typedef struct Cell {
    Task *t;
    struct Cell *next;
} Cell;

typedef struct TQueue {
    struct Cell *head;
    struct Cell *tail;
} TQueue;

typedef struct Thread {
    pthread_t tid;
    int xid;
    TQueue *queue;
} Thread;

double
tvgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_REALTIME,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    sec -= tvzero;

    return sec;
}

void
startQueue(TQueue *queue)
{

    queue->head = NULL;
    queue->tail = NULL;
}

int
empty(TQueue *queue)
{

    return (queue->head == NULL);
}

void
enqueue(TQueue *queue, Task *t)
{
    Cell *p;

    pthread_mutex_lock(&task_mutex);

    p = malloc(sizeof(Cell));
    p->next = NULL;
    p->t = t;

    if (queue->tail == NULL) {
        queue->tail = p;
        queue->head = p;
    }
    else {
        queue->tail->next = p;
        queue->tail = p;
    }

    pthread_mutex_unlock(&task_mutex);
}

Task *
dequeue(TQueue *queue)
{
    Task *t;

    pthread_mutex_lock(&task_mutex);

    if (empty(queue))
        t = NULL;

    else {
        Cell *p = queue->head;

        if (p == queue->tail)
            queue->tail = NULL;

        queue->head = p->next;

        t = p->t;

        free(p);
    }

    pthread_mutex_unlock(&task_mutex);

    return t;
}

void *
work(void *arg)
{
    Thread *tskcur = arg;
    TQueue *queue = tskcur->queue;
    Task *t;
    double tvbef;
    double tvaft;

    while (1) {
        tvbef = tvgetf();
        t = dequeue(queue);
        tvaft = tvgetf();

        if (t == NULL)
            break;

        if (! opt_n)
            printf("[%.9f/%.9f %5.5d] %d\n",
                tvbef,tvaft - tvbef,tskcur->xid,t->number);
    }

    return (void *) 0;
}

// For a simple test i runned this on main:

int
main(int argc,char **argv)
{
    char *cp;
    TQueue *queue;
    Task *t;
    Thread *tsk;

    --argc;
    ++argv;

    for (;  argc > 0;  --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;

        switch (cp[1]) {
        case 'n':  // suppress thread output
            opt_n = 1;
            break;

        case 'Q':  // number of queue items
            opt_Q = atoi(cp + 2);
            break;

        case 'T':  // number of threads
            opt_T = atoi(cp + 2);
            break;

        default:
            break;
        }
    }

    tvzero = tvgetf();

    queue = malloc(sizeof(TQueue));
    startQueue(queue);

    if (opt_T == 0)
        opt_T = 16;
    Thread threads[opt_T];

    if (opt_Q == 0)
        opt_Q = 10000;
    t = malloc(sizeof(Task) * opt_Q);

    for (int i = 0; i < opt_Q; i++) {
        t[i].number = i + 1;
        enqueue(queue, &t[i]);
    }

    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        tsk->xid = i + 1;
        tsk->queue = queue;
        pthread_create(&tsk->tid, NULL, work, tsk);
    }

    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        pthread_join(tsk->tid, NULL);
    }

    printf("TOTAL: %.9f\n",tvgetf());

    free(t);

    return 0;
}

UPDATE #2:

Also, one thread (i.e. the first one) could monopolize the queue and drain all entries before the others have a chance to run." What can be done in that case ?

A few things.

pthread_create takes a bit of time, allowing thread 1 to go while the others are still being created. A way to ameliorate this is to create all threads, each thread sets an "I am running" flag (in its thread control block). The main thread waits for all threads to set this flag. Then, the main thread sets a global volatile "you_may_now_all_run" flag that each thread spins on before entering its primary thread loop. In my experience, they all start running within microseconds of each other [or better].

I didn't implement this in the updated code below, so you can experiment with it yourself [along with the nanosleep].

mutexes are pretty fair overall [under linux, at least] because a blocked thread will get queued, waiting on the mutex. As I mentioned in the comments, a nanosleep can also be used, but this [somewhat] defeats the purpose as the threads will slow down.

The antidote to thread starvation is "fairness". As I mentioned, there is an elaborate algorithm for fairness without waiting. It is the Kogan/Petrank algorithm: http://www.cs.technion.ac.il/~erez/Papers/wf-methodology-ppopp12.pdf This is really a bit involved/advanced, so caveat emptor ...

However, a compromise may be a ticket lock: https://en.wikipedia.org/wiki/Ticket_lock

I've reworked the program again. It has options for pooled allocation, ticket vs. mutex lock, and deferred printing of log entries. It also cross-checks the results between threads to ensure none of them got duplicate entries.

Of course, the key to all this is accurate, high precision logging (i.e. if you can't measure it, you can't tune it).

For example, one would think that doing free inside dequeue would be slower than simply releasing the Cell to a resuable pool (similar to a slab allocator), but, the performance boost wasn't as great as expected. This could be that glibc's malloc/free is just blazing fast [which is what they claim].

These various versions should give you some ideas of how to build up your own performance measurement suite.

Anyway, here's the code:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <stdatomic.h>
#include <malloc.h>
#include <errno.h>
#include <string.h>
#include <time.h>

int opt_p;                              // print thread output immediately
int opt_T;                              // number of threads
int opt_Q;                              // number of queue items
int opt_L;                              // use ticket lock
int opt_M;                              // use fast cell alloc/free

typedef unsigned char byte;
typedef unsigned int u32;

#define sysfault(_fmt...) \
    do { \
        fprintf(stderr,_fmt); \
        exit(1); \
    } while (0)

// lock control
typedef struct AnyLock {
    pthread_mutex_t mutex;              // standard mutex
    volatile u32 seqreq;                // ticket lock request
    volatile u32 seqacq;                // ticket lock grant
} AnyLock;

// work value
typedef struct Task {
    union {
        struct Task *next;
        int number;
    };
} Task;

// queue item
typedef struct Cell {
    struct Cell *next;
    Task *t;
} Cell;

// queue control
typedef struct TQueue {
    struct Cell *head;
    struct Cell *tail;
} TQueue;

// thread log entry
typedef struct Log {
    double tvbef;
    double tvaft;
    int number;
} Log;

#define BTVOFF(_off) \
    ((_off) >> 3)
#define BTVMSK(_off) \
    (1u << ((_off) & 0x07))

#define BTVLEN(_len) \
    ((_len) + 7) >> 3

// thread control
typedef struct Thread {
    pthread_t tid;
    int xid;
    TQueue *queue;
    Log *log;
    byte *bitv;
} Thread;

static inline byte
btvset(byte *bitv,long off)
{
    u32 msk;
    byte oval;

    bitv += BTVOFF(off);
    msk = BTVMSK(off);

    oval = *bitv & msk;

    *bitv |= msk;

    return oval;
}

AnyLock task_mutex;
AnyLock print_mutex;
double tvzero;
Cell *cellpool;                         // free pool of cells
long bitvlen;

#define BARRIER \
    __asm__ __volatile__("" ::: "memory")

// virtual function pointers
Cell *(*cellnew)(void);
void (*cellfree)(Cell *);
void (*lock_acquire)(AnyLock *lock);
void (*lock_release)(AnyLock *lock);

double
tvgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_REALTIME,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    sec -= tvzero;

    return sec;
}

void *
xalloc(size_t cnt,size_t siz)
{
    void *ptr;

    ptr = calloc(cnt,siz);
    if (ptr == NULL)
        sysfault("xalloc: calloc failure -- %s\n",strerror(errno));

    return ptr;
}

void
lock_wait_ticket(AnyLock *lock,u32 newval)
{
    u32 oldval;

    // wait for our ticket to come up
    // NOTE: atomic_load is [probably] overkill here
    while (1) {
#if 0
        oldval = atomic_load(&lock->seqacq);
#else
        oldval = lock->seqacq;
#endif
        if (oldval == newval)
            break;
    }
}

void
lock_acquire_ticket(AnyLock *lock)
{
    u32 oldval;
    u32 newval;
    int ok;

    // acquire our ticket value
    // NOTE: just use a garbage value for oldval -- the exchange will
    // update it with the correct/latest value -- this saves a separate
    // refetch within the loop
    oldval = 0;
    while (1) {
#if 0
        BARRIER;
        oldval = lock->seqreq;
#endif
        newval = oldval + 1;
        ok = atomic_compare_exchange_strong(&lock->seqreq,&oldval,newval);
        if (ok)
            break;
    }

    lock_wait_ticket(lock,newval);
}

void
lock_release_ticket(AnyLock *lock)
{

    // NOTE: atomic_fetch_add is [probably] overkill, but leave it for now
#if 1
    atomic_fetch_add(&lock->seqacq,1);
#else
    lock->seqacq += 1;
#endif
}

void
lock_acquire_mutex(AnyLock *lock)
{

    pthread_mutex_lock(&lock->mutex);
}

void
lock_release_mutex(AnyLock *lock)
{

    pthread_mutex_unlock(&lock->mutex);
}

void
lock_init(AnyLock *lock)
{

    switch (opt_L) {
    case 1:
        lock->seqreq = 0;
        lock->seqacq = 1;
        lock_acquire = lock_acquire_ticket;
        lock_release = lock_release_ticket;
        break;

    default:
        pthread_mutex_init(&lock->mutex,NULL);
        lock_acquire = lock_acquire_mutex;
        lock_release = lock_release_mutex;
        break;
    }
}

void
startQueue(TQueue *queue)
{

    queue->head = NULL;
    queue->tail = NULL;
}

int
empty(TQueue *queue)
{

    return (queue->head == NULL);
}

// cellnew_pool -- allocate a queue entry
Cell *
cellnew_pool(void)
{
    int cnt;
    Cell *p;
    Cell *pool;

    while (1) {
        // try for quick allocation
        p = cellpool;

        // bug out if we got it
        if (p != NULL) {
            cellpool = p->next;
            break;
        }

        // go to the heap to replenish the pool
        cnt = 1000;
        p = xalloc(cnt,sizeof(Cell));

        // link up the entries
        pool = NULL;
        for (;  cnt > 0;  --cnt, ++p) {
            p->next = pool;
            pool = p;
        }

        // put this "online"
        cellpool = pool;
    }

    return p;
}

// cellfree_pool -- release a queue entry
void
cellfree_pool(Cell *p)
{

    p->next = cellpool;
    cellpool = p;
}

// cellnew_std -- allocate a queue entry
Cell *
cellnew_std(void)
{
    Cell *p;

    p = xalloc(1,sizeof(Cell));

    return p;
}

// cellfree_std -- release a queue entry
void
cellfree_std(Cell *p)
{

    free(p);
}

void
enqueue(TQueue *queue, Task *t)
{
    Cell *p;

    lock_acquire(&task_mutex);

    p = cellnew();
    p->next = NULL;
    p->t = t;

    if (queue->tail == NULL) {
        queue->tail = p;
        queue->head = p;
    }
    else {
        queue->tail->next = p;
        queue->tail = p;
    }

    lock_release(&task_mutex);
}

Task *
dequeue(TQueue *queue)
{
    Task *t;

    lock_acquire(&task_mutex);

    if (empty(queue))
        t = NULL;

    else {
        Cell *p = queue->head;

        if (p == queue->tail)
            queue->tail = NULL;

        queue->head = p->next;

        t = p->t;

        cellfree(p);
    }

    lock_release(&task_mutex);

    return t;
}

void *
work(void *arg)
{
    Thread *tskcur = arg;
    TQueue *queue = tskcur->queue;
    Task *t;
    Log *log;
    long cnt;
    int tprev;
    byte *bitv;
    double tvbeg;
    double tvbef;
    double tvaft;

    log = tskcur->log;
    bitv = tskcur->bitv;
    tvbeg = tvgetf();

    tprev = 0;
    while (1) {
        tvbef = tvgetf();
        t = dequeue(queue);
        tvaft = tvgetf();

        if (t == NULL)
            break;

        // abort if we get a double entry
        if (btvset(bitv,t->number))
            sysfault("work: duplicate\n");

        if (opt_p) {
            printf("[%.9f/%.9f %5.5d] %d [%d]\n",
                tvbef,tvaft - tvbef,tskcur->xid,t->number,t->number - tprev);
            tprev = t->number;
            continue;
        }

        log->tvbef = tvbef;
        log->tvaft = tvaft;
        log->number = t->number;
        ++log;
    }

    if (! opt_p) {
        tvaft = tvgetf();

        cnt = log - tskcur->log;
        log = tskcur->log;

        lock_acquire(&print_mutex);

        printf("\n");
        printf("THREAD=%5.5d START=%.9f STOP=%.9f ELAP=%.9f TOTAL=%ld\n",
            tskcur->xid,tvbeg,tvaft,tvaft - tvbeg,cnt);

        tprev = 0;
        for (;  cnt > 0;  --cnt, ++log) {
            printf("[%.9f/%.9f %5.5d] %d [%d]\n",
                log->tvbef,log->tvaft - log->tvbef,tskcur->xid,
                log->number,log->number - tprev);
            tprev = log->number;
        }

        lock_release(&print_mutex);
    }

    return (void *) 0;
}

void
btvchk(Thread *tska,Thread *tskb)
{
    byte *btva;
    byte *btvb;
    byte aval;
    byte bval;
    int idx;

    printf("btvchk: %d ??? %d\n",tska->xid,tskb->xid);

    btva = tska->bitv;
    btvb = tskb->bitv;

    // abort if we get overlapping entries between two threads
    for (idx = 0;  idx < bitvlen;  ++idx) {
        aval = btva[idx];
        bval = btvb[idx];
        if (aval & bval)
            sysfault("btvchk: duplicate\n");
    }
}

// For a simple test i runned this on main:

int
main(int argc,char **argv)
{
    char *cp;
    TQueue *queue;
    Task *t;
    Thread *tsk;

    --argc;
    ++argv;

    for (;  argc > 0;  --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;

        switch (cp[1]) {
        case 'p':  // print immediately
            opt_p = 1;
            break;

        case 'Q':  // number of queue items
            opt_Q = atoi(cp + 2);
            break;

        case 'T':  // number of threads
            opt_T = atoi(cp + 2);
            break;

        case 'L':
            opt_L = 1;
            break;

        case 'M':
            opt_M = 1;
            break;

        default:
            break;
        }
    }

    printf("p=%d -- thread log is %s\n",opt_p,opt_p ? "immediate" : "deferred");

    if (opt_T == 0)
        opt_T = 16;
    printf("T=%d (number of threads)\n",opt_T);

    if (opt_Q == 0)
        opt_Q = 1000000;
    printf("Q=%d (number of items to enqueue)\n",opt_Q);

    printf("L=%d -- lock is %s\n",opt_L,opt_L ? "ticket" : "mutex");
    printf("M=%d -- queue item allocation is %s\n",
        opt_M,opt_M ? "pooled" : "malloc/free");

    tvzero = tvgetf();

    lock_init(&task_mutex);
    lock_init(&print_mutex);

    // select queue item allocation strategy
    switch (opt_M) {
    case 1:
        cellnew = cellnew_pool;
        cellfree = cellfree_pool;
        break;

    default:
        cellnew = cellnew_std;
        cellfree = cellfree_std;
        break;
    }

    queue = xalloc(1,sizeof(TQueue));
    startQueue(queue);

    Thread threads[opt_T];

    // get byte length of bit vectors
    bitvlen = BTVLEN(opt_Q + 1);

    // allocate per-thread log buffers
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        if (! opt_p)
            tsk->log = xalloc(opt_Q,sizeof(Log));
        tsk->bitv = xalloc(bitvlen,sizeof(byte));
    }

    // allocate "work to do"
    t = xalloc(opt_Q,sizeof(Task));

    // add to master queue
    for (int i = 0; i < opt_Q; i++) {
        t[i].number = i + 1;
        enqueue(queue, &t[i]);
    }

    // fire up the threads
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        tsk->xid = i + 1;
        tsk->queue = queue;
        pthread_create(&tsk->tid, NULL, work, tsk);
    }

    // wait for threads to complete
    for (int i = 0; i < opt_T; i++) {
        tsk = &threads[i];
        pthread_join(tsk->tid, NULL);
    }

    // wait for threads to complete
    for (int i = 0; i < opt_T; i++) {
        for (int j = i + 1; j < opt_T; j++)
            btvchk(&threads[i],&threads[j]);
    }

    printf("TOTAL: %.9f\n",tvgetf());

    free(t);

    return 0;
}