Search code examples
cmultithreadingmultiprocessingriscvrocket-chip

Issue with Threads in embedded system


I have 4 Rocket RISC-V cores and each are connected to 4 identical accelerators. What I am trying to do is to use multithreading to run all or some of the accelerators at the same time and compare the performance results among them. I need to measure the time between when the first started thread begins its job until the last started thread finishes its task using csr counters. The task for all threads is the same and they are not dependent on others.

I am struggling measuring the correct time, however, and see that threads start one after other instead of being concurrent (I don't expect them to start at exactly the same time but I need to see some overlap).

What I see for example for two threads on one CPUs for 10 tests:

Thread 0 started at 12612628151 and ended at 12621280072
Thread 1 started at 16418324070 and ended at 16426815938

or two threads on two CPUs:

Thread 0 started at 12599644918 and ended at 12608560166
Thread 1 started at 15271446547 and ended at 15280014314
// expecting Thread 1 starts at 12599900000 for example

How can I measure the time properly? Here is my code:

#define _GNU_SOURCE

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <math.h>
#include <limits.h>
#include <stdbool.h>
#include <time.h>


#include "encoding.h"
#include "rocc.h"
#include "data/tests.h"

//----------------------------------------------------------------\\
//------------------  UTILITY-------------------------------------\\
//----------------------------------------------------------------\\

uint64_t freq = 25000000; // 25 MHz
volatile uint64_t total_tasks = 0;
volatile double total_time = 0.0;
uint64_t total_clk_cycles = 0;
uint64_t total_insts = 0;
double inst_per_cycles = 0.0;


void print_measurments(double total_time) {
    double tasks_per_second = total_tasks / total_time;
    printf("\nTotal number of instructions: %lu \n", total_insts);
    printf("Total number of cycles: %lu \n", total_clk_cycles);
    printf("Total number of tasks finished: %lu \n", total_tasks);
    printf("Tasks per second with gettime(): %.5f\n", tasks_per_second);
    printf("Tasks per second with csr_cycles(): %f\n", (double)total_tasks / ((double)total_clk_cycles / (double)freq));
}


uint64_t concatenate_ints(unsigned int x, unsigned int y) {
    return ((uint64_t)x << 32) | y;
}

uint64_t concatenate_3ints(uint64_t x, uint64_t y, uint64_t z) {
    return ((uint64_t)x << 48) | (((uint64_t)y << 32) | z);
}

uint64_t concatenate_arrays(int64_t* x, int64_t* y) {
    return ((uint64_t)x << 32) | (uint64_t)y;
}

//################################################################\\

//----------------------------------------------------------------\\
//------------------  THREAD -------------------------------------\\
//----------------------------------------------------------------\\

#define handle_error_en(en, msg) \
       do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)

#define handle_error(msg) \
       do { perror(msg); exit(EXIT_FAILURE); } while (0)

pthread_mutex_t timing_mutex = PTHREAD_MUTEX_INITIALIZER;

struct timespec start_time, end_time;

struct thread_info {
    pthread_t thread_id;
    int thread_num;
    int res;
    int cpumask;
    int accelid;
    test_struct* tests;
    size_t num_tests;
    uint64_t start_time;
    uint64_t end_time;
};

int run(struct thread_info *tinfo);
static void *thread_start(void *arg) {
    struct thread_info *tinfo = arg;
    tinfo->res = run(tinfo);
    return (void *) &(tinfo->res);
}

void initialize_threads(struct thread_info *tinfo, int nthreads, char *argv[], test_struct* tests, size_t num_tests) {
    pthread_attr_t attr;
    int s;

    s = pthread_attr_init(&attr);
    if (s != 0) {
        handle_error_en(s, "pthread_attr_init");
    }

    size_t size = PTHREAD_STACK_MIN + 0x1000000;
    s = pthread_attr_setstacksize(&attr, size);
    if (s != 0) {
        handle_error_en(s, "pthread_attr_setstacksize");
    }

    for (int tnum = 0; tnum < nthreads; tnum++) {
        tinfo[tnum].thread_num = tnum;
        tinfo[tnum].cpumask = atoi(argv[tnum + 2]);
        tinfo[tnum].accelid = tnum; // Match accelid with thread_num
        tinfo[tnum].tests = tests;
        tinfo[tnum].num_tests = num_tests;
        s = pthread_create(&tinfo[tnum].thread_id, &attr, &thread_start, &tinfo[tnum]);
        if (s != 0) {
            handle_error_en(s, "pthread_create");
        }
    }

    s = pthread_attr_destroy(&attr);
    if (s != 0) {
        handle_error_en(s, "pthread_attr_destroy");
    }
}

void join_threads(struct thread_info *tinfo, int nthreads) {
    int s;
    void *tres;

    for (int tnum = 0; tnum < nthreads; tnum++) {
        s = pthread_join(tinfo[tnum].thread_id, &tres);
        if (s != 0) {
            handle_error_en(s, "pthread_join");
        }
    }
}

void set_thread_affinity(pthread_t thread, int cpumask) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);

    for (int i = 0; i < 3; ++i) {
        if (cpumask & (1 << i)) {
            CPU_SET(i, &cpuset);
        }
    }

    int result = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (result != 0) {
        perror("pthread_setaffinity_np");
    }
}

void record_start_time(struct thread_info *tinfo) {
    tinfo->start_time = rdcycle();
}

void record_end_time(struct thread_info *tinfo) {
    tinfo->end_time = rdcycle();
    pthread_mutex_lock(&timing_mutex);
    //total_clk_cycles += (tinfo->end_time - tinfo->start_time);
    pthread_mutex_unlock(&timing_mutex);
}

uint64_t th0_start_cycle = 0, th0_end_cycle = 0;
uint64_t th1_start_cycle = 0, th1_end_cycle = 0;
uint64_t th2_start_cycle = 0, th2_end_cycle = 0;
uint64_t th3_start_cycle = 0, th3_end_cycle = 0;

uint64_t max4(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
     uint64_t max_value = 0;

    if (a != 0) {
        max_value = a;
    }
    if (b != 0 && b > max_value) {
        max_value = b;
    }
    if (c != 0 && c > max_value) {
        max_value = c;
    }
    if (d != 0 && d > max_value) {
        max_value = d;
    }

    return max_value;
}
uint64_t min4(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
    uint64_t min_value = UINT64_MAX;

    if (a != 0) {
        min_value = a;
    }
    if (b != 0 && b < min_value) {
        min_value = b;
    }
    if (c != 0 && c < min_value) {
        min_value = c;
    }
    if (d != 0 && d < min_value) {
        min_value = d;
    }

    return min_value;
}

int run(struct thread_info *tinfo) {
    int threadid = tinfo->thread_num;
    int cpumask = tinfo->cpumask;
    int accelid = tinfo->accelid;
    test_struct *tests = tinfo->tests;
    size_t num_tests = tinfo->num_tests;

    
    // Get the current thread ID and set the affinity
    pthread_t thread = pthread_self();
    set_thread_affinity(thread, cpumask);

    int64_t task_done = 0;
    int num_test_passed = 0;


    uint64_t *data1 = (uint64_t *)malloc(sizeof(uint64_t) * tests[0].data1_packs);
    uint64_t *data2 = (uint64_t *)malloc(sizeof(uint64_t) * tests[0].data2_packs);

    printf("[thread %d] Starting tests\n", threadid);

    record_start_time(tinfo);

    for (size_t i = 0; i < num_tests; i++) {

        uint64_t local_param = ceil(((double)(tests[i].data2_len) / 11));
        uint64_t data1_len_size = concatenate_ints(tests[i].data1_len, tests[i].data1_packs);
        uint64_t data2_len_size_var = concatenate_3ints(tests[i].data2_len, tests[i].data2_packs, local_param);

        if (tests[i].data1 != (i > 0 ? tests[i - 1].data1 : NULL)) {
            for (size_t j = 0; j < tests[i].data1_packs; j++) {
                data1[j] = tests[i].data1[j];
            }
            switch (accelid) {
                case 0:
                    ROCC_INSTRUCTION_SS(0, data1, data1_len_size, 0);
                    break;
                case 1:
                    ROCC_INSTRUCTION_SS(1, data1, data1_len_size, 0);
                    break;
                case 2:
                    ROCC_INSTRUCTION_SS(2, data1, data1_len_size, 0);
                    break;
                case 3:
                    ROCC_INSTRUCTION_SS(3, data1, data1_len_size, 0);
                    break;
                default:
                    break;
            }
        }

        for (size_t j = 0; j < tests[i].data2_packs; j++) {
            data2[j] = tests[i].data2[j];
        }

        switch (accelid) {
            case 0:
                th0_start_cycle = rdcycle();
                ROCC_INSTRUCTION_DSS(0, task_done, data2, data2_len_size_var, 1);
                th0_end_cycle = rdcycle();
                total_tasks++;
                break;
            case 1:
                th1_start_cycle = rdcycle();
                ROCC_INSTRUCTION_DSS(1, task_done, data2, data2_len_size_var, 1);
                th1_end_cycle = rdcycle();
                total_tasks++;
                break;
            case 2:
                th2_start_cycle = rdcycle();
                ROCC_INSTRUCTION_DSS(2, task_done, data2, data2_len_size_var, 1);
                th2_end_cycle = rdcycle();
                total_tasks++;
                break;
            case 3:
                th3_start_cycle = rdcycle();
                ROCC_INSTRUCTION_DSS(3, task_done, data2, data2_len_size_var, 1);
                th3_end_cycle = rdcycle();
                total_tasks++;
                break;
            default:
                break;
        }
        
    }

    uint64_t th_start_time = min4(th0_start_cycle, th1_start_cycle, th2_start_cycle, th3_start_cycle);
    uint64_t th_end_time   = max4(th0_end_cycle,   th1_end_cycle,   th2_end_cycle,   th3_end_cycle);
    
    printf( "th%d start = %ld, th1 start = %ld, th2 start = %ld, th2 start = %ld  => min = %ld \n", threadid, th0_start_cycle, th1_start_cycle, th2_start_cycle, th3_start_cycle, th_start_time);
    printf( "th%d end   = %ld, th1 end   = %ld, th2 end   = %ld, th2 end   = %ld  => max = %ld \n", threadid, th0_end_cycle, th1_end_cycle, th2_end_cycle, th3_end_cycle, th_end_time);
    
    total_clk_cycles += (th_end_time - th_start_time); 
    record_end_time(tinfo);

    free(data2);
    free(data1);


    printf("\n[thread %d] Tests finished.\n", threadid, num_test_passed);
    return 0;
}
/*---------------------------------------------------------------*/
/*---------------------MAIN FUNCTION-----------------------------*/
/*---------------------------------------------------------------*/

int main(int argc, char *argv[]) {
    static int num_tests, num_threads;

    printf("start\n");
    if (argc < 2) {
        printf("Usage: %s <num_tests> <num_threads> <cpumask1> <cpumask2> ... <accelid1> <accelid2> ...\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    num_tests = atoi(argv[1]);
    num_threads = atoi(argv[2]);
    printf("num_tests = %d & num_threads = %d \n", num_tests, num_threads);

    struct thread_info *tinfo = calloc(num_threads, sizeof(*tinfo));
    if (tinfo == NULL) {
        handle_error("calloc");
    }

    clock_gettime(CLOCK_MONOTONIC, &start_time);


    initialize_threads(tinfo, num_threads, argv, tests, num_tests);

    join_threads(tinfo, num_threads);

    clock_gettime(CLOCK_MONOTONIC, &end_time);
    total_time = (end_time.tv_sec - start_time.tv_sec) + (end_time.tv_nsec - start_time.tv_nsec) / 1e9;

    print_measurments(total_time);

    // Print the start and end times of each thread
    for (int i = 0; i < num_threads; i++) {
        printf("Thread %d started at %lu and ended at %lu\n", tinfo[i].thread_num, tinfo[i].start_time, tinfo[i].end_time);
    }

    free(tinfo);

    return EXIT_SUCCESS;
}

And the prints inside run() function shows (again for 10 tests):

[thread 0] Starting tests
[thread 1] Starting tests
th0 start = 12620313755, th1 start = 16425843909, th2 start = 0, th2 start = 0  => min = 12620313755 
th0 end   = 12621128480, th1 end   = 16425836178, th2 end   = 0, th2 end   = 0  => max = 16425836178 
th1 start = 12620313755, th1 start = 16425843909, th2 start = 0, th2 start = 0  => min = 12620313755 
th1 end   = 12621128480, th1 end   = 16426650923, th2 end   = 0, th2 end   = 0  => max = 16426650923 

[thread 0] Tests finished. 

[thread 1] Tests finished.

The theoretical number for number of cycles for each task is about 80,000 for one test. For 10 tests it should be 800,000 cycles. So for example for thread0 above if I subtract the end cylces(12621128480) and start cycles (12620313755) I get 814725 which is close to 800,000.

Ultimately I want to run 16 threads at (almost) the same time on 4 cores for 16 accelerators and measure the time between the start time of the 1'st thread and end time of 16'th thread. Also just to note, the number of tasks being done in this case will be 16 time more (with 10 tests, tasks finished = 160).

Please let me know if you need more information.

Update (tests.h):

uint64_t data2[] = {
    0x5e54c42e48f95681,
    0x6817f0ac737ce56c,
    0x2c25c96e2f08878e,
    0x76e63b8e74593080,
    0x5495a7174a9738cb,
    0x766b718558de643e,
    0x112af5be56f0419d,
    0x39dade335211fa50,
    0x2da48dcf3d051c37,
    0x0c7b1f97089158d8,
    0x51c71c37647dd215,
    0x193c2e614ee73ee9,
    0x6369f4c63ea7bf2d,
    0x1340c33d264fad65,
    0x705d49c404b1a154,
    0x266694b14eb20df2,
    0x4daaf7d50e7e855d,
    0x422ef35f79d0c143,
    0x3d870ceb39152eed,
    0x6e29f1c3121cb402,
    0x03ac67b864956348,
    0x6afb184014d75d76,
    0x3b85a4e524d5f673,
    0x66e957c6692a32b5,
    0x61db12aa7364775d,
    0x71bb8b8d33a22ee2,
    0x57e249720af7b9ee,
    0x02896dcb3b4c3e39,
    0x499f791b15ca3108,
    0x619beb9e39fcc2df,
    0x1a7bd25c0802804f,
    0x08aed0d26826ca32
};
uint64_t data1[] =
{
    0xBBE6139D23887ACF,
    0xA5C3E51BDB1C7101,
    0xE5E8E8377D210A02,
    0xCE1D0EB443C754E8,
    0x52FE52BE649D946B,
    0x841A847EABE22232,
    0xFB6DD72788419F4A,
    0xDAB563352538E68C,
    0xCD809B2A690DD264,
    0x767A3E544BD27E8,
    0xC43B11A46691E3F1,
    0x9D709F1788087B54,
    0x233AC62E6F14D8F0,
    0x9B31EFD15B268F5E,
    0x911AA32FFC2C5485,
    0x52E9C7B33B6B1A8D,
    0xE590F5241B3E896A,
    0x8F15C4A2891F1BD0,
    0xA1831064E81917BF,
    0xA66EE7901ADE7A05,
    0x499600463D2E2DBE,
    0x8FBAFAC1084EA740,
    0x7A3CDBD78AA1F6F8,
    0x229EB0F8F889B710,
    0xB4E48DBDF6F101E1,
    0x1DC8989C5D226B6C,
    0xBC49E8E6570BDB0D,
    0x45446206D4087C,
    0xA945EA2DA761AC00,
    0xC544955BD883EAD7,
    0x2A3772AFA29009F1,
    0xA28C1A9DD2648D10,
    0xDEC2D76F0EEB931C
    };

//structure for the test
typedef struct {
    uint64_t *data1;   
    uint64_t *data2; 
    int data1_packs;   
    int data2_packs;   
    int data1_len;     
    int data2_len;     
} test_struct;
// 6 tests
test_struct tests[] = {
    {data1, data2, 1566, 32, 50000, 1000},
    {data1, data2, 1566, 32, 50000, 1000},
    {data1, data2, 1566, 32, 50000, 1000},
    {data1, data2, 1566, 32, 50000, 1000},
    {data1, data2, 1566, 32, 50000, 1000},
    {data1, data2, 1566, 32, 50000, 1000}
   };

Multicore Test:

I used the script below to use taskset the run the threaded program on 4 CPUs:

#!/bin/bash

gcc -o fix1 fix1.c -pthread -lm

for i in 0 1 2 3; do
  (taskset -c $i ./fix1 -g 300 4 2>&1 | sed "s/^/CPU$i: /") &
done

wait

And the output is (I am only showing the 7 most right digits for end and start time, and min and max times for each cpu are the starting time of the first started thread and end time of the last thread finished, respectively):

CPU0: start
CPU0: t=1 g=1
CPU0: num_tests = 300 & num_threads = 4 
CPU0: tests = 2500 elements
CPU0: d1cnt=1566 d2cnt=32
CPU0: T1 start 4403660 end 4526744 ELAPSED: 123084
CPU0: T2 start 4598096 end 4684928 ELAPSED: 86832
CPU0: T3 start 4717724 end 4804556 ELAPSED: 86832
CPU0: T0 start 4880624 end 4967708 ELAPSED: 87084
CPU0: Minimum start time: 3602931164403660
CPU0: Maximum end time: 3602931164967708
CPU0: Time difference: 564048
CPU0: 
CPU0: Total number of instructions: 0 
CPU0: Total number of cycles: 564048 
CPU0: Total number of Tasks finished: 1204 
CPU0: Tasks per second with gettime(): 4344359.00859
CPU0: Tasks per second with csr_cycles(): 53364.252688
CPU2: start
CPU2: t=1 g=1
CPU2: num_tests = 300 & num_threads = 4 
CPU2: tests = 2500 elements
CPU2: d1cnt=1566 d2cnt=32
CPU2: T1 start 4659980 end 4770068 ELAPSED: 110088
CPU2: T2 start 4854488 end 4941212 ELAPSED: 86724
CPU2: T3 start 4977392 end 5063900 ELAPSED: 86508
CPU2: T0 start 5122508 end 5209736 ELAPSED: 87228
CPU2: Minimum start time: 3602931164659980
CPU2: Maximum end time: 3602931165209736
CPU2: Time difference: 549756
CPU2: 
CPU2: Total number of instructions: 0 
CPU2: Total number of cycles: 549756 
CPU2: Total number of Tasks finished: 1204 
CPU2: Tasks per second with gettime(): 4423364.47568
CPU2: Tasks per second with csr_cycles(): 54751.562511
CPU3: start
CPU3: t=1 g=1
CPU3: num_tests = 300 & num_threads = 4 
CPU3: tests = 2500 elements
CPU3: d1cnt=1566 d2cnt=32
CPU3: T1 start 4951220 end 5055224 ELAPSED: 104004
CPU3: T2 start 5119988 end 5206748 ELAPSED: 86760
CPU3: T3 start 5239832 end 5326556 ELAPSED: 86724
CPU3: T0 start 5390636 end 5477720 ELAPSED: 87084
CPU3: Minimum start time: 3602931164951220
CPU3: Maximum end time: 3602931165477720
CPU3: Time difference: 526500
CPU3: 
CPU3: Total number of instructions: 0 
CPU3: Total number of cycles: 526500 
CPU3: Total number of Tasks finished: 1204 
CPU3: Tasks per second with gettime(): 4134033.32635
CPU3: Tasks per second with csr_cycles(): 57169.990503
CPU1: start
CPU1: t=1 g=1
CPU1: num_tests = 300 & num_threads = 4 
CPU1: tests = 2500 elements
CPU1: d1cnt=1566 d2cnt=32
CPU1: T1 start 991928 end 1097804 ELAPSED: 105876
CPU1: T2 start 1170452 end 1257428 ELAPSED: 86976
CPU1: T3 start 1297820 end 1384400 ELAPSED: 86580
CPU1: T0 start 1448336 end 1535456 ELAPSED: 87120
CPU1: Minimum start time: 3602931190991928
CPU1: Maximum end time: 3602931191535456
CPU1: Time difference: 543528
CPU1: 
CPU1: Total number of instructions: 0 
CPU1: Total number of cycles: 543528 
CPU1: Total number of Tasks finished: 1204 
CPU1: Tasks per second with gettime(): 4486845.04733
CPU1: Tasks per second with csr_cycles(): 55378.931720

Solution

  • Caveat: This is a bit brittle because I had to adapt the code to run on my PC.

    Your issue is more about threads starting/running "sequentially" rather than the actual work being done. Thus, it isn't really dependent on the work being done (instruction simulation), so the problem can be diagnosed by running on a PC.

    Issues with the code:

    1. If the execution time the actual work of a thread is less than the time for pthread_create to execute, the thread execution is (will appear to be) sequential.

    2. That is, a race condition of sorts. After the main thread calls pthread_create, the spawned thread is able run and complete before the return of pthread_create in the main thread.

    3. Can be ameliorated by having longer work times/test counts.

    4. Can be ameliorated by having a "release" flag (e.g. gonow in the code below). Then, threads will only "start" after all threads have been created and are running.

    5. Having any printf inside a thread function distorts the timing (i.e. we're measuring more of the printf time than the real time. And, printf does interthread locking.

    6. In run, the malloc calls assume that all data1_packs and data2_packs are the same value (true at present). Better to use the maximum values

    7. num_tests can exceed the number of tests in the tests array.

    8. You're not quite comfortable in using pointers to a struct. In loops, there are many (e.g.) tinfo[tnum].whatever Better to have a "current" pointer (e.g. struct thread_info *tcur = &tinfo[tnum]; and use tcur->whatever).

    9. Having \\ at the end of comment is flagged by the compiler as an attempt to have a multiline comment continuation (i.e. \ on the end of the line tells the preprocessor to join the lines).

    Changes to run on a PC:

    1. I had the change the ROCC_* into simple nop to enable it to work on a PC (e.g. hacked up rocc.h).
    2. Added a simple/phony version of rdcycle

    FILE: fix1.c

    #define _GNU_SOURCE
    
    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    #include <errno.h>
    #include <math.h>
    #include <limits.h>
    #include <stdbool.h>
    #include <time.h>
    #include <stdatomic.h>
    
    #include "encoding.h"
    #include "rocc.h"
    #if 0
    #include "data/tests.h"
    #else
    #include "tests.h"
    #endif
    
    #ifdef DEBUG
    #define dbgprt(_fmt...) \
        printf(_fmt)
    #else
    #define dbgprt(_fmt...) \
        do { } while (0)
    #endif
    
    #if 1
    #define ARRCOUNT(_arr) \
        (sizeof(_arr) / sizeof(_arr[0]))
    
    typedef long long tsc_t;
    tsc_t tsczero;
    tsc_t rdzero;
    
    tsc_t
    tscget(void)
    {
        struct timespec ts;
        tsc_t tsc;
    
        clock_gettime(CLOCK_MONOTONIC,&ts);
        tsc = ts.tv_sec;
        tsc *= 1000000000;
        tsc += ts.tv_nsec;
    
        tsc -= tsczero;
    
        return tsc;
    }
    
    double
    tscsec(tsc_t tsc)
    {
        double sec = tsc;
        return sec / 1e9;
    }
    
    int gonow;
    int opt_g = 0;
    int opt_t = 1;
    int d1cnt;
    int d2cnt;
    size_t tests_max;
    #endif
    
    //----------------------------------------------------------------
    //------------------  UTILITY-------------------------------------
    //----------------------------------------------------------------
    
    uint64_t freq = 25000000;               // 25 MHz
    volatile uint64_t total_tasks = 0;
    volatile double total_time = 0.0;
    tsc_t total_clk_cycles = 0;
    uint64_t total_insts = 0;
    double inst_per_cycles = 0.0;
    
    #if 1
    tsc_t
    rdcycle(void)
    {
        tsc_t ret;
    
        if (opt_t)
            ret = tscget();
        else {
            static tsc_t ctr;
            ret = atomic_fetch_add(&ctr,1);
        }
    
        return ret;
    }
    #endif
    
    void
    print_measurments(double total_time)
    {
        double tasks_per_second = total_tasks / total_time;
    
        printf("\nTotal number of instructions: %lu \n", total_insts);
        printf("Total number of cycles: %lld \n", total_clk_cycles);
        printf("Total number of tasks finished: %lu \n", total_tasks);
        printf("Tasks per second with gettime(): %.5f\n", tasks_per_second);
        printf("Tasks per second with csr_cycles(): %f\n",
            (double) total_tasks / ((double) total_clk_cycles / (double) freq));
    }
    
    uint64_t
    concatenate_ints(unsigned int x, unsigned int y)
    {
        return ((uint64_t) x << 32) | y;
    }
    
    uint64_t
    concatenate_3ints(uint64_t x, uint64_t y, uint64_t z)
    {
        return ((uint64_t) x << 48) | (((uint64_t) y << 32) | z);
    }
    
    uint64_t
    concatenate_arrays(int64_t * x, int64_t * y)
    {
        return ((uint64_t) x << 32) | (uint64_t) y;
    }
    
    //################################################################
    
    //----------------------------------------------------------------
    //------------------  THREAD -------------------------------------
    //----------------------------------------------------------------
    
    #define handle_error_en(en, msg) \
           do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)
    
    #define handle_error(msg) \
           do { perror(msg); exit(EXIT_FAILURE); } while (0)
    
    pthread_mutex_t timing_mutex = PTHREAD_MUTEX_INITIALIZER;
    
    tsc_t start_time, end_time;
    
    struct thread_info {
        pthread_t thread_id;
        int thread_num;
        int res;
        int cpumask;
        int accelid;
        test_struct *tests;
        size_t num_tests;
        tsc_t start_time;
        tsc_t end_time;
    };
    
    int run(struct thread_info *tinfo);
    
    int thread_first;
    
    static void *
    thread_start(void *arg)
    {
        struct thread_info *tinfo = arg;
    
    // NOTE/FIX -- add a "release" flag
    #if 1
        while (opt_g) {
            int go = atomic_load(&gonow);
            if (go)
                break;
        }
    
        // make the cycle counter relative (optional)?
        if (opt_g) {
            if (atomic_fetch_add(&thread_first,1) == 0)
                rdzero = rdcycle();
        }
    #endif
    
        tinfo->res = run(tinfo);
    
        return (void *) &(tinfo->res);
    }
    
    void
    initialize_threads(struct thread_info *tinfo, int nthreads, char *argv[],
        test_struct *tests, size_t num_tests)
    {
        pthread_attr_t attr;
        int s;
    
        s = pthread_attr_init(&attr);
        if (s != 0) {
            handle_error_en(s, "pthread_attr_init");
        }
    
        size_t size = PTHREAD_STACK_MIN + 0x1000000;
    
        s = pthread_attr_setstacksize(&attr, size);
        if (s != 0) {
            handle_error_en(s, "pthread_attr_setstacksize");
        }
    
        for (int tnum = 0; tnum < nthreads; tnum++) {
            struct thread_info *tcur = &tinfo[tnum];
    
            tcur->thread_num = tnum;
    #if 0
            tcur->cpumask = atoi(argv[tnum]);
    #else
            tcur->cpumask = 0;
    #endif
            tcur->accelid = tnum;       // Match accelid with thread_num
            tcur->tests = tests;
            tcur->num_tests = num_tests;
    
            s = pthread_create(&tcur->thread_id, &attr, &thread_start, tcur);
            if (s != 0) {
                handle_error_en(s, "pthread_create");
            }
        }
    
        s = pthread_attr_destroy(&attr);
        if (s != 0) {
            handle_error_en(s, "pthread_attr_destroy");
        }
    }
    
    void
    join_threads(struct thread_info *tinfo, int nthreads)
    {
        int s;
        void *tres;
    
        for (int tnum = 0; tnum < nthreads; tnum++) {
            s = pthread_join(tinfo[tnum].thread_id, &tres);
            if (s != 0) {
                handle_error_en(s, "pthread_join");
            }
        }
    }
    
    void
    set_thread_affinity(pthread_t thread, int cpumask)
    {
    #if 0
        cpu_set_t cpuset;
    
        CPU_ZERO(&cpuset);
    
        for (int i = 0; i < 3; ++i) {
            if (cpumask & (1 << i)) {
                CPU_SET(i, &cpuset);
            }
        }
    
        int result = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    
        if (result != 0) {
            perror("pthread_setaffinity_np");
        }
    #endif
    }
    
    void
    record_start_time(struct thread_info *tinfo)
    {
        tinfo->start_time = rdcycle();
    }
    
    void
    record_end_time(struct thread_info *tinfo)
    {
        tinfo->end_time = rdcycle();
    #if 0
        pthread_mutex_lock(&timing_mutex);
        // total_clk_cycles += (tinfo->end_time - tinfo->start_time);
        pthread_mutex_unlock(&timing_mutex);
    #endif
    }
    
    tsc_t th0_start_cycle = 0,
        th0_end_cycle = 0;
    tsc_t th1_start_cycle = 0,
        th1_end_cycle = 0;
    tsc_t th2_start_cycle = 0,
        th2_end_cycle = 0;
    tsc_t th3_start_cycle = 0,
        th3_end_cycle = 0;
    
    tsc_t
    max4(tsc_t a, tsc_t b, tsc_t c, tsc_t d)
    {
        tsc_t max_value = 0;
    
        if (a != 0) {
            max_value = a;
        }
        if (b != 0 && b > max_value) {
            max_value = b;
        }
        if (c != 0 && c > max_value) {
            max_value = c;
        }
        if (d != 0 && d > max_value) {
            max_value = d;
        }
    
        return max_value;
    }
    
    tsc_t
    min4(tsc_t a, tsc_t b, tsc_t c, tsc_t d)
    {
        tsc_t min_value = INT64_MAX;
    
        if (a != 0) {
            min_value = a;
        }
        if (b != 0 && b < min_value) {
            min_value = b;
        }
        if (c != 0 && c < min_value) {
            min_value = c;
        }
        if (d != 0 && d < min_value) {
            min_value = d;
        }
    
        return min_value;
    }
    
    int
    run(struct thread_info *tinfo)
    {
    #if DEBUG
        int threadid = tinfo->thread_num;
    #endif
        int cpumask = tinfo->cpumask;
        int accelid = tinfo->accelid;
        test_struct *tests = tinfo->tests;
        size_t num_tests = tinfo->num_tests;
    
        // Get the current thread ID and set the affinity
        pthread_t thread = pthread_self();
    
        set_thread_affinity(thread, cpumask);
    
        int64_t task_done = 0;
    #if 0
        int num_test_passed = 0;
    #endif
    
    #if 0
        // NOTE/BUG: this assumes all data1_packs/data2_packs are the same
        uint64_t *data1 = malloc(sizeof(uint64_t) * tests[0].data1_packs);
        uint64_t *data2 = malloc(sizeof(uint64_t) * tests[0].data2_packs);
    #else
        // NOTE/FIX: use maximum counts that we need
        uint64_t *data1 = malloc(sizeof(uint64_t) * d1cnt);
        uint64_t *data2 = malloc(sizeof(uint64_t) * d2cnt);
    #endif
    
        dbgprt("[thread %d] Starting tests\n", threadid);
    
        record_start_time(tinfo);
    
    // NOTE/BUG: num_tests can exceed number of tests in the "tests" array
        for (size_t i = 0; i < num_tests; i++) {
            size_t tidx = i % tests_max;
            test_struct *tcur = &tests[tidx];
    
            uint64_t local_param = ceil(((double) (tcur->data2_len) / 11));
            uint64_t data1_len_size = concatenate_ints(tcur->data1_len,
                tcur->data1_packs);
            uint64_t data2_len_size_var = concatenate_3ints(tcur->data2_len,
                tcur->data2_packs, local_param);
    
    #if 0
    // NOTE/BUG: there is a cleaner way to express this
            if (tests[i].data1 != (i > 0 ? tests[i - 1].data1 : NULL)) {
    #else
            test_struct *tprev = tcur - 1;
            if ((tidx > 0) && (tcur->data1 != tprev->data1)) {
    #endif
                for (size_t j = 0; j < tcur->data1_packs; j++) {
                    data1[j] = tcur->data1[j];
                }
                switch (accelid) {
                case 0:
                    ROCC_INSTRUCTION_SS(0, data1, data1_len_size, 0);
                    break;
                case 1:
                    ROCC_INSTRUCTION_SS(1, data1, data1_len_size, 0);
                    break;
                case 2:
                    ROCC_INSTRUCTION_SS(2, data1, data1_len_size, 0);
                    break;
                case 3:
                    ROCC_INSTRUCTION_SS(3, data1, data1_len_size, 0);
                    break;
                default:
                    break;
                }
            }
    
    #if 1
            tprev = tcur;
    #endif
    
            for (size_t j = 0; j < tcur->data2_packs; j++) {
                data2[j] = tcur->data2[j];
            }
    
            switch (accelid) {
            case 0:
                th0_start_cycle = rdcycle();
                ROCC_INSTRUCTION_DSS(0, task_done, data2, data2_len_size_var, 1);
                th0_end_cycle = rdcycle();
                total_tasks++;
                break;
            case 1:
                th1_start_cycle = rdcycle();
                ROCC_INSTRUCTION_DSS(1, task_done, data2, data2_len_size_var, 1);
                th1_end_cycle = rdcycle();
                total_tasks++;
                break;
            case 2:
                th2_start_cycle = rdcycle();
                ROCC_INSTRUCTION_DSS(2, task_done, data2, data2_len_size_var, 1);
                th2_end_cycle = rdcycle();
                total_tasks++;
                break;
            case 3:
                th3_start_cycle = rdcycle();
                ROCC_INSTRUCTION_DSS(3, task_done, data2, data2_len_size_var, 1);
                th3_end_cycle = rdcycle();
                total_tasks++;
                break;
            default:
                break;
            }
        }
    
        tsc_t th_start_time = min4(th0_start_cycle, th1_start_cycle, th2_start_cycle, th3_start_cycle);
        tsc_t th_end_time = max4(th0_end_cycle, th1_end_cycle, th2_end_cycle, th3_end_cycle);
    
        dbgprt("th%d start = %ldd, th1 start = %ldd, th2 start = %ldd, th2 start = %ldd  => min = %ldd \n",
            threadid, th0_start_cycle, th1_start_cycle, th2_start_cycle,
            th3_start_cycle, th_start_time);
    
        dbgprt("th%d end   = %lld, th1 end   = %lld, th2 end   = %lld, th2 end   = %lld  => max = %lld \n",
            threadid, th0_end_cycle, th1_end_cycle, th2_end_cycle,
            th3_end_cycle, th_end_time);
    
        total_clk_cycles += (th_end_time - th_start_time);
        record_end_time(tinfo);
    
        do {
            if (0)
                break;
            if (task_done)
                ++task_done;
        } while (0);
    
        free(data2);
        free(data1);
    
        dbgprt("\n[thread %d] Tests finished.\n", threadid, num_test_passed);
        return 0;
    }
    
    /*---------------------------------------------------------------*/
    /*---------------------MAIN FUNCTION-----------------------------*/
    /*---------------------------------------------------------------*/
    
    static int num_tests, num_threads;
    
    void
    show_elapsed(struct thread_info *tinfo,int sorted)
    {
    
        printf("\n");
    
        struct thread_info *tprev = tinfo;
        for (int i = 0; i < num_threads; i++, tinfo++) {
            printf("T%d start %lld end %lld",
                tinfo->thread_num, tinfo->start_time, tinfo->end_time);
    
            printf(" ELAPSED: %lld",tinfo->end_time - tinfo->start_time);
    
            printf(" (%lld / %lld)",
                tinfo->start_time - tprev->start_time,
                tinfo->end_time - tprev->end_time);
    
            printf("\n");
            tprev = tinfo;
        }
    }
    
    int
    tinfocmp(const void *vlhs,const void *vrhs)
    {
        const struct thread_info *tlhs = vlhs;
        const struct thread_info *trhs = vrhs;
        return tlhs->start_time - trhs->start_time;
    }
    
    int
    main(int argc, char **argv)
    {
    
        tsczero = tscget();
    
        printf("start\n");
        if (argc < 2) {
            printf("Usage: %s <num_tests> <num_threads> <cpumask1> <cpumask2> ... <accelid1> <accelid2> ...\n", argv[0]);
            exit(EXIT_FAILURE);
        }
    
        --argc;
        ++argv;
    
        for (;  argc > 0;  --argc, ++argv) {
            char *cp = *argv;
            if (*cp != '-')
                break;
    
            cp += 2;
            switch (cp[-1]) {
            case 'g':
                opt_g = ! opt_g;
                break;
            case 't':
                opt_t = ! opt_t;
                break;
            }
        }
        printf("t=%d g=%d\n",opt_t,opt_g);
    
        num_tests = atoi(argv[0]);
        num_threads = atoi(argv[1]);
        printf("num_tests = %d & num_threads = %d \n", num_tests, num_threads);
    
        tests_max = ARRCOUNT(tests);
        printf("tests = %zu elements\n",tests_max);
        if (num_tests > tests_max)
            printf("WARNING num_tests too large -- test will wrap the index\n");
    
        struct thread_info *tinfo = calloc(num_threads, sizeof(*tinfo));
        if (tinfo == NULL) {
            handle_error("calloc");
        }
    
        // NOTE/FIX: get maximum counts that we need
    #if 1
        d1cnt = 0;
        d2cnt = 0;
        for (size_t i = 0; i < tests_max; i++) {
            test_struct *tcur = &tests[i];
    
            if (tcur->data1_packs > d1cnt)
                d1cnt = tcur->data1_packs;
    
            if (tcur->data2_packs > d2cnt)
                d2cnt = tcur->data2_packs;
        }
        printf("d1cnt=%d d2cnt=%d\n",d1cnt,d2cnt);
    #endif
    
        start_time = tscget();
    
        initialize_threads(tinfo, num_threads, argv, tests, num_tests);
    
        atomic_store(&gonow,1);
    
        join_threads(tinfo, num_threads);
    
        end_time = tscget();
        total_time = tscsec(end_time - start_time);
    
        print_measurments(total_time);
    
        // Print the start and end times of each thread
        show_elapsed(tinfo,0);
    
        qsort(tinfo,num_threads,sizeof(*tinfo),tinfocmp);
        show_elapsed(tinfo,1);
    
        free(tinfo);
    
        return EXIT_SUCCESS;
    }
    

    FILE: rocc.h

    // Based on code by Schuyler Eldridge. Copyright (c) Boston University
    // https://github.com/seldridge/rocket-rocc-examples/blob/master/src/main/c/rocc.h
    
    #ifndef SRC_MAIN_C_ROCC_H
    #define SRC_MAIN_C_ROCC_H
    
    #include <stdint.h>
    
    #define STR1(x) #x
    #define STR(x) STR1(x)
    #define EXTRACT(a, size, offset) (((~(~0 << size) << offset) & a) >> offset)
    
    #define CUSTOMX_OPCODE(x) CUSTOM_ ## x
    #define CUSTOM_0 0b0001011
    #define CUSTOM_1 0b0101011
    #define CUSTOM_2 0b1011011
    #define CUSTOM_3 0b1111011
    
    #define CUSTOMX(X, xd, xs1, xs2, rd, rs1, rs2, funct) \
      CUSTOMX_OPCODE(X)                     |             \
      (rd                 << (7))           |             \
      (xs2                << (7+5))         |             \
      (xs1                << (7+5+1))       |             \
      (xd                 << (7+5+2))       |             \
      (rs1                << (7+5+3))       |             \
      (rs2                << (7+5+3+5))     |             \
      (EXTRACT(funct, 7, 0) << (7+5+3+5+5))
    
    // Standard macro that passes rd, rs1, and rs2 via registers
    #define ROCC_INSTRUCTION_DSS(X, rd, rs1, rs2, funct) \
        ROCC_INSTRUCTION_R_R_R(X, rd, rs1, rs2, funct, 10, 11, 12)
    
    #define ROCC_INSTRUCTION_DS(X, rd, rs1, funct) \
        ROCC_INSTRUCTION_R_R_I(X, rd, rs1, 0, funct, 10, 11)
    
    #define ROCC_INSTRUCTION_D(X, rd, funct) \
        ROCC_INSTRUCTION_R_I_I(X, rd, 0, 0, funct, 10)
    
    #define ROCC_INSTRUCTION_SS(X, rs1, rs2, funct) \
        ROCC_INSTRUCTION_I_R_R(X, 0, rs1, rs2, funct, 11, 12)
    
    #define ROCC_INSTRUCTION_S(X, rs1, funct) \
        ROCC_INSTRUCTION_I_R_I(X, 0, rs1, 0, funct, 11)
    
    #define ROCC_INSTRUCTION(X, funct) \
        ROCC_INSTRUCTION_I_I_I(X, 0, 0, 0, funct)
    
    // rd, rs1, and rs2 are data
    // rd_n, rs_1, and rs2_n are the register numbers to use
    #define ROCC_INSTRUCTION_R_R_R(X, rd, rs1, rs2, funct, rd_n, rs1_n, rs2_n) { \
        uint64_t rd_ ;                                 \
        uint64_t rs1_ = (uint64_t) rs1;               \
        uint64_t rs2_ = (uint64_t) rs2;               \
        asm volatile (                                                           \
            "\tnop\n"  \
            : "=r" (rd_)                                                         \
            : [_rs1] "r" (rs1_), [_rs2] "r" (rs2_));                             \
        rd = rd_;                                                                \
      }
    
    #define ROCC_INSTRUCTION_R_R_I(X, rd, rs1, rs2, funct, rd_n, rs1_n) {     \
        uint64_t rd_ ;                              \
        uint64_t rs1_ = (uint64_t) rs1;            \
        asm volatile (                                                        \
            "\tnop\n" \
            : "=r" (rd_) : [_rs1] "r" (rs1_));                                \
        rd = rd_;                                                             \
      }
    
    #define ROCC_INSTRUCTION_R_I_I(X, rd, rs1, rs2, funct, rd_n) {           \
        uint64_t rd_ ;                             \
        asm volatile (                                                       \
            "\tnop\n"  \
            : "=r" (rd_));                                                   \
        rd = rd_;                                                            \
      }
    
    #define ROCC_INSTRUCTION_I_R_R(X, rd, rs1, rs2, funct, rs1_n, rs2_n) {    \
        uint64_t rs1_ = (uint64_t) rs1;            \
        uint64_t rs2_ = (uint64_t) rs2;            \
        asm volatile (                                                        \
            "\tnop\n" \
            :: [_rs1] "r" (rs1_), [_rs2] "r" (rs2_));                         \
      }
    
    #define ROCC_INSTRUCTION_I_R_I(X, rd, rs1, rs2, funct, rs1_n) {         \
        uint64_t rs1_ = (uint64_t) rs1;          \
        asm volatile (                                                      \
            "\tnop\n" \
            :: [_rs1] "r" (rs1_));                                          \
      }
    
    #define ROCC_INSTRUCTION_I_I_I(X, rd, rs1, rs2, funct) {                 \
        asm volatile (                                                       \
            "\tnop\n" ); \
      }
    
    #endif  // SRC_MAIN_C_ACCUMULATOR_H
    

    In the code above, I've used cpp conditionals to denote old vs. new code:

    #if 0
    // old code
    #else
    // new code
    #endif
    
    #if 1
    // new code
    #endif
    

    Note: this can be cleaned up by running the file through unifdef -k


    Here is the output of ./fix1 -g 300 8. Note that due to changes to rdcycle, cycle counts are nanoseconds:

    start
    t=1 g=1
    num_tests = 300 & num_threads = 8
    tests = 6 elements
    WARNING num_tests too large -- test will wrap the index
    d1cnt=1566 d2cnt=32
    
    Total number of instructions: 0
    Total number of cycles: 104387
    Total number of tasks finished: 878
    Tasks per second with gettime(): 1800253.83989
    Tasks per second with csr_cycles(): 210275.225842
    
    T0 start 377562 end 443625 ELAPSED: 66063 (0 / 0)
    T1 start 379604 end 455869 ELAPSED: 76265 (2042 / 12244)
    T2 start 379700 end 466956 ELAPSED: 87256 (96 / 11087)
    T3 start 378780 end 466888 ELAPSED: 88108 (-920 / -68)
    T4 start 373805 end 404777 ELAPSED: 30972 (-4975 / -62111)
    T5 start 380184 end 418223 ELAPSED: 38039 (6379 / 13446)
    T6 start 438384 end 464627 ELAPSED: 26243 (58200 / 46404)
    T7 start 500486 end 515786 ELAPSED: 15300 (62102 / 51159)
    
    T4 start 373805 end 404777 ELAPSED: 30972 (0 / 0)
    T0 start 377562 end 443625 ELAPSED: 66063 (3757 / 38848)
    T3 start 378780 end 466888 ELAPSED: 88108 (1218 / 23263)
    T1 start 379604 end 455869 ELAPSED: 76265 (824 / -11019)
    T2 start 379700 end 466956 ELAPSED: 87256 (96 / 11087)
    T5 start 380184 end 418223 ELAPSED: 38039 (484 / -48733)
    T6 start 438384 end 464627 ELAPSED: 26243 (58200 / 46404)
    T7 start 500486 end 515786 ELAPSED: 15300 (62102 / 51159)
    

    UPDATE:

    Just one question, why did you comment out the set_affinity function and cpu_mask?

    For a few reasons:

    1. At the time, I took a cursory look at your affinity code and didn't understand how it worked (so I commented it out).
    2. Now, after more careful examination, your affinity code has a bug [more on that below].
    3. Convenience of not having to specify an extra N args for cpumask and N args for accelid. The accelerator had no meaning on my PC.
    4. The linux scheduler (especially these days) is pretty good at distributing the load evenly balanced across N cpus.
    5. Because of the dynamic nature of scheduling, with a "small" set of available/possible CPUs and a homogeneous mix of threads, locking a thread to a particular subset of cores may degrade throughput and/or latency.

    because when I want to use them and threads on two CPUs (each CPU will have 4 threads) the threads will be sequential again.

    Not quite true. If we don't specify CPU affinity, the scheduler will start threads on whatever CPU it chooses. It could start them all on the same CPU, but, more likely, it will assign them in some order (e.g. round robin(?)).

    CPU affinity has more and better effect:

    1. The CPUs are not homogeneous. For example, some CPUs have more cache memory than others, the system is NUMA (and some CPUs have faster RAM access or more total RAM), a subset of the CPUs have attached coprocessors, etc.
    2. The thread mix for a job is heterogeneous. For example, a realtime video decoder. Input and output threads must have high priority and minimize latency. Decoding threads (video and/or audio) must have high throughput but latency is less of a concern.
    3. Interrupt latency must be minimized. A given H/W device's IRQ affinity is locked to a subset of CPU cores (usually just one) and the thread that must wake up to process the data is similarly restricted.
    4. A large number of CPUs (e.g. 256) and we wish to provide variable QoS based on policy. That is, certain users "pay extra" (are given priority for) for exclusive use of a subset of the CPUs.

    Now, about your affinity code ... It is actually broken.

    I tried command like: ./program -g 8 0 0 0 0 1 1 1 1 0 1 2 3 0 1 2 3 Which means I am using 8 threads, the first four on CPU 0, the second four on CPU 1, then first thread will send instructions to accel 0 and second thread sends to accel 1 and so on.

    Side note: In the example you just gave, you left off the "number of tests" [first] argument.

    It can be easier to specify an affinity mask in hex. strtol (with base of 0) will allow/understand 0xFF but atoi will give back zero for this.

    Here is a test program:

    // mask.c -- cpu mask test
    
    #include <stdio.h>
    #include <stdlib.h>
    
    void
    orig(char *cp)
    {
    
        int mask = strtol(cp,&cp,0);
        printf(" %8.8X %d --",mask,mask);
    
        for (int i = 0; i < 3; ++i) {
            if (mask & (1 << i))
                printf(" SET(%d)",i);
        }
    }
    
    void
    fix1(char *cp)
    {
    
        int i = strtol(cp,&cp,0);
        printf(" %8.8X %d --",i,i);
    
        printf(" SET(%d)",i);
    }
    
    void
    fix2(char *cp)
    {
    
        unsigned int mask = strtol(cp,&cp,0);
        printf(" %8.8X %d --",mask,mask);
    
        for (int i = 0;  mask != 0;  ++i, mask >>= 1) {
            if (mask & 1)
                printf(" SET(%d)",i);
        }
    }
    
    void
    fix3(char *cp)
    {
    
        while (*cp != 0) {
            int i = strtol(cp,&cp,10);
    
            printf(" SET(%d)",i);
    
            if (*cp != ',')
                break;
    
            ++cp;
        }
    }
    
    void
    dofnc(void (*fnc)(char *),const char *sym,char *cp)
    {
    
        printf("%s: argv='%s'",sym,cp);
        fnc(cp);
        printf("\n");
    }
    
    void
    dotest(void (*fnc)(char *),const char *sym,char **argv)
    {
    
        for (char **av = argv;  *av != NULL;  ++av)
            dofnc(fnc,sym,*av);
    }
    
    int
    main(int argc,char **argv)
    {
        int opt_f = 0;
    
        --argc;
        ++argv;
    
        for (;  argc > 0;  --argc, ++argv) {
            char *cp = *argv;
            if (*cp != '-')
                break;
    
            cp += 2;
            switch (cp[-1]) {
            case 'f':
                opt_f = (*cp != 0) ? atoi(cp) : 0;
                break;
            }
        }
    
        switch (opt_f) {
        case 1:
            dotest(fix1,"fix1",argv);
            break;
        case 2:
            dotest(fix2,"fix2",argv);
            break;
        case 3:
            dotest(fix3,"fix3",argv);
            break;
        default:
            dotest(orig,"orig",argv);
            break;
        }
    
        return 0;
    }
    
    #if 0
    void
    set_thread_affinity(pthread_t thread, int cpumask)
    {
        cpu_set_t cpuset;
    
        CPU_ZERO(&cpuset);
    
        for (int i = 0; i < 3; ++i) {
            if (cpumask & (1 << i)) {
                CPU_SET(i, &cpuset);
            }
        }
    
        int result = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    
        if (result != 0) {
            perror("pthread_setaffinity_np");
        }
    }
    #endif
    

    Here is the program output for ./mask -f0 0 1 2 3 4, which is your original code:

    orig: argv='0' 00000000 0 --
    orig: argv='1' 00000001 1 -- SET(0)
    orig: argv='2' 00000002 2 -- SET(1)
    orig: argv='3' 00000003 3 -- SET(0) SET(1)
    orig: argv='4' 00000004 4 -- SET(2)
    

    Note that an arg of 0 should set the affinity to cpu 0. But, it sets nothing. The affinity mask is all zeroes!

    This ... might ... be ... bad ...

    As you have it, the number isn't a "mask" of CPUs but a CPU "number" to assign a thread to. That is, we can only assign a thread to one CPU and not a subset (i.e. a true mask).

    And, your code doesn't do that.

    Here is the output of ./mask -f1 0 1 2 3 4 (which does set a single CPU):

    fix1: argv='0' 00000000 0 -- SET(0)
    fix1: argv='1' 00000001 1 -- SET(1)
    fix1: argv='2' 00000002 2 -- SET(2)
    fix1: argv='3' 00000003 3 -- SET(3)
    fix1: argv='4' 00000004 4 -- SET(4)
    

    An improvement is to use a hexadecimal mask. Here is the output of ./mask -f2 0x0003 0x000C 0x0300 0x0C00:

    fix2: argv='0x0003' 00000003 3 -- SET(0) SET(1)
    fix2: argv='0x000C' 0000000C 12 -- SET(2) SET(3)
    fix2: argv='0x0300' 00000300 768 -- SET(8) SET(9)
    fix2: argv='0x0C00' 00000C00 3072 -- SET(10) SET(11)
    

    But, specifying a cpu mask in hex in this way is cumbersome. It might be easier if we can give a list of CPUs to assign. Here is the output of: ./mask -f3 0,1 2,3 4,5 6,7:

    fix3: argv='0,1' SET(0) SET(1)
    fix3: argv='2,3' SET(2) SET(3)
    fix3: argv='4,5' SET(4) SET(5)
    fix3: argv='6,7' SET(6) SET(7)
    

    I might change the organization of the arguments. It's hard to match up the CPU's affinity with its accelerator id. That is:

    0,1 2,3 4,5 6,7 0 1 2 3
    

    It's a bit of a stretch to see that thread 3 should operate on CPUs 6 and 7 and that it should be run on accelerator 3. Maybe we need a syntax like:

    0,1@0 2,3@1 4,5@2 6,7@3
    

    Or, a config file might be better:

    [config_1]
        thread 0 cpu=0,1 accel=0
        thread 1 cpu=2,3 accel=1
        thread 2 cpu=4,5 accel=2
        thread 3 cpu=6,7 accel=3
    
    [config_2]
        thread 0 cpu=0,1 accel=3
        thread 1 cpu=2,3 accel=2
        thread 2 cpu=4,5 accel=1
        thread 3 cpu=6,7 accel=0