The code below measures the dispatch overhead of an empty CUDA kernel (i.e., the kernel enqueue rate) from multiple host threads launched with std::async.
#include <stdio.h>
#include <stddef.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
#include <string>
#include <thread>
#include <future>
#include <functional>
#include <vector>

#define WARMUP_RUN_COUNT 10
#define TIMING_RUN_COUNT 100
#define TOTAL_RUN_COUNT (WARMUP_RUN_COUNT + TIMING_RUN_COUNT)
__global__ void EmptyKernel() {}
void print_timing(std::string test, std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1)
{
    float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;

    // Sort in descending order and skip the first WARMUP_RUN_COUNT entries,
    // which removes the top outliers caused by variability across a large
    // number of multi-threaded runs.
    std::sort(results.begin(), results.end(), std::greater<float>());
    auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
    auto end_iter = results.end();

    // mean
    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
        total_us += (run_ms * 1000) / batch;
    });
    mean_us = total_us / TIMING_RUN_COUNT;

    // standard deviation
    total_us = 0;
    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
        float dev_us = ((run_ms * 1000) / batch) - mean_us;
        total_us += dev_us * dev_us;
    });
    stddev_us = std::sqrt(total_us / TIMING_RUN_COUNT);

    printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
}
void kernel_enqueue_rate(std::atomic_int* shared, int max_threads)
{
    // per-thread resources
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    std::array<float, TOTAL_RUN_COUNT> results;

    // barrier: wait until all threads have arrived before timing starts
    int tid = shared->fetch_add(1, std::memory_order_release);
    while (max_threads != shared->load(std::memory_order_acquire)) {}

    // time only the enqueue (launch) cost; no synchronization inside the loop
    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
        auto start = std::chrono::high_resolution_clock::now();
        EmptyKernel<<<1, 1, 0, stream>>>();
        auto stop = std::chrono::high_resolution_clock::now();
        results[i] = std::chrono::duration<double, std::milli>(stop - start).count();
    }
    cudaStreamDestroy(stream);

    print_timing("Thread ID : " + std::to_string(tid) + " , " + "Kernel enqueue rate", results);
}
// Simple thread pool built on std::async
struct thread_pool {
    thread_pool(int total_threads) : max_threads(total_threads) {}

    void start(std::function<void(std::atomic_int*, int)> f) {
        for (int i = 0; i < max_threads; ++i) {
            threads.push_back(std::async(std::launch::async, f, &shared, max_threads));
        }
    }

    void finish() {
        for (auto&& thread : threads) {
            thread.get();
        }
        threads.clear();
        shared = 0;
    }

    ~thread_pool() {
        finish();
    }

private:
    std::atomic_int shared {0};
    std::vector<std::future<void>> threads;
    int max_threads = 1;
};
int main(int argc, char* argv[])
{
    int max_threads = 4;
    thread_pool task(max_threads);
    task.start(kernel_enqueue_rate);
    task.finish();
    return 0;
}
The observation is that some threads take much longer than the others; for example, in the run below, two threads take roughly 6 us while the other two take close to or more than 10 us.
Thread ID : 0 , Kernel enqueue rate: 9.5 us, std: 9.3 us
Thread ID : 2 , Kernel enqueue rate: 5.7 us, std: 2.9 us
Thread ID : 1 , Kernel enqueue rate: 11.7 us, std: 7.3 us
Thread ID : 3 , Kernel enqueue rate: 6.0 us, std: 2.1 us
What is the reason for this significant difference, and is there a way to avoid it and get similar results on all the threads?
What is the reason for this significant difference?
The threads migrate across cores and contend for CPU time with each other and with other processes. These interference effects are not uniform across threads, so some threads pay a noticeably higher enqueue cost than others.
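On Linux, one way to confirm the migration is to sample sched_getcpu() inside the timing loop and count how often each thread changes cores. The helper below is a minimal sketch of that idea; the name check_migration and the thread-local counters are illustrative, not part of the original benchmark (sched_getcpu is a glibc extension; g++ defines _GNU_SOURCE by default).

#include <sched.h>  // sched_getcpu

// Counts how often the scheduler moves the calling thread between cores.
// Sketch: call check_migration() once per iteration of the timing loop,
// then print `migrations` alongside the per-thread timing results.
static thread_local int last_cpu = -1;
static thread_local int migrations = 0;

inline void check_migration() {
    int cpu = sched_getcpu();           // core this thread is on right now
    if (last_cpu != -1 && cpu != last_cpu)
        ++migrations;                   // moved since the previous sample
    last_cpu = cpu;
}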
Is there a way to avoid this and get similar results on all the threads?
Yes: pin each thread to its own core. This can be done from outside the process with taskset, or programmatically by setting each thread's CPU affinity, as in the sketch below.
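From the shell, the whole process can be restricted to a fixed set of cores, e.g. taskset -c 0-3 ./a.out. To pin each thread to a dedicated core programmatically on Linux, pthread_setaffinity_np can be used. The helper below is a minimal sketch (pin_to_core is an illustrative name); the idea is to call pin_to_core(tid) right after tid is assigned in kernel_enqueue_rate, which assumes tid is smaller than the number of hardware cores.

#include <pthread.h>  // pthread_setaffinity_np, pthread_self
#include <sched.h>    // cpu_set_t, CPU_ZERO, CPU_SET

// Pin the calling thread to a single core; returns true on success.
bool pin_to_core(int core_id) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);
    return pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) == 0;
}

With each thread on its own core, and those cores otherwise idle, the per-thread enqueue times should converge.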