The code below measures the dispatch overhead of an empty CUDA kernel (i.e., the kernel enqueue rate) from multiple host threads launched with std::async.
#include <stdio.h>
#include <stddef.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
#include <string>
#include <thread>
#include <future>
#include <functional>
#include <vector>

#define WARMUP_RUN_COUNT 10
#define TIMING_RUN_COUNT 100
#define TOTAL_RUN_COUNT (WARMUP_RUN_COUNT + TIMING_RUN_COUNT)
__global__ void EmptyKernel() {}
void print_timing(std::string test, std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1)
{
    float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;

    // Sort in descending order and skip the first WARMUP_RUN_COUNT entries,
    // which removes the top outliers caused by variability across a large
    // number of multi-threaded runs.
    std::sort(results.begin(), results.end(), std::greater<float>());
    auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
    auto end_iter = results.end();

    // mean
    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
        total_us += (run_ms * 1000) / batch;
    });
    mean_us = total_us / TIMING_RUN_COUNT;

    // standard deviation
    total_us = 0;
    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
        float dev_us = ((run_ms * 1000) / batch) - mean_us;
        total_us += dev_us * dev_us;
    });
    stddev_us = std::sqrt(total_us / TIMING_RUN_COUNT);

    printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
}
void kernel_enqueue_rate(std::atomic_int* shared, int max_threads)
{
    // per-thread resources
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    std::array<float, TOTAL_RUN_COUNT> results;

    // barrier: wait until all threads have arrived before timing starts
    int tid = shared->fetch_add(1, std::memory_order_release);
    while (max_threads != shared->load(std::memory_order_acquire)) {}

    // time only the enqueue (launch) cost; no synchronization inside the loop
    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
        auto start = std::chrono::high_resolution_clock::now();
        EmptyKernel<<<1, 1, 0, stream>>>();
        auto stop = std::chrono::high_resolution_clock::now();
        results[i] = std::chrono::duration<double, std::milli>(stop - start).count();
    }
    cudaStreamDestroy(stream);

    print_timing("Thread ID : " + std::to_string(tid) + " , " + "Kernel enqueue rate", results);
}
// Simple thread pool built on std::async
struct thread_pool {
    thread_pool(int total_threads) : max_threads(total_threads) {}

    void start(std::function<void(std::atomic_int*, int)> f) {
        for (int i = 0; i < max_threads; ++i) {
            threads.push_back(std::async(std::launch::async, f, &shared, max_threads));
        }
    }

    void finish() {
        for (auto&& thread : threads) {
            thread.get();
        }
        threads.clear();
        shared = 0;
    }

    ~thread_pool() {
        finish();
    }

private:
    std::atomic_int shared {0};
    std::vector<std::future<void>> threads;
    int max_threads = 1;
};
int main(int argc, char* argv[])
{
    int max_threads = 4;
    thread_pool task(max_threads);
    task.start(kernel_enqueue_rate);
    task.finish();
    return 0;
}
The observation is that some threads take much longer than the others; for example, in the run below, two threads take roughly 6 us while the other two take close to or more than 10 us.
Thread ID : 0 , Kernel enqueue rate: 9.5 us, std: 9.3 us
Thread ID : 2 , Kernel enqueue rate: 5.7 us, std: 2.9 us
Thread ID : 1 , Kernel enqueue rate: 11.7 us, std: 7.3 us
Thread ID : 3 , Kernel enqueue rate: 6.0 us, std: 2.1 us
What is the reason for this significant difference, and is there a way to avoid it and get similar results on all the threads?
What is the reason for this significant difference?
The threads migrate across cores and contend for CPU time with each other and with other processes. These interference effects are not uniform across threads, so some threads pay a noticeably higher enqueue cost than others.
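On Linux, one way to confirm the migration is to sample sched_getcpu() inside the timing loop and count how often each thread changes cores. The helper below is a minimal sketch of that idea; the name check_migration and the thread-local counters are illustrative, not part of the original benchmark (sched_getcpu is a glibc extension; g++ defines _GNU_SOURCE by default).

#include <sched.h>  // sched_getcpu

// Counts how often the scheduler moves the calling thread between cores.
// Sketch: call check_migration() once per iteration of the timing loop,
// then print `migrations` alongside the per-thread timing results.
static thread_local int last_cpu = -1;
static thread_local int migrations = 0;

inline void check_migration() {
    int cpu = sched_getcpu();           // core this thread is on right now
    if (last_cpu != -1 && cpu != last_cpu)
        ++migrations;                   // moved since the previous sample
    last_cpu = cpu;
}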
Is there a way to avoid this and get similar results on all the threads?
Yes: pin each thread to its own core. This can be done from outside the process with taskset, or programmatically by setting each thread's CPU affinity, as in the sketch below.
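From the shell, the whole process can be restricted to a fixed set of cores, e.g. taskset -c 0-3 ./a.out. To pin each thread to a dedicated core programmatically on Linux, pthread_setaffinity_np can be used. The helper below is a minimal sketch (pin_to_core is an illustrative name); the idea is to call pin_to_core(tid) right after tid is assigned in kernel_enqueue_rate, which assumes tid is smaller than the number of hardware cores.

#include <pthread.h>  // pthread_setaffinity_np, pthread_self
#include <sched.h>    // cpu_set_t, CPU_ZERO, CPU_SET

// Pin the calling thread to a single core; returns true on success.
bool pin_to_core(int core_id) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);
    return pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) == 0;
}

With each thread on its own core, and those cores otherwise idle, the per-thread enqueue times should converge.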