I'm trying to determine the associativity of my processor. I have an Intel Core i5-2500:
L1 data: 32 KB, 8-way set associative
L1 instruction: 32 KB, 8-way set associative
L2: 256 KB, 8-way set associative
L3: 6 MB, 12-way set associative, shared between all cores
I measure the average access time to an element of an array, in processor ticks. The array is divided into fragments, and in a loop I increase the number of fragments. The distance between two adjacent fragments is equal to the L3 cache size. I access the first elements of all fragments, then the second elements, and so on. Each element contains the index of the next element; the last element contains the index of the first.
It looks like this: [figure: layout of the fragments and the traversal order]
Once the number of fragments exceeds the associativity of the cache, the average access time should increase.
I got the following results: [figure: average access time vs. number of fragments]
The first jump corresponds to the associativity of the TLB, and the second to the associativity of the L1 and L2 caches, but I do not understand why the time does not increase after exceeding the associativity of the L3 cache.
I also tried different sizes and offsets.
Am I doing something wrong, or is there a mistake in my code? Can you please explain it to me? Here is the code:
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#define SIZE 6291456 // 6 MB
#define OFFSET 6291456 // distance between fragments = L3 cache size
#define ROUNDS 200
#define MIN_FRAGMENTS_COUNT 1
#define MAX_FRAGMENTS_COUNT 32
void FreeArray(int* array, int size) {
    assert(size > 0 && "Size must be greater than zero\n");
    if (NULL == array) {
        return;
    }
    for (int i = 0; i < size; ++i) {
        array[i] = 0;
    }
    free(array);
}
int* CreateArray(int size) {
    assert(size > 0 && "Size must be greater than zero\n");
    return calloc(size, sizeof(int));
}
unsigned long long int GetTicksCount(void) {
    unsigned int high = 0;
    unsigned int low = 0;
    // RDTSC is not a serializing instruction, so out-of-order execution can
    // move it relative to the loads; over millions of accesses the error is
    // negligible.
    __asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high));
    return ((unsigned long long int)high << 32) | (unsigned long long int)low;
}
void SetIndexes(int* array, int fragment_size, int offset,
                int fragments_count) {
    assert(NULL != array && "Pointer to array must not be NULL\n");
    assert(fragment_size > 0 && "Fragment size must be greater than zero\n");
    assert(offset > 0 && "Offset must be greater than zero\n");
    assert(fragments_count > 0 && "Fragments count must be greater than zero\n");
    assert(fragment_size <= offset &&
           "Fragment size must not be greater than offset\n");
    int last_fragment = fragments_count - 1;
    int last_element = fragment_size - 1;
    for (int i = 0; i < last_element; ++i) {
        for (int j = 0; j < last_fragment; ++j) {
            array[j * offset + i] = (j + 1) * offset + i; // Same element, next fragment
        }
        array[last_fragment * offset + i] = i + 1; // From the last fragment, go to the next element of the first fragment
    }
    // The last element must also walk through all fragments before the
    // cycle closes; otherwise this column is left zeroed by calloc and the
    // chain jumps back to index 0 too early.
    for (int j = 0; j < last_fragment; ++j) {
        array[j * offset + last_element] = (j + 1) * offset + last_element;
    }
    array[last_fragment * offset + last_element] = 0; // Back to the very first element
}
unsigned long long int CalcAccessTime(int* array, int size) {
    assert(NULL != array && "Pointer to array must not be NULL\n");
    assert(size > 0 && "Size must be greater than zero\n");
    unsigned long long int min_time = ULLONG_MAX;
    int index = 0;
    for (int i = 0; i < ROUNDS; ++i) {
        unsigned long long int start = GetTicksCount();
        for (int j = 0; j < size; ++j) {
            index = array[index]; // Dependent loads: each access must wait for the previous one
        }
        unsigned long long int end = GetTicksCount();
        // Keep `index` alive so the compiler cannot optimize the loop away.
        __asm__ __volatile__("" : : "r"(index));
        unsigned long long int cur_time = (end - start) / size;
        if (cur_time < min_time) { // Keep the fastest round to filter out noise
            min_time = cur_time;
        }
    }
    return min_time;
}
int main(int argc, char** argv) {
    int integers_count = SIZE / sizeof(int);
    int offset_int = OFFSET / sizeof(int);
    for (int i = MIN_FRAGMENTS_COUNT; i <= MAX_FRAGMENTS_COUNT; ++i) {
        int size = i * offset_int;
        int* array = CreateArray(size);
        if (NULL == array) {
            return -1;
        }
        SetIndexes(array, integers_count / i, offset_int, i);
        printf("Fragments: %d\n", i);
        printf("Time: %llu\n", CalcAccessTime(array, integers_count));
        FreeArray(array, size);
    }
    return 0;
}
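Since OFFSET is 6 MB, each access in a round touches a different 4 KiB page, so the TLB is stressed together with the caches. Below is a minimal sketch of backing the array with 2 MiB huge pages instead (Linux-specific; it assumes hugepages have been reserved beforehand, e.g. via vm.nr_hugepages, and CreateArrayHuge/FreeArrayHuge are hypothetical substitutes for CreateArray/FreeArray above):

#include <stddef.h>
#include <sys/mman.h>

// Allocate a zero-filled array backed by 2 MiB huge pages (Linux only).
// mmap with MAP_ANONYMOUS returns zeroed memory, like calloc.
// `bytes` must be a multiple of the huge page size (OFFSET = 6 MB is).
static int* CreateArrayHuge(size_t bytes) {
    void* p = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    return (p == MAP_FAILED) ? NULL : (int*)p;
}

static void FreeArrayHuge(int* array, size_t bytes) {
    if (NULL != array) {
        munmap(array, bytes);
    }
}

With huge pages the fragments share far fewer TLB entries, which should separate the TLB jump from the cache effects.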
Unlike the core caches, the LLC has an additional level of set mapping.
You usually don't want entire regions of memory to map into adjacent sets in the same LLC slice, because a core that is far from that slice would then see horrible access times. Instead, for the common case, you want the data spread out as uniformly as possible. To achieve that, they added a hash function as part of the mapping that determines the slice.
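As an illustration only (the real hash functions are model-specific and were reverse-engineered; see the paper linked at the end), a toy slice hash in the same spirit takes the parity of a made-up subset of high address bits, so two addresses exactly one LLC size apart can still go to different slices:

#include <stdint.h>
#include <stdio.h>

// Toy slice hash: the slice bit is the XOR (parity) of a subset of
// address bits. The mask is invented for illustration; real masks
// differ per CPU model and are applied to physical addresses.
// __builtin_parityll is a GCC/Clang builtin.
static unsigned SliceBit(uint64_t addr, uint64_t mask) {
    return (unsigned)__builtin_parityll(addr & mask);
}

int main(void) {
    const uint64_t kToyMask = (1ULL << 17) | (1ULL << 19) | (1ULL << 21)
                            | (1ULL << 23) | (1ULL << 25) | (1ULL << 27);
    for (int i = 0; i < 4; ++i) {
        uint64_t addr = (uint64_t)i * 6291456ULL; // stride of 6 MB, like OFFSET
        printf("addr %#llx -> slice bit %u\n",
               (unsigned long long)addr, SliceBit(addr, kToyMask));
    }
    return 0;
}

The point is that bits well above the set-index bits take part in slice selection, so a 6 MB stride keeps the set index within a slice but not the slice itself.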
The exception is use cases where you want what's called "sub-NUMA clustering" or "core clustering"; in those cases you can override that distribution through explicit configuration.
The hashing works on higher bits as well, to scatter memory regions better, so skipping by the LLC size won't defeat it. You're still landing on the same set, but in multiple slices, which means you effectively get the associativity multiplied by the number of slices. However, depending on your structure alignment, you won't necessarily get an optimal distribution, so you could start seeing jumps before that associativity level.
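To put numbers on it (assuming the usual one LLC slice per core on client parts, so 4 slices on a quad-core i5-2500): same-set lines can spread over roughly 12 ways × 4 slices = 48 locations, and the test never goes beyond 32 fragments, so no L3 jump is expected.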
Here's a paper with some more details about slice hashing: https://arxiv.org/pdf/1508.03767.pdf