Hey there I am currently experimenting with very low-level C# mainly to learn and bring my skills to the next level for a performance heavy project. I tried to create sth like a NativeList so a List without any automatic garbage collection. I want to use structs and manual pointer allocation to create a memory and cache-line aligned List implementation. This may look familiar for the Unity people out here.
The issue I am facing right now that I get some unexpected results after benchmarking using BenchmarkDotnet. Reads are as expected a bit faster mainly because I use pointers directly (I assume) and array access is pretty fast in C# anyway. Writes are a lot faster ~70% actually which is confusing but may be caused by the memory alignment. Add however is ~5% slower and I can't figure out why.
Please note that there are zero allocations involved in the benchmark because both Lists keep their internal buffer alive. The only part left are the if conditions which are very similar to what List does internally.
Do you have some suggestions to make this even faster or can tell me why my list has slower Adds?
Method | Size | Mean | Error | StdDev | Ratio | RatioSD | CacheMisses/Op |
---|---|---|---|---|---|---|---|
Native_Add | 100 | 934.59 ns | 5.828 ns | 4.866 ns | 1.04 | 0.01 | 2 |
List_Add | 100 | 894.44 ns | 7.063 ns | 6.607 ns | 1.00 | 0.00 | 1 |
Native_Add | 1000 | 9,463.38 ns | 183.577 ns | 162.736 ns | 1.03 | 0.03 | 16 |
List_Add | 1000 | 9,196.11 ns | 87.191 ns | 81.558 ns | 1.00 | 0.00 | 17 |
Native_Add | 10000 | 93,361.52 ns | 876.444 ns | 776.945 ns | 1.05 | 0.01 | 112 |
List_Add | 10000 | 88,501.43 ns | 362.486 ns | 302.692 ns | 1.00 | 0.00 | 53 |
Native_Get | 100 | 41.38 ns | 0.371 ns | 0.347 ns | 0.94 | 0.02 | 0 |
List_Get | 100 | 43.93 ns | 0.849 ns | 0.794 ns | 1.00 | 0.00 | 0 |
Native_Get | 1000 | 366.97 ns | 4.445 ns | 3.940 ns | 0.98 | 0.02 | 0 |
List_Get | 1000 | 374.97 ns | 3.053 ns | 2.550 ns | 1.00 | 0.00 | 0 |
Native_Get | 10000 | 3,674.04 ns | 60.222 ns | 56.331 ns | 0.98 | 0.02 | 5 |
List_Get | 10000 | 3,733.41 ns | 35.220 ns | 32.945 ns | 1.00 | 0.00 | 4 |
Native_Set | 100 | 56.57 ns | 0.357 ns | 0.279 ns | 0.34 | 0.00 | 0 |
List_Set | 100 | 164.29 ns | 0.936 ns | 0.875 ns | 1.00 | 0.00 | 0 |
Native_Set | 1000 | 493.52 ns | 8.668 ns | 8.108 ns | 0.30 | 0.00 | 1 |
List_Set | 1000 | 1,665.81 ns | 10.073 ns | 7.865 ns | 1.00 | 0.00 | 2 |
Native_Set | 10000 | 4,984.44 ns | 94.866 ns | 88.737 ns | 0.30 | 0.01 | 10 |
List_Set | 10000 | 16,699.32 ns | 59.226 ns | 49.456 ns | 1.00 | 0.00 | 18 |
public unsafe struct NativeList<T> : IDisposable where T : unmanaged {
private static readonly int MinCapacity = UnsafeUtils.CacheLineSize / sizeof(T);
private T* _buffer;
private int _capacity;
private int _count;
public int Capacity => _capacity;
public int Count => _count;
public T this[int index] {
get {
if((uint)index >= (uint)_count) throw new ArgumentOutOfRangeException(nameof(index));
return _buffer[index];
}
set {
if((uint)index >= (uint)_count) throw new ArgumentOutOfRangeException(nameof(index));
_buffer[index] = value;
}
}
public NativeList() {
}
public NativeList(int capacity) {
if(capacity != 0) Resize(capacity);
}
public void Add(T value) {
EnsureCapacity(1);
_buffer[_count] = value;
_count++;
}
public void AddRange(T* values, int count) {
if(count == 0) return;
EnsureCapacity(count);
UnsafeUtils.Copy<T>(values, 0, _buffer, _count, count);
_count += count;
}
public bool Remove(T value) {
var index = IndexOf(value);
if(index < 0) return false;
RemoveAt(index);
return true;
}
public void RemoveAt(int index) {
if(index >= _count) throw new ArgumentOutOfRangeException(nameof(index));
_count--;
if(_count != index) {
UnsafeUtils.Copy<T>(_buffer, index + 1, _buffer, index, _count - index);
}
}
public void RemoveRangeAt(int index, int count) {
if(index < 0 || index >= _count) throw new ArgumentOutOfRangeException(nameof(index));
if(count < 0 || index + count > _count) throw new ArgumentOutOfRangeException(nameof(count));
if(count == 0) return;
_count -= count;
if(_count != index) {
UnsafeUtils.Copy<T>(_buffer, index + count, _buffer, index, _count - index);
}
_count -= count;
}
public bool Contains(T value) {
return IndexOf(value, EqualityComparer<T>.Default) >= 0;
}
public bool Contains(T value, IEqualityComparer<T> equalityComparer) {
return IndexOf(value, equalityComparer) >= 0;
}
public int IndexOf(T value) {
return IndexOf(value, EqualityComparer<T>.Default);
}
public int IndexOf(T value, IEqualityComparer<T> equalityComparer) {
for(var i = 0; i < _count; i++) {
if(equalityComparer.Equals(((T*)_buffer)[i], value)) return i;
}
return -1;
}
public void Clear() {
_count = 0;
}
public Span<T> AsSpan() {
return UnsafeUtils.CreateSpan<T>(_buffer, _count);
}
public Enumerator GetEnumerator() {
return new Enumerator(_buffer, _count);
}
private void EnsureCapacity(int count) {
var minCapacity = _count + count;
if(minCapacity > _capacity) Resize(minCapacity);
}
private void Resize(int capacity) {
capacity = Math.Max(MinCapacity, (int)BitOperations.RoundUpToPowerOf2((uint)capacity));
_buffer = (T*)UnsafeUtils.AlignedRealloc<T>(_buffer, capacity);
_capacity = capacity;
}
public void Dispose() {
if(_buffer is null) return;
UnsafeUtils.AlignedFree(_buffer);
_capacity = 0;
_buffer = null;
}
public struct Enumerator : IEnumerator<T> {
private readonly void* _buffer;
private readonly int _count;
private int _index;
public T Current { get; private set; }
object IEnumerator.Current => Current;
public Enumerator(void* buffer, int count) {
_buffer = buffer;
_count = count;
_index = -1;
}
public bool MoveNext() {
if(++_index >= _count) return false;
Current = ((T*)_buffer)[_index];
return true;
}
public void Reset() {
_index = -1;
}
public void Dispose() {}
}
}
[SimpleJob]
[MemoryDiagnoser]
[HardwareCounters(HardwareCounter.CacheMisses)]
[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory, BenchmarkLogicalGroupRule.ByParams)]
public class NativeListBenchmark {
private NativeList<int> _fullNativeList = new();
private List<int> _fullList = new();
private Random _random;
private NativeList<int> _nativeList = new();
private List<int> _list = new();
[Params(100, 1000, 10_000)]
public int Size { get; set; }
[GlobalSetup]
public void Setup() {
_random = new Random(1337);
for(var i = 0; i < Size; i++) {
_fullNativeList.Add(i);
}
for(var i = 0; i < Size; i++) {
_fullList.Add(i);
}
}
[GlobalCleanup]
public void Cleanup() {
_nativeList.Dispose();
_fullNativeList.Dispose();
}
[Benchmark]
[BenchmarkCategory("Add")]
public void Native_Add() {
_nativeList.Clear();
for(var i = 0; i < Size; i++) {
_nativeList.Add(_random.Next());
}
}
[Benchmark]
[BenchmarkCategory("Get")]
public void Native_Get() {
for(var i = 0; i < Size; i++) {
_ = _fullNativeList[i];
}
}
[Benchmark]
[BenchmarkCategory("Set")]
public void Native_Set() {
for(var i = 0; i < Size; i++) {
_fullNativeList[i] = i;
}
}
[Benchmark(Baseline = true)]
[BenchmarkCategory("Add")]
public void List_Add() {
_list.Clear();
for(var i = 0; i < Size; i++) {
_list.Add(_random.Next());
}
}
[Benchmark(Baseline = true)]
[BenchmarkCategory("Get")]
public void List_Get() {
for(var i = 0; i < Size; i++) {
_ = _fullList[i];
}
}
[Benchmark(Baseline = true)]
[BenchmarkCategory("Set")]
public void List_Set() {
for(var i = 0; i < Size; i++) {
_fullList[i] = i;
}
}
}
Judging by the List<T>
source code, you may get a significant speedup by judicious use of AggressiveInlining
and NoInlining
. I don't have specific info on where to use it and where not, but it seems it is being used to inline the common paths, while keeping the uncommon paths in separate functions.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void EnsureCapacity(int count) {
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Add(T value) {
[MethodImpl(MethodImplOptions.NoInlining)]
private void Resize(int capacity) {
Having said all that, I echo what others have said, that you are
List
.using
), buffer overruns and other issues, for little gain.Span<T>
to store the _buffer
, for little performance overhead (with the right placement of bounds checks so the Jitter can elide them). Copy
in particular has very good optimization when using spans, because it unrolls it and uses SIMD instructions where possible.ListPool
, which would be the classic way of improving memory handling in games.UnsafeUtils
does, but there is already functionality for assigning and freeing aligned memory. See also Create Memory Aligned Array in C#.