Benchmark Code:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Mono.Simd;
using MathNet.Numerics.LinearAlgebra.Single;
namespace XXX {
/// <summary>
/// Extension helpers for <see cref="TimeSpan"/>.
/// </summary>
public static class TimeSpanExtensions {
    // A TimeSpan tick is 100 ns (there are 10,000 ticks per millisecond).
    private const double NanosecondsPerTick = 100.0;

    /// <summary>
    /// Returns the length of <paramref name="timeSpan"/> expressed in nanoseconds.
    /// </summary>
    /// <param name="timeSpan">The interval to convert.</param>
    /// <returns>The interval in nanoseconds; always a multiple of 100 for realistic tick counts.</returns>
    /// <remarks>
    /// The value is derived directly from <see cref="TimeSpan.Ticks"/> rather than from
    /// <see cref="TimeSpan.TotalMilliseconds"/>, which avoids an extra floating-point
    /// division/multiplication round trip and the rounding noise it can introduce.
    /// </remarks>
    public static double TotalNanoseconds(this TimeSpan timeSpan) {
        return timeSpan.Ticks * NanosecondsPerTick;
    }
}
/// <summary>
/// Benchmark whose timed operation is a single addition of two
/// <c>Vector4f</c> values (presumably the Mono.Simd type imported by this file).
/// </summary>
public sealed class SimdBenchmark : Benchmark {
    // Fixed operands; both hold the components (1, 2, 3, 4).
    private Vector4f _left = new Vector4f(1.0f, 2.0f, 3.0f, 4.0f);
    private Vector4f _right = new Vector4f(1.0f, 2.0f, 3.0f, 4.0f);

    // Destination field for the sum computed in Do().
    private Vector4f _sum;

    /// <summary>
    /// Adds the two operand vectors and stores the result in a field.
    /// </summary>
    public override void Do() {
        _sum = _left + _right;
    }
}
/// <summary>
/// Benchmark whose timed operation is a single addition of two
/// four-element <c>DenseVector</c> instances.
/// </summary>
public sealed class MathNetBenchmark : Benchmark {
    // Fixed operands; both are built from the array {1, 2, 3, 4}.
    private DenseVector _left = new DenseVector(new float[] { 1.0f, 2.0f, 3.0f, 4.0f });
    private DenseVector _right = new DenseVector(new float[] { 1.0f, 2.0f, 3.0f, 4.0f });

    // Destination field for the sum computed in Do().
    private DenseVector _sum;

    /// <summary>
    /// Adds the two operand vectors and stores the result in a field.
    /// </summary>
    public override void Do() {
        _sum = _left + _right;
    }
}
/// <summary>
/// Benchmark whose timed operation is a single addition of two values of
/// the hand-written <see cref="Vector4"/> struct declared in this file.
/// </summary>
public sealed class DefaultBenchmark : Benchmark {
    // Fixed operands; both hold the components (1, 2, 3, 4).
    private Vector4 _left = new Vector4(1.0f, 2.0f, 3.0f, 4.0f);
    private Vector4 _right = new Vector4(1.0f, 2.0f, 3.0f, 4.0f);

    // Destination field for the sum computed in Do().
    private Vector4 _sum;

    /// <summary>
    /// Adds the two operand vectors and stores the result in a field.
    /// </summary>
    public override void Do() {
        _sum = _left + _right;
    }
}
/// <summary>
/// Benchmark whose timed operation is a single scalar <c>float</c> addition.
/// </summary>
public sealed class SimpleBenchmark : Benchmark {
    // Fixed scalar operands.
    private float _left = 1.0f;
    private float _right = 2.0f;

    // Destination field for the sum computed in Do().
    private float _sum;

    /// <summary>
    /// Adds the two scalar operands and stores the result in a field.
    /// </summary>
    public override void Do() {
        _sum = _left + _right;
    }
}
/// <summary>
/// Benchmark whose timed operation is an arbitrary caller-supplied delegate.
/// </summary>
public sealed class DelegateBenchmark : Benchmark {
    private readonly Action _action;

    /// <summary>
    /// Creates a benchmark that invokes <paramref name="action"/> on every <see cref="Do"/> call.
    /// </summary>
    /// <param name="action">The operation to time; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
    public DelegateBenchmark(Action action) {
        // Fail at construction time instead of with a NullReferenceException
        // in the middle of a measurement.
        if (action == null) {
            throw new ArgumentNullException("action");
        }
        _action = action;
    }

    /// <summary>
    /// Invokes the delegate passed to the constructor.
    /// </summary>
    public override void Do() {
        _action();
    }
}
/// <summary>
/// Base class for micro-benchmarks. Enumerating an instance produces an
/// endless sequence of timings, one per call to <see cref="Do"/>; callers
/// must bound the sequence themselves (for example with <c>Take</c>).
/// </summary>
public abstract class Benchmark : IEnumerable<TimeSpan> {
    /// <summary>
    /// The operation being measured; implemented by each concrete benchmark.
    /// </summary>
    public abstract void Do();

    /// <summary>
    /// Lazily yields the elapsed time of successive <see cref="Do"/> calls.
    /// Each enumeration starts with one untimed warm-up call followed by a
    /// garbage collection, then loops forever timing one call per element.
    /// </summary>
    /// <returns>An enumerator over an unbounded stream of measurements.</returns>
    public IEnumerator<TimeSpan> GetEnumerator() {
        // Untimed warm-up call.
        Do();

        // Collect garbage and let pending finalizers finish before timing starts.
        GC.Collect();
        GC.WaitForPendingFinalizers();

        var timer = new Stopwatch();
        for (;;) {
            timer.Reset();
            timer.Start();
            Do();
            timer.Stop();

            TimeSpan elapsed = timer.Elapsed;
            yield return elapsed;
        }
    }

    /// <summary>
    /// Non-generic enumerator; forwards to the generic implementation.
    /// </summary>
    IEnumerator IEnumerable.GetEnumerator() {
        return GetEnumerator();
    }
}
/// <summary>
/// Plain four-component single-precision vector with component-wise addition.
/// </summary>
public struct Vector4 {
    private float x;
    private float y;
    private float z;
    private float w;

    /// <summary>
    /// Creates a vector from its four components.
    /// </summary>
    /// <param name="x">First component.</param>
    /// <param name="y">Second component.</param>
    /// <param name="z">Third component.</param>
    /// <param name="w">Fourth component.</param>
    public Vector4(float x, float y, float z, float w) {
        this.x = x;
        this.y = y;
        this.z = z;
        this.w = w;
    }

    /// <summary>
    /// Adds two vectors component by component.
    /// </summary>
    /// <param name="v1">Left operand.</param>
    /// <param name="v2">Right operand.</param>
    /// <returns>A new vector holding the four component sums.</returns>
    public static Vector4 operator +(Vector4 v1, Vector4 v2) {
        return new Vector4(
            v1.x + v2.x,
            v1.y + v2.y,
            v1.z + v2.z,
            v1.w + v2.w);
    }
}
/// <summary>
/// Console entry point: runs each benchmark and prints its average time per sample.
/// </summary>
class MainClass {
    // Number of timed samples taken from each benchmark before averaging.
    private const int SampleCount = 1000;

    /// <summary>
    /// Measures the four benchmarks in turn, then prints one "&lt;average&gt; ns" line per
    /// benchmark in the order SIMD, scalar, hand-written struct, Math.NET.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args) {
        double simdAverage = AverageNanoseconds(new SimdBenchmark());
        double scalarAverage = AverageNanoseconds(new SimpleBenchmark());
        double structAverage = AverageNanoseconds(new DefaultBenchmark());
        double mathNetAverage = AverageNanoseconds(new MathNetBenchmark());

        Console.WriteLine(simdAverage + " ns");
        Console.WriteLine(scalarAverage + " ns");
        Console.WriteLine(structAverage + " ns");
        Console.WriteLine(mathNetAverage + " ns");
    }

    // Takes SampleCount measurements from the benchmark and returns their mean in nanoseconds.
    private static double AverageNanoseconds(Benchmark benchmark) {
        return benchmark
            .Take(SampleCount)
            .Average(sample => sample.TotalNanoseconds());
    }
}
}
Environment Setup:
Windows 7 / Mono 2.10.8 / MonoDevelop 2.8.5
MonoDevelop Setup:
Results:
Well, I've managed to modify my benchmark code to make it more robust and completely unbiased. In other words:
First, as we discussed with Nicholas, measuring a single operation might give distorted results. Moreover, the frequency of Stopwatch is 10 million, which means that ticks occur every 100 ns. Considering this fact, the previous results look rather bizarre. Therefore, in order to mitigate this issue, I decided to test 1000 operations at a time rather than 1.
Second, I'm not completely sure, but I guess that the previous benchmark implementation was subject to intensive caching, since on every iteration the sums were computed between the same vectors (their components never changed). The only straightforward solution I see is to simply rebuild the vectors with random components before every test.
The respective benchmark implementation is:
/// <summary>
/// Extension helpers for <see cref="TimeSpan"/>.
/// </summary>
public static class TimeSpanExtensions {
    // A TimeSpan tick is 100 ns (there are 10,000 ticks per millisecond).
    private const double NanosecondsPerTick = 100.0;

    /// <summary>
    /// Returns the length of <paramref name="timeSpan"/> expressed in nanoseconds.
    /// </summary>
    /// <param name="timeSpan">The interval to convert.</param>
    /// <returns>The interval in nanoseconds; always a multiple of 100 for realistic tick counts.</returns>
    /// <remarks>
    /// The value is derived directly from <see cref="TimeSpan.Ticks"/> rather than from
    /// <see cref="TimeSpan.TotalMilliseconds"/>, which avoids an extra floating-point
    /// division/multiplication round trip and the rounding noise it can introduce.
    /// </remarks>
    public static double TotalNanoseconds(this TimeSpan timeSpan) {
        return timeSpan.Ticks * NanosecondsPerTick;
    }
}
/// <summary>
/// Single-precision convenience helpers for <see cref="Random"/>.
/// </summary>
public static class RandomExtensions {
    // Largest float strictly below 1.0f (1 - 2^-24).
    private const float LargestFloatBelowOne = 0.99999994f;

    /// <summary>
    /// Returns a random <c>float</c> in the half-open range [0, 1).
    /// </summary>
    /// <param name="random">The generator to draw from.</param>
    /// <returns>A value that is at least 0 and strictly less than 1.</returns>
    public static float NextFloat(this Random random) {
        float sample = (float)random.NextDouble();

        // A double just below 1.0 rounds up to exactly 1.0f when narrowed to float;
        // clamp so the result keeps the same half-open range as NextDouble().
        return sample < 1.0f ? sample : LargestFloatBelowOne;
    }

    /// <summary>
    /// Returns a random <c>float</c> obtained by scaling a [0, 1) sample onto the
    /// interval starting at <paramref name="min"/> and ending at <paramref name="max"/>.
    /// </summary>
    /// <param name="random">The generator to draw from.</param>
    /// <param name="min">Lower end of the interval.</param>
    /// <param name="max">Upper end of the interval.</param>
    /// <returns>
    /// <c>sample * (max - min) + min</c>. Floating-point rounding of that expression
    /// may occasionally produce <paramref name="max"/> itself.
    /// </returns>
    public static float NextFloat(this Random random, float min, float max) {
        return random.NextFloat() * (max - min) + min;
    }
}
/// <summary>
/// Benchmark that times <c>Count</c> additions of <c>Vector4f</c> pairs
/// (presumably the Mono.Simd type), with the inputs re-randomized before
/// every measurement.
/// </summary>
public sealed class SimdBenchmark : Benchmark {
    // Number of vector pairs added per Do() call.
    private const int Count = 1000;

    // One generator for the lifetime of the benchmark. Creating a new time-seeded
    // Random in every Begin() can hand back the same sequence when calls happen in
    // quick succession (on runtimes whose default seed comes from the system clock),
    // which would defeat the purpose of refreshing the inputs.
    private readonly Random _random = new Random();

    private readonly Vector4f[] a = new Vector4f[Count];
    private readonly Vector4f[] b = new Vector4f[Count];
    private readonly Vector4f[] c = new Vector4f[Count];

    /// <summary>
    /// Fills both input arrays with vectors built from fresh random components.
    /// </summary>
    public override void Begin() {
        for (int i = 0; i < Count; ++i) {
            a[i] = new Vector4f(_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat());
            b[i] = new Vector4f(_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat());
        }
    }

    /// <summary>
    /// The timed operation: element-wise <c>c[i] = a[i] + b[i]</c> over all pairs.
    /// </summary>
    public override void Do() {
        for (int i = 0; i < Count; ++i)
            c[i] = a[i] + b[i];
    }

    /// <summary>
    /// No teardown is required.
    /// </summary>
    public override void End() {
    }
}
/// <summary>
/// Benchmark that times <c>Count</c> additions of four-element
/// <c>DenseVector</c> pairs, with the inputs re-randomized before every
/// measurement.
/// </summary>
public sealed class MathNetBenchmark : Benchmark {
    // Number of vector pairs added per Do() call.
    private const int Count = 1000;

    // One generator for the lifetime of the benchmark. Creating a new time-seeded
    // Random in every Begin() can hand back the same sequence when calls happen in
    // quick succession (on runtimes whose default seed comes from the system clock),
    // which would defeat the purpose of refreshing the inputs.
    private readonly Random _random = new Random();

    private readonly DenseVector[] a = new DenseVector[Count];
    private readonly DenseVector[] b = new DenseVector[Count];
    private readonly DenseVector[] c = new DenseVector[Count];

    /// <summary>
    /// Replaces every input vector with a new one built from fresh random components.
    /// </summary>
    public override void Begin() {
        for (int i = 0; i < Count; ++i) {
            a[i] = new DenseVector(new float[] { _random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat() });
            b[i] = new DenseVector(new float[] { _random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat() });
        }
    }

    /// <summary>
    /// The timed operation: element-wise <c>c[i] = a[i] + b[i]</c> over all pairs.
    /// </summary>
    public override void Do() {
        for (int i = 0; i < Count; ++i)
            c[i] = a[i] + b[i];
    }

    /// <summary>
    /// No teardown is required.
    /// </summary>
    public override void End() {
    }
}
/// <summary>
/// Benchmark that times <c>Count</c> additions of pairs of the hand-written
/// <see cref="Vector4"/> struct, with the inputs re-randomized before every
/// measurement.
/// </summary>
public sealed class DefaultBenchmark : Benchmark {
    // Number of vector pairs added per Do() call.
    private const int Count = 1000;

    // One generator for the lifetime of the benchmark. Creating a new time-seeded
    // Random in every Begin() can hand back the same sequence when calls happen in
    // quick succession (on runtimes whose default seed comes from the system clock),
    // which would defeat the purpose of refreshing the inputs.
    private readonly Random _random = new Random();

    private readonly Vector4[] a = new Vector4[Count];
    private readonly Vector4[] b = new Vector4[Count];
    private readonly Vector4[] c = new Vector4[Count];

    /// <summary>
    /// Fills both input arrays with vectors built from fresh random components.
    /// </summary>
    public override void Begin() {
        for (int i = 0; i < Count; ++i) {
            a[i] = new Vector4(_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat());
            b[i] = new Vector4(_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat());
        }
    }

    /// <summary>
    /// The timed operation: element-wise <c>c[i] = a[i] + b[i]</c> over all pairs.
    /// </summary>
    public override void Do() {
        for (int i = 0; i < Count; ++i)
            c[i] = a[i] + b[i];
    }

    /// <summary>
    /// No teardown is required.
    /// </summary>
    public override void End() {
    }
}
/// <summary>
/// Benchmark that times <c>Count</c> scalar <c>float</c> additions, with the
/// inputs re-randomized before every measurement.
/// </summary>
public sealed class SimpleBenchmark : Benchmark {
    // Number of scalar pairs added per Do() call.
    private const int Count = 1000;

    // One generator for the lifetime of the benchmark. Creating a new time-seeded
    // Random in every Begin() can hand back the same sequence when calls happen in
    // quick succession (on runtimes whose default seed comes from the system clock),
    // which would defeat the purpose of refreshing the inputs.
    private readonly Random _random = new Random();

    private readonly float[] a = new float[Count];
    private readonly float[] b = new float[Count];
    private readonly float[] c = new float[Count];

    /// <summary>
    /// Fills both input arrays with fresh random values.
    /// </summary>
    public override void Begin() {
        for (int i = 0; i < Count; ++i) {
            a[i] = _random.NextFloat();
            b[i] = _random.NextFloat();
        }
    }

    /// <summary>
    /// The timed operation: element-wise <c>c[i] = a[i] + b[i]</c> over all pairs.
    /// </summary>
    public override void Do() {
        for (int i = 0; i < Count; ++i)
            c[i] = a[i] + b[i];
    }

    /// <summary>
    /// No teardown is required.
    /// </summary>
    public override void End() {
    }
}
/// <summary>
/// Benchmark whose timed operation is an arbitrary caller-supplied delegate.
/// It performs no per-iteration setup or teardown.
/// </summary>
public sealed class DelegateBenchmark : Benchmark {
    private readonly Action _action;

    /// <summary>
    /// Creates a benchmark that invokes <paramref name="action"/> on every <see cref="Do"/> call.
    /// </summary>
    /// <param name="action">The operation to time; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
    public DelegateBenchmark(Action action) {
        // Fail at construction time instead of with a NullReferenceException
        // in the middle of a measurement.
        if (action == null) {
            throw new ArgumentNullException("action");
        }
        _action = action;
    }

    /// <summary>
    /// No setup is required.
    /// </summary>
    public override void Begin() {
    }

    /// <summary>
    /// Invokes the delegate passed to the constructor.
    /// </summary>
    public override void Do() {
        _action();
    }

    /// <summary>
    /// No teardown is required.
    /// </summary>
    public override void End() {
    }
}
/// <summary>
/// Base class for micro-benchmarks with per-iteration setup and teardown.
/// Enumerating an instance produces an endless sequence of timings, one per
/// call to <see cref="Do"/>; callers must bound the sequence themselves
/// (for example with <c>Take</c>).
/// </summary>
public abstract class Benchmark : IEnumerable<TimeSpan> {
    /// <summary>
    /// Lazily yields the elapsed time of successive <see cref="Do"/> calls.
    /// Each enumeration starts with one untimed warm-up pass. Every pass is
    /// bracketed by <see cref="Begin"/> and <see cref="End"/>; only the
    /// <see cref="Do"/> call itself is inside the timed region.
    /// </summary>
    /// <returns>An enumerator over an unbounded stream of measurements.</returns>
    public IEnumerator<TimeSpan> GetEnumerator() {
        // Untimed warm-up pass.
        Begin();
        try {
            Do();
        }
        finally {
            // Keep Begin/End paired even when Do() throws.
            End();
        }

        var stopwatch = new Stopwatch();
        while (true) {
            Begin();
            try {
                // Collect garbage and let pending finalizers finish before timing.
                GC.Collect();
                GC.WaitForPendingFinalizers();

                stopwatch.Reset();
                stopwatch.Start();
                Do();
                stopwatch.Stop();
            }
            finally {
                // Keep Begin/End paired even when the measured pass throws.
                End();
            }
            yield return stopwatch.Elapsed;
        }
    }

    /// <summary>
    /// Non-generic enumerator; forwards to the generic implementation.
    /// </summary>
    IEnumerator IEnumerable.GetEnumerator() {
        return GetEnumerator();
    }

    /// <summary>
    /// Untimed setup, called before every <see cref="Do"/> call (including the warm-up).
    /// </summary>
    public abstract void Begin();

    /// <summary>
    /// The operation being measured.
    /// </summary>
    public abstract void Do();

    /// <summary>
    /// Untimed teardown, called after every <see cref="Do"/> call (including the warm-up).
    /// </summary>
    public abstract void End();
}
/// <summary>
/// Plain four-component single-precision vector with component-wise addition.
/// </summary>
public struct Vector4 {
    private float x;
    private float y;
    private float z;
    private float w;

    /// <summary>
    /// Creates a vector from its four components.
    /// </summary>
    /// <param name="x">First component.</param>
    /// <param name="y">Second component.</param>
    /// <param name="z">Third component.</param>
    /// <param name="w">Fourth component.</param>
    public Vector4(float x, float y, float z, float w) {
        this.x = x;
        this.y = y;
        this.z = z;
        this.w = w;
    }

    /// <summary>
    /// Adds two vectors component by component.
    /// </summary>
    /// <param name="v1">Left operand.</param>
    /// <param name="v2">Right operand.</param>
    /// <returns>A new vector holding the four component sums.</returns>
    public static Vector4 operator +(Vector4 v1, Vector4 v2) {
        return new Vector4(
            v1.x + v2.x,
            v1.y + v2.y,
            v1.z + v2.z,
            v1.w + v2.w);
    }
}
/// <summary>
/// Console entry point: runs each benchmark and prints its average time per sample.
/// </summary>
class MainClass {
    // Number of timed samples taken from each benchmark before averaging.
    private const int SampleCount = 1000;

    /// <summary>
    /// Measures the four benchmarks in turn, then prints one "&lt;average&gt; ns" line per
    /// benchmark in the order SIMD, scalar, hand-written struct, Math.NET.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args) {
        double simdAverage = AverageNanoseconds(new SimdBenchmark());
        double scalarAverage = AverageNanoseconds(new SimpleBenchmark());
        double structAverage = AverageNanoseconds(new DefaultBenchmark());
        double mathNetAverage = AverageNanoseconds(new MathNetBenchmark());

        Console.WriteLine(simdAverage + " ns");
        Console.WriteLine(scalarAverage + " ns");
        Console.WriteLine(structAverage + " ns");
        Console.WriteLine(mathNetAverage + " ns");
    }

    // Takes SampleCount measurements from the benchmark and returns their mean in nanoseconds.
    private static double AverageNanoseconds(Benchmark benchmark) {
        return benchmark
            .Take(SampleCount)
            .Average(sample => sample.TotalNanoseconds());
    }
}
Results:
I think it confirms that SIMD is actually in effect, because SimdBenchmark is getting close to SimpleBenchmark (as intended by SIMD technology) and is much better than DefaultBenchmark (again, as implied by SIMD technology).
Moreover, the results seem consistent with konrad.kruczynski's, because the ratio between SimdBenchmark (3203.9) and DefaultBenchmark (20138.4) is about 6, and the ratio between simdVector (5802) and usualVector (29598) is also about 6.
Anyway 2 questions still remain: