c#unity-game-engine shader hlsl compute-shader

The difference in the speed of moving objects through the CPU and GPU shader in Unity

I have been testing moving a lot of objects in Unity through normal C# code and through an HLSL compute shader. However, there is no difference in speed: the FPS remains the same. In both cases the Y position is driven by a noise function, but the implementations differ: the C# code uses the standard Mathf.PerlinNoise, while the HLSL uses a custom noise function.

Scenario 1 - Update via C# code only

Object spawn:

[SerializeField]
private GameObject prefab;

/// <summary>
/// Spawns a 50 x 50 grid of <c>prefab</c> instances, one unit apart on X and Z,
/// each at a random height between -1 and 1.
/// </summary>
private void Start()
{
    const int gridSize = 50;

    for (int column = 0; column < gridSize; column++)
    {
        for (int row = 0; row < gridSize; row++)
        {
            // Instantiate first, then draw the random height, as the position is assigned.
            Transform spawned = Instantiate(prefab).transform;
            spawned.position = new Vector3(column, Random.Range(-1f, 1f), row);
        }
    }
}

Code to move an object via C#. This script is added to each created object:

// Position written back to the transform every frame; only its Y changes after Start.
private Vector3 position = Vector3.zero;

/// <summary>
/// Captures the object's X/Z and seeds Y from Perlin noise sampled at the current time.
/// </summary>
private void Start()
{
    Vector3 current = transform.position;
    float now = Time.time;

    position = new Vector3(current.x, Mathf.PerlinNoise(now, now), current.z);
}

/// <summary>
/// Recomputes the Y coordinate from time-scrolled Perlin noise and applies the
/// cached position to the transform.
/// </summary>
private void Update()
{
    Vector3 current = transform.position;
    float now = Time.time;

    // Noise coordinates: world X/Z scaled down by 20 and offset by the current time.
    float noiseX = current.x / 20f + now;
    float noiseZ = current.z / 20f + now;

    position.y = Mathf.PerlinNoise(noiseX, noiseZ) * 5f;
    transform.position = position;
}

Scenario 2 - via Compute Kernel (GPGPU)

Part 1: C# client code

Object spawn, running the calculation on the shader and assigning the resulting value to the objects:

/// <summary>
/// CPU-side mirror of the compute shader's <c>Particle</c> struct.
/// The field order and sizes must match the HLSL declaration
/// (float3 position + float4 color = 7 floats = 28 bytes), because the
/// ComputeBuffer is created with a stride of <c>sizeof(float) * 7</c> and
/// SetData/GetData copy raw bytes without any per-field conversion.
/// </summary>
public struct Particle
{
    // Matches HLSL "float3 position".
    public Vector3 position;

    // Matches HLSL "float4 color". Without this field the C# element is only
    // 12 bytes, so every element after the first is misaligned on upload/readback.
    public Vector4 color;
}

// Prefab instantiated once per particle; serialized so it can be assigned in the Inspector.
[SerializeField]
private GameObject prefab;
// Compute shader asset that is expected to contain the "CSMain" kernel dispatched every frame.
[SerializeField]
private ComputeShader computeShader;

// Spawned GameObjects, stored in the same order as the entries of particlesDataArray.
private List<GameObject> particlesList = new List<GameObject>();
// CPU-side particle data: uploaded to the compute buffer once and overwritten by each readback.
private Particle[] particlesDataArray;

/// <summary>Builds the particle objects and the GPU buffer once at startup.</summary>
private void Start() => CreateParticles();

/// <summary>Runs the compute kernel and applies its results every frame.</summary>
private void Update() => UpdateParticlePosition();

/// <summary>
/// Spawns a 50 x 50 grid of prefab instances, records their positions in
/// <c>particlesDataArray</c>, uploads that array to a new compute buffer and
/// binds the buffer to the "CSMain" kernel as "particles".
/// </summary>
private void CreateParticles()
{
    const int gridSize = 50;

    // Stride of one element in the GPU buffer. It must equal the size of the HLSL
    // Particle struct (float3 position + float4 color = 7 floats), and the C# Particle
    // struct needs the same layout for SetData/GetData to line up.
    const int particleStride = sizeof(float) * 7;

    // Fill the array directly instead of building a temporary List and copying it.
    particlesDataArray = new Particle[gridSize * gridSize];
    int index = 0;

    for (int i = 0; i < gridSize; i++)
    {
        for (int j = 0; j < gridSize; j++)
        {
            GameObject createdParticle = Instantiate(prefab);
            createdParticle.transform.position = new Vector3(i * 1f, Random.Range(-1f, 1f), j * 1f);
            particlesList.Add(createdParticle);

            // Same index as in particlesList, so results can be mapped back by position in the array.
            particlesDataArray[index].position = createdParticle.transform.position;
            index++;
        }
    }

    computeBuffer = new ComputeBuffer(particlesDataArray.Length, particleStride);
    computeBuffer.SetData(particlesDataArray);

    // Look the kernel up by name (as the Dispatch call does) rather than assuming index 0.
    computeShader.SetBuffer(computeShader.FindKernel("CSMain"), "particles", computeBuffer);
}

// GPU buffer holding one Particle per spawned object.
private ComputeBuffer computeBuffer;

// Threads per group along X; must equal the first value of [numthreads(10, 1, 1)] on CSMain.
private const int ThreadGroupSize = 10;

// Index of the "CSMain" kernel, looked up once on first use (-1 = not looked up yet).
private int kernelIndex = -1;

/// <summary>
/// Runs the "CSMain" kernel for the current time, reads the particle data back
/// to the CPU and copies each particle's Y coordinate onto its GameObject.
/// </summary>
private void UpdateParticlePosition()
{
    // The kernel index does not change, so avoid a name lookup every frame.
    if (kernelIndex < 0)
    {
        kernelIndex = computeShader.FindKernel("CSMain");
    }

    computeShader.SetFloat("time", Time.time);

    // Integer division: the particle count is assumed to be a multiple of ThreadGroupSize
    // (2500 / 10 here). Any remainder would be left without a thread.
    computeShader.Dispatch(kernelIndex, particlesDataArray.Length / ThreadGroupSize, 1, 1);

    // Synchronous readback: the CPU waits here until the GPU has finished the dispatch.
    computeBuffer.GetData(particlesDataArray);

    for (int i = 0; i < particlesDataArray.Length; i++)
    {
        Transform particleTransform = particlesList[i].transform;

        // Keep the object's own X/Z and take only Y from the GPU result.
        Vector3 pos = particleTransform.position;
        pos.y = particlesDataArray[i].position.y;
        particleTransform.position = pos;
    }
}

/// <summary>
/// Releases the compute buffer. ComputeBuffers wrap native GPU memory and
/// must be released explicitly rather than left to the garbage collector.
/// </summary>
private void OnDestroy()
{
    if (computeBuffer != null)
    {
        computeBuffer.Release();
        computeBuffer = null;
    }
}

Part 2: Compute kernel (GPGPU)

#pragma kernel CSMain

// Per-particle record stored in the `particles` buffer.
// 7 floats in total (float3 + float4), which matches the sizeof(float) * 7
// stride the C# side uses when it creates the compute buffer.
struct Particle {
    float3 position; // x and z are read by CSMain; y is overwritten by it
    float4 color;    // copied through unchanged by CSMain
};

// Read/write particle buffer; bound from C# with SetBuffer(..., "particles", ...).
RWStructuredBuffer<Particle> particles;
// Current time value; set from C# each frame with SetFloat("time", Time.time).
float time;

// Floored modulo: x - y * floor(x / y).
// For positive y the result has the sign of y, unlike a truncating remainder.
float mod(float x, float y)
{
    float wholeMultiples = floor(x / y);
    return x - y * wholeMultiples;
}

// Permutation polynomial ((34x + 1) * x) mod 289, in scalar and vector forms.
// Not called by any other code visible in this file.

// Scalar form; the result is additionally floored.
float permute(float x) { return floor(mod(((x * 34.0) + 1.0) * x, 289.0)); }

// Vector forms. mod() is declared for scalars only, so passing a vector to it
// would be implicitly truncated to its first component. Apply it per component
// instead, the same way taylorInvSqrt(float4) expands its scalar overload.
float3 permute(float3 x)
{
    float3 v = ((x * 34.0) + 1.0) * x;
    return float3(mod(v.x, 289.0), mod(v.y, 289.0), mod(v.z, 289.0));
}

float4 permute(float4 x)
{
    float4 v = ((x * 34.0) + 1.0) * x;
    return float4(mod(v.x, 289.0), mod(v.y, 289.0), mod(v.z, 289.0), mod(v.w, 289.0));
}
// Linear function 1.79284291400159 - 0.85373472095314 * r.
// NOTE(review): the name suggests a first-order approximation of 1/sqrt(r); confirm the intended range.
// Not called by any other code visible in this file.
float taylorInvSqrt(float r)
{
    return 1.79284291400159 - 0.85373472095314 * r;
}

// Component-wise version of the scalar overload.
float4 taylorInvSqrt(float4 r)
{
    return 1.79284291400159 - 0.85373472095314 * r;
}

// Hashes a 3D point into a vector whose components each lie in [-0.5, 0.5).
// The three components come from the same sine-based seed, scaled by 1, 1/8 and 1/64
// before the fractional part is taken.
float3 rand3(float3 c) {
    const float3 hashAxis = float3(17.0, 59.4, 15.0);

    float seed = 4096.0 * sin(dot(c, hashAxis));
    float seedEighth = seed * .125;
    float seedSixtyFourth = seedEighth * .125;

    // Component order follows the original assignment: z from the full seed,
    // x from seed / 8, y from seed / 64.
    float3 r = float3(frac(512.0 * seedEighth),
                      frac(512.0 * seedSixtyFourth),
                      frac(512.0 * seed));
    return r - 0.5;
}

// Gradient-style noise for a 3D point, built from four weighted corner contributions.
// NOTE(review): the name and the 1/3, 1/6 constants suggest 3D simplex noise; the exact
// output range is not established by this code alone.
float _snoise(float3 p) {
    const float F3 = 0.3333333;
    const float G3 = 0.1666667;
    // s: integer lattice cell, found after offsetting p by (p.x + p.y + p.z) * F3.
    float3 s = floor(p + dot(p, float3(F3, F3, F3)));
    // x: offset of p from that cell's origin, with the (s.x + s.y + s.z) * G3 correction applied.
    float3 x = p - s + dot(s, float3(G3, G3, G3));

    // e holds 0/1 flags for the comparisons x.x >= x.y, x.y >= x.z, x.z >= x.x.
    float3 e = step(float3(0.0, 0.0, 0.0), x - x.yzx);
    // i1 and i2 are 0/1 lattice offsets derived from those flags; they pick the
    // second and third corners used below.
    float3 i1 = e * (1.0 - e.zxy);
    float3 i2 = 1.0 - e.zxy * (1.0 - e);

    // Offsets from p to the other three corners (the fourth corner is s + 1).
    float3 x1 = x - i1 + G3;
    float3 x2 = x - i2 + 2.0 * G3;
    float3 x3 = x - 1.0 + 3.0 * G3;

    // w: per-corner weights, d: per-corner gradient contributions.
    float4 w, d;

    // Squared distance from p to each of the four corners.
    w.x = dot(x, x);
    w.y = dot(x1, x1);
    w.z = dot(x2, x2);
    w.w = dot(x3, x3);

    // Falloff: 0.6 minus the squared distance, clamped so far corners contribute nothing.
    w = max(0.6 - w, 0.0);

    // Dot product of each corner's hashed vector (rand3) with the offset to that corner.
    d.x = dot(rand3(s), x);
    d.y = dot(rand3(s + i1), x1);
    d.z = dot(rand3(s + i2), x2);
    d.w = dot(rand3(s + 1.0), x3);

    // Raise the falloff to the fourth power (squared twice), then weight the contributions.
    w *= w;
    w *= w;
    d *= w;

    // Sum the four weighted contributions and scale by 52.
    return dot(d, float4(52.0, 52.0, 52.0, 52.0));
}

// One thread per particle: id.x indexes the `particles` buffer.
// Groups are 10 threads wide; the C# side dispatches (particle count / 10) groups to match.
[numthreads(10, 1, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    const uint index = id.x;
    float3 pos = particles[index].position;

    // Scroll the noise field over time at one fifth of the time value.
    float scroll = time / 5.0;
    float3 samplePoint = float3(pos.x / 20.0 + scroll, 0.0, pos.z / 20.0 + scroll);

    // Only the height changes; x, z and color keep their stored values.
    pos.y = _snoise(samplePoint) * 5.0;
    particles[index].position = pos;
}

What am I doing wrong, why is there no increase in calculation speed? :)

Thanks in advance!

Solution

TL;DR: your GPGPU (compute shader) scenario is unoptimized thus skewing your results. Consider binding a material to the computeBuffer and rendering via Graphics.DrawProcedural. That way everything stays on the GPU.

OP:

What am I doing wrong, why is there no increase in calculation speed?

Essentially, there are two parts to your problem.

(1) Reading from the GPU is slow

With most things GPU-related, you generally want to avoid reading from the GPU since it will block the CPU. This is true also for GPGPU scenarios.

If I were to hazard a guess, the bottleneck is the computeBuffer.GetData() call in your GPGPU (compute shader) path, shown below:

private void Update()
{
    UpdateParticlePosition();
}

private void UpdateParticlePosition()
{
.
.
.
    computeBuffer.GetData(particlesDataArray); // <----- OUCH!

Unity (my emphasis):

ComputeBuffer.GetData

Read data values from the buffer into an array...
Note that this function reads the data back from the GPU, which can be slow...If any GPU work has been submitted that writes to this buffer, Unity waits for the tasks to complete before it retrieves the requested data. Tell me more...

(2) Explicit GPU reading is not required in your scenario

I can see you are creating 2,500 "particles" where each particle is attached to a GameObject. If the intent is to just draw a simple quad, then it's more efficient to create an array of structs, each containing a Vector3 position, and then perform a single batched render call to draw all the particles in one go.

Proof: see video below of nBody simulation. 60+ FPS on 2014 era NVidia card

e.g. for my GPGPU n-Body Galaxy Simulation I do just that. Pay attention to the StarMaterial.SetBuffer("stars", _starsBuffer) during actual rendering. That tells the GPU to use the buffer that already exists on the GPU, the very same buffer that the compute shader used to move the star positions. There is no CPU reading the GPU here.

public class Galaxy1Controller : MonoBehaviour
{
    // Texture bound to the compute kernel as "hueTexture" every frame.
    public Texture2D HueTexture;

    // Number of stars: sizes the star buffer and is the instance count of the draw call.
    public int NumStars = 10000; // That's right! 10,000 stars!

    // Compute shader that provides the "UpdateStars" kernel.
    public ComputeShader StarCompute;
    // Material used to draw the stars; it is given the "stars" and "quadPoints" buffers.
    public Material StarMaterial;
    // 6-element GPU buffer holding the star quad's points.
    private ComputeBuffer _quadPoints;
    // CPU-side star data, uploaded to _starsBuffer once in Start.
    private Star[] _stars;
    // GPU buffer of star data, shared by the compute kernel and the material.
    private ComputeBuffer _starsBuffer;
.
.
.
    // Looks up the "UpdateStars" kernel and creates and fills the two GPU buffers.
    // _updateParticlesKernel, Constants.StarsStride and QuadStride are declared
    // outside the code shown here.
    private void Start()
    {
        _updateParticlesKernel = StarCompute.FindKernel("UpdateStars");
        // One buffer element per star.
        _starsBuffer = new ComputeBuffer(NumStars, Constants.StarsStride);

        _stars = new Star[NumStars];
        // Create initial positions for stars here (not shown)
        // One-time upload; after this the star data is not read back in the code shown.
        _starsBuffer.SetData(_stars);

        // 6 entries, matching the vertex count passed to DrawProcedural in OnRenderObject.
        _quadPoints = new ComputeBuffer(6, QuadStride);
        _quadPoints.SetData(...); // star quad      
    }

    /// <summary>
    /// Binds this frame's inputs to the "UpdateStars" kernel and dispatches enough
    /// thread groups to cover every star. Nothing is read back from the GPU.
    /// </summary>
    private void Update()
    {
        // Kernel inputs: star buffer, scaled frame time and hue lookup texture.
        StarCompute.SetBuffer(_updateParticlesKernel, "stars", _starsBuffer);
        StarCompute.SetFloat("deltaTime", Time.deltaTime * _manager.MasterSpeed);
        StarCompute.SetTexture(_updateParticlesKernel, "hueTexture", HueTexture);

        // Round up so a partial final group still gets scheduled.
        int groupCount = Mathf.CeilToInt((float) NumStars / GroupSize);
        StarCompute.Dispatch(_updateParticlesKernel, groupCount, 1, 1);

        // The results stay in _starsBuffer on the GPU; OnRenderObject draws straight from it.
    }

    /// <summary>
    /// Draws every star in one procedural call, reading positions from the same
    /// GPU buffer the compute kernel writes to.
    /// </summary>
    private void OnRenderObject()
    {
        // Matches the 6 entries of the _quadPoints buffer.
        const int verticesPerStar = 6;

        // Material inputs must be bound before the pass is activated.
        StarMaterial.SetBuffer("stars", _starsBuffer);
        StarMaterial.SetBuffer("quadPoints", _quadPoints);
        StarMaterial.SetPass(0);

        // One instance per star, verticesPerStar vertices each.
        Graphics.DrawProcedural(MeshTopology.Triangles, verticesPerStar, NumStars);
    }
}

n-Body galaxy simulation of 10,000 stars:

I think everyone can agree that Microsoft's GPGPU documentation is pretty sparse so your best bet is to check out examples scattered around the interwebs. One that comes to mind is the excellent "GPU Ray Tracing in Unity" series over at Three Eyed Games. See the link below.

The difference in the speed of moving objects through the CPU and GPU shader in Unity

Scenario 1 - Update via C# code only

Scenario 2 - via Compute Kernel (GPGPU)

(1) Reading from the GPU is slow

(2) Explicit GPU reading is not required in your scenario

See also: