How can i make this go code with multiple for loops faster?

I recently started to work on this project . It's a model about collective motion. In this code 8192 particle move in 1000000 steps and in each step each particles position and angle updates. in each step neighbors of each particle sync their angle with that specific particle. I wrote a code in python but that was so slow . then I wrote this code in go but it's still not fast . each step takes average 1.3 s which is not good . do u guys have any idea about making it faster ?

package main

import (
    "bufio"
    "flag"
    "fmt"
    "log"
    "math"
    "math/rand"
    "os"
    "runtime"
    "time"
)

const (
    n_particles int     = 8192
    n_steps     int     = 1000000
    dt          float64 = 1.0
    v0          float64 = 0.5
    radius      float64 = 1.0
    f_intensity float64 = 1.2
    scale       float64 = 128.0

    alpha float64 = 1 / 36
)

var (
    x      [n_particles + 1]float64
    y      [n_particles + 1]float64
    angles [n_particles + 1]float64
    vx     [n_particles + 1]float64
    vy     [n_particles + 1]float64
    order  [n_steps + 1]float64
    fxk    float64
    fyk    float64
    fxi    float64
    fyi    float64
)

func main() {

    
    vstart := time.Now()
    rsource := rand.NewSource(time.Now().UnixNano())
    randomizer := rand.New(rsource)
    //defining random data
    for i := 0; i <= n_particles; {
        x[i] = (randomizer.Float64()) * scale
        y[i] = (randomizer.Float64()) * scale
        angles[i] = (randomizer.Float64()) * math.Pi * 2
        vx[i] = v0 * float64(math.Cos(angles[i]))
        vy[i] = v0 * float64(math.Sin(angles[i]))

        i = i + 1

    }


    for i := 0; i <= n_steps; {

        start := time.Now()
        cosangles := 0.0
        sinangles := 0.0

        for j := 0; j <= n_particles; {

            x[j] = x[j] + (vx[j] * dt)
            x[j] = math.Mod(x[j], scale)
            y[j] = y[j] + (vy[j] * dt)
            y[j] = math.Mod(x[j], scale)

            j = j + 1
        }
        
        m_angles := angles

        for j := 0; j <= n_particles; {
            sumanglesx := 0.0
            sumanglesy := 0.0
            fxi = math.Floor(x[j])
            fyi = math.Floor(y[j])

            for k := 0; k <= n_particles; {
                if k != j {
                    fxk = math.Floor(x[k])
                    fyk = math.Floor(y[k])

                    if (fxi == fxk && fyi == fyk) || (fxi+1 == fxk && fyi == fyk) || (fxi-1 == fxk && fyi == fyk) || (fxi == fxk && fyi+1 == fyk) || (fxi == fxk && fyi-1 == fyk) || (fxi+1 == fxk && fyi+1 == fyk) || (fxi-1 == fxk && fyi-1 == fyk) || (fxi-1 == fxk && fyi+1 == fyk) || (fxi+1 == fxk && fyi-1 == fyk) {

                        dist := math.Pow((x[k]-x[j]), 2) + math.Pow((y[k]-y[j]), 2)
                        if dist < radius {
                            if k > j {
                                sx := math.Cos(angles[k])
                                sy := math.Sin(angles[k])
                                sumanglesx = sumanglesx + sx
                                sumanglesy = sumanglesy + sy

                            } else if k < j {
                                sx := alpha * math.Cos(angles[k])
                                sy := alpha * math.Sin(angles[k])
                                sumanglesx = sumanglesx + sx
                                sumanglesy = sumanglesy + sy

                            }

                        }

                    }

                }

                k = k + 1

            }

            m_angles[j] = math.Atan2(sumanglesx, sumanglesy) + (f_intensity*randomizer.Float64() - f_intensity)

            j = j + 1
        }

        angles = m_angles
        for f := 0; f <= n_particles; {

            vx[f] = v0 * float64(math.Cos(angles[f]))
            vy[f] = v0 * float64(math.Sin(angles[f]))
            f = f + 1
        }

        for h := 0; h <= n_particles; {
            cosangles = cosangles + (math.Cos(angles[h]))
            sinangles = sinangles + (math.Sin(angles[h]))
            h = h + 1
        }

        core := (math.Sqrt(((math.Pow(cosangles, 2)) + (math.Pow(cosangles, 2))))) / float64(n_particles)
        order[i] = core
        duration := time.Since(start)
        fmt.Println(i)
        fmt.Println(duration)
        i = i + 1

    }
    vduration := time.Since(vstart)
    log.Printf("It took : %s", vduration)
    f, _ := os.Create("order.txt")
    for i := 0; i <= n_steps; {

        s := fmt.Sprintf("%f", order[i])

        w := bufio.NewWriter(f)
        w.WriteString(s + "\n")
        w.Flush()

        i = i + 1

    }

}

Solution

First some minor quick wins:

math.Pow(x, 2) => x * x
math.Sin, math.Cos => math.Sincos

But a bigger issue is looping over n_particles², which is wasteful given that apparently only adjacent particles interact:

for j := 0; j <= n_particles; j++ {
    fxi = math.Floor(x[j])
    fyi = math.Floor(y[j])
    for k := 0; k <= n_particles; k++ {
        fxk = math.Floor(x[k])
        fyk = math.Floor(y[k])
        if (fxi == fxk && fyi == fyk) || (fxi+1 == fxk && fyi == fyk)
            || (fxi-1 == fxk && fyi == fyk) || (fxi == fxk && fyi+1 == fyk)
            || (fxi == fxk && fyi-1 == fyk) || (fxi+1 == fxk && fyi+1 == fyk)
            || (fxi-1 == fxk && fyi-1 == fyk) || (fxi-1 == fxk && fyi+1 == fyk)
            || (fxi+1 == fxk && fyi-1 == fyk) {
            // ...

How can you loop over adjacent particles without a N² nested loop? Perhaps if you maintain a helper adjacency index mapping integral x/y coordinates to particles, you can loop over it in a single pass?

for i := 0; i <= n_steps; i++ {
    // . . .
    type intpos struct {
        x, y int64
    }
    adjacencyIndex := make(map[intpos][]int)

    for j := 0; j <= n_particles; j++ {
        // . . .
        ix, iy := int64(math.Floor(x[j])), int64(math.Floor(y[j]))
        adjacencyIndex[intpos{ix, iy}] = append(adjacencyIndex[intpos{ix, iy}], j)
    }
    // . . .
    for j := 0; j <= n_particles; j++ {
        // . . .
        ix, iy := int64(math.Floor(x[j])), int64(math.Floor(y[j]))
        for dx := -1; dx <= 1; dx++ {
            for dy := -1; dy <= 1; dy++ {
                adjacentParticles := adjacencyIndex[intpos{ix + int64(dx), iy + int64(dy)}]
                for _, k := range adjacentParticles {
                    if j < k {
                        // process interaction between particles j and k
                    }
                }
            }
        }
        // . . .

Note that the above will result in a single interaction between each pair of adjacent particles. If you need a bidirectional interaction (i.e. j <=> k and k <=> j), change j < k to j != k.

As a further improvement you can update the adjacency index as particles move around instead of rebuilding it in each iteration.