Search code examples
performancerustmutexchannelrust-tokio

Why is the performance of rust tokio so poor? [release test result updated]


The following scenarios are frequently used in asynchronous programming.

  • channel tx/rx;
  • mutex lock/unlock;
  • async task spawn;

So I ran some comparison tests on a lower performance cloud host (equivalent to j1900) as follows. I found that the performance of rust-tokio is very, very poor compared to go-lang.

Is there any parameter that needs to be adjusted? Can a single thread executor improve it?

Results:

  • tx/rx, time per op: go-lang: 112 ns,

    tokio::sync::mpsc::channel: 7387 ns;

    std::sync::channel: 2705 ns,

    crossbeam: 1062 ns.

  • mutex lock/unlock, per op:

    tokio::sync::Mutex 4051 ns

    std::sync::Mutex 321 ns

  • spawn (not join), per op:

    tokio::spawn: 8445 ns

Rust tokio test tx/rx on channel

    #[tokio::test]
    async fn test_chan_benchmark() {
        // Benchmarks tokio mpsc channel send/recv round-trip cost:
        // one task drains the channel while this task sends `count` items.
        let count = 100_000;
        let (tx, mut rx) = tokio::sync::mpsc::channel(10000);
        // Instant is a monotonic clock; SystemTime can jump backwards/forwards
        // if the wall clock is adjusted, corrupting the measurement.
        let start = std::time::Instant::now();
        let handle = tokio::spawn(async move {
            loop {
                let i = rx.recv().await.unwrap();
                // Last value sent is count - 1; stop after receiving it.
                if i == count - 1 {
                    break;
                }
            }
        });

        for i in 0..count {
            tx.send(i).await.unwrap();
        }
        // Dropping the sender closes the channel.
        drop(tx);

        handle.await.unwrap();
        let dur = start.elapsed();
        println!(
            "count={count}, consume={}ms, ops={}ns",
            dur.as_millis(),
            dur.as_nanos() / count as u128,
        );
    }

Go channel tx/rx:

func TestChanPerformance(t *testing.T) {
	// Benchmarks channel send/receive: a goroutine drains the channel
	// while the test goroutine sends `count` integers into it.
	const count = 1000000
	ch := make(chan int, count)
	done := make(chan int, 1)
	start := time.Now()
	go func() {
		// Drain until the channel is closed, then signal completion.
		for range ch {
		}
		done <- 0
	}()
	for i := 0; i < count; i++ {
		ch <- i
	}
	close(ch)
	<-done

	elapsed := time.Since(start)
	t.Logf("txrx %d times consumed %d ms, %d nspo", count, elapsed.Milliseconds(), elapsed.Nanoseconds()/int64(count))
}

Mutex test:

    #[tokio::test]
    async fn bench_std_mutex() {
        // Benchmarks std::sync::Mutex lock/unlock cost at several iteration counts.
        for count in [1_000, 10_000, 100_000] {
            // Instant is a monotonic clock; SystemTime can be adjusted by the
            // OS mid-run and skew the result.
            let start = std::time::Instant::now();

            let under = Arc::new(std::sync::Mutex::new(0));
            for _ in 0..count {
                // The guard bound to `_` is dropped immediately, so each
                // iteration measures one lock + one unlock.
                let _ = under.lock().unwrap();
            }

            let dur = start.elapsed();
            println!(
                "count={count}, consume={}ms, ops={}ns",
                dur.as_millis(),
                dur.as_nanos() / count as u128,
            );
        }
    }

Tokio spawn test:

    #[tokio::test]
    async fn bench_tokio_spawn() {
        // Benchmarks the cost of tokio::spawn alone. Spawned tasks are empty
        // and deliberately NOT joined, so only spawn overhead is timed.
        let count = 100_000;
        // Instant is a monotonic clock, the right tool for elapsed-time
        // measurement; SystemTime is wall-clock and can jump.
        let start = std::time::Instant::now();
        for _ in 0..count {
            tokio::spawn(async move {});
        }
        let dur = start.elapsed();
        println!(
            "count={count}, consume={}ms, ops={}ns",
            dur.as_millis(),
            dur.as_nanos() / count as u128,
        );
    }

=============UPDATED=========== For --release:

std::sync::Mutex: 13ns;
tokio::sync::Mutex: 130ns;
std::sync::mpsc::channel: 200ns;
tokio::sync::mpsc::channel: 256ns;
tokio::spawn: 553ns;

Solution

  • Add --release to instruct the compiler to perform optimizations.

    To demonstrate just how much of a difference this makes, here is a simple add function compiled with and without optimizations:

    /// Adds two `u32` values.
    ///
    /// In debug builds the `+` is compiled with an overflow check that
    /// panics on wrap (see the second asm listing below); in release
    /// builds the check is omitted and this compiles to a single `lea`.
    pub fn add(a: u32, b: u32) -> u32 {
        a + b
    }
    
    example::add:
            lea     eax, [rdi + rsi]
            ret
    
    example::add:
            push    rax
            add     edi, esi
            mov     dword ptr [rsp + 4], edi
            setb    al
            test    al, 1
            jne     .LBB0_2
            mov     eax, dword ptr [rsp + 4]
            pop     rcx
            ret
    .LBB0_2:
            lea     rdi, [rip + str.0]
            lea     rdx, [rip + .L__unnamed_1]
            mov     rax, qword ptr [rip + core::panicking::panic@GOTPCREL]
            mov     esi, 28
            call    rax
            ud2
    
    .L__unnamed_2:
            .ascii  "/app/example.rs"
    
    .L__unnamed_1:
            .quad   .L__unnamed_2
            .asciz  "\017\000\000\000\000\000\000\000\002\000\000\000\005\000\000"
    
    str.0:
            .ascii  "attempt to add with overflow"
    

    Note that the optimized version no longer contains an overflow check. The overflow check is very useful during debugging, but also very slow.