Tags: c++, performance, go, concurrency, tbb

Why is a Go channel much faster than an Intel TBB concurrent_queue when tested with 8 producers and 1 consumer?


I ran a test comparing the performance of a Go channel against a C++ TBB concurrent queue. I set up 8 writers and 1 reader, each on its own thread. The result shows Go is much faster than the C++ version (in both latency and overall send/recv throughput). Is that real, or is there a mistake in my code?

Go result (unit: microseconds):

latency max:1505,avg:1073 send begin:1495593677683232,recv end:1495593677901854, time:218622

package main

import (
    "flag"
    "time"
    "fmt"
    "sync"
    "runtime"
)

var (
    producer = flag.Int("producer", 8, "producer")
    consumer = flag.Int("consumer", 1, "consumer")
    start_signal sync.WaitGroup
)

const (
    TEST_NUM = 1000000
)

type Item struct {
    id int
    sendtime int64
    recvtime int64
}

var g_vec [TEST_NUM]Item

func sender(out chan int, begin int, end int) {
    start_signal.Wait()
    runtime.LockOSThread()
    println("i am in sender", begin, end)
    for i := begin; i < end; i++ {
        item := &g_vec[i]
        item.id = i
        item.sendtime = time.Now().UnixNano() / 1000
        out <- i
    }
    println("sender finish")
}

func reader(out chan int, total int) {
    //runtime.LockOSThread()
    start_signal.Done()
    for i := 0; i < total; i++ {
        tmp := <-out
        item := &g_vec[tmp]
        item.recvtime = time.Now().UnixNano() / 1000
    }
    var lsum int64 = 0
    var lavg int64 = 0
    var lmax int64 = 0
    var lstart int64 = 0
    var lend int64 = 0
    for _, item := range g_vec {
        if lstart > item.sendtime || lstart == 0 {
            lstart = item.sendtime
        }
        if lend < item.recvtime {
            lend = item.recvtime
        }

        ltmp := item.recvtime - item.sendtime
        lsum += ltmp
        if ltmp > lmax {
            lmax = ltmp
        }
    }
    lavg = lsum / TEST_NUM
    fmt.Printf("latency max:%v,avg:%v\n", lmax, lavg)
    fmt.Printf("send begin:%v,recv end:%v, time:%v", lstart, lend, lend-lstart)
}

func main() {
    flag.Parse() // without this, the -producer/-consumer flags are never read
    runtime.GOMAXPROCS(10)
    out := make(chan int, 5000)

    start_signal.Add(1)
    for i := 0; i < *producer; i++ {
        go sender(out, i*TEST_NUM/(*producer), (i+1)*TEST_NUM/(*producer))
    }
    reader(out, TEST_NUM)
}

C++ (main part only). Result (unit: microseconds):

max:558301,min:3,avg:403741
start:1495594232068580,end:1495594233497618,length:1429038

tbb::concurrent_bounded_queue<int> g_queue;

static void sender(int start, int end)
{   
    for (int i=start; i < end; i++)
    {
        using namespace std::chrono;
        auto now = system_clock::now();
        auto now_ms = time_point_cast<microseconds>(now);
        auto value = now_ms.time_since_epoch();
        int64_t duration = value.count();

        Item &item = g_pvec->at(i);
        item.id = i;
        item.sendTime = duration;
        //std::cout << "sending " << i << "\n";
        g_queue.push(i);
    }
}


static void reader(int num)
{
    barrier.set_value();    
    for (int i=0;i<num;i++)
    {
        int v;
        g_queue.pop(v);
        Item &el = g_pvec->at(v);

        using namespace std::chrono;
        auto now = system_clock::now();
        auto now_ms = time_point_cast<microseconds>(now);
        auto value = now_ms.time_since_epoch();
        int64_t duration = value.count();

        el.recvTime = duration;
        //std::cout << "recv " << item.id << ":" << duration << "\n";
    }
    // calculate the result.
    int64_t lmax = 0;
    int64_t lmin = 100000000;
    int64_t lavg = 0;
    int64_t lsum = 0;
    int64_t lbegin = 0;
    int64_t lend = 0;
    for (auto &item : *g_pvec)
    {
        if (item.sendTime<lbegin || lbegin==0)
        {
            lbegin = item.sendTime;
        }
        if (item.recvTime>lend )
        {
            lend = item.recvTime;
        }

        lsum += item.recvTime - item.sendTime;
        lmax = std::max(item.recvTime - item.sendTime, lmax);
        lmin = std::min(item.recvTime - item.sendTime, lmin);
    }
    lavg = lsum / num;
    std::cout << "max:" << lmax << ",min:" << lmin << ",avg:" << lavg << "\n";
    std::cout << "start:" << lbegin << ",end:" << lend << ",length:" << lend-lbegin << "\n";
}

DEFINE_CODE_TEST(plain_queue_test)
{
    g_pvec = new std::vector<Item>();
    g_pvec->resize(TEST_NUM);    

    auto sf = barrier.get_future().share();

    std::vector<std::thread> vt;
    for (int i = 0; i < SENDER_NUM; i++)
    {
        vt.emplace_back([sf, i]{    
            sf.wait();
            sender(i*TEST_NUM / SENDER_NUM, (i + 1)*TEST_NUM / SENDER_NUM);
        });
    }


    std::cout << "create reader\n";
    std::thread rt(reader, TEST_NUM);
    for (auto& t : vt)
    {
        t.join();
    }    
    rt.join();
}
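Since only the main part is shown above, the snippet does not compile on its own. Below is a minimal sketch of the omitted declarations, inferred from how they are used; the exact definitions (Item's members, g_pvec, barrier, SENDER_NUM, and the DEFINE_CODE_TEST harness macro) are assumptions, not the originals.

#include <tbb/concurrent_queue.h>
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

// Inferred from the snippet; the real definitions are not shown in the post.
struct Item {
    int id = 0;
    int64_t sendTime = 0;
    int64_t recvTime = 0;
};

const int TEST_NUM = 1000000;   // matches the Go version
const int SENDER_NUM = 8;       // 8 producers, as in the test description

// g_queue is declared above; these are the remaining globals.
std::vector<Item>* g_pvec = nullptr;
std::promise<void> barrier;     // senders wait on its shared_future

// Stand-in for the post's test harness macro; the real one is not shown.
#define DEFINE_CODE_TEST(name) void name()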

Go CPU usage:

C++ CPU usage:

(Red means CPU spin/overhead; green is idle.) From the VTune CPU graphs, my impression is that the Go channel has a more efficient mutex (e.g., does sleeping a goroutine even need a system call, unlike a C++ mutex?).



Solution

  • From the VTune traces I can conclude that TBB queue does not sleep, spending a lot time spinning, while Go version has light green areas indicating threads sleeping on OS synchronization. Why is it better? Usually, it indicates that you have oversubscription on your machine, thus communication through OS pays off. So, do you oversubscribe it? If yes, I'd say this is rather expected behavior which fits into philosophy of corresponding libraries. TBB is designed for compute parallelism, it does not handle IO tasks well while playing against oversubscription. Go is designed exactly for IO tasks, thus built-in concurrency with FIFO policy of the scheduler that is unfriendly to parallel number crunching. Oversubscription is rather recommend for IO tasks while it affects or even kills computational parallelism.