Search code examples
javascriptperformanceweb-worker

Workers in javascript not so fast


I am giving a try to workers in js and I tried to make a simple sort using the same js sort function. The comparison i am making is just using an async function which will sort 60000 random numbers. The first will sort the random numbers as traditionally we are used to do it.

async function normalSort(arr) {
    return new Promise((res) => {
        let copy = arr;
        copy.sort((a, b) => a > b ? 1 : -1);
        return res(copy)
    })
}

the other is a normal function which will be called for a workersHandler function

const { Worker, parentPort, workerData } = require('worker_threads');

function sort(data) {
    let copy = data;
    copy.sort((a, b) => a > b ? 1 : -1);
    parentPort.postMessage(copy)
    process.exit();
}


sort(workerData); 

the workers handler function

const os = require('os');
const path = require('path');
const { Worker } = require('worker_threads');

async function workersHandler(arr) {
    const startTime = Date.now();
    const cpusAmount = os.cpus().length;
    const chSize = Math.ceil(arr.length / cpusAmount)
    let promises = [];
    for (let i = 0; i < arr.length; i += chSize) {
        const end = i + chSize;
        const currentChunk = arr.slice(i, end);
        const promise = new Promise((res, rej) => {
            //@ts-ignore
            const worker = new Worker(path.join(__dirname, '..', '/utils/sort.js'), { workerData: currentChunk })

            worker.on('message', res)
            worker.on('error', rej)
        })
        promises.push(promise);
    }
    let result = await Promise.all(promises)
    return result;
}

and the main function which will call the others functions

function main() {
    let arr = new Array(60000).fill(0).map((_, i) => Math.round(Math.random() * 100));
    const startTime = Date.now();

    workersHandler(arr).then(r => console.log('workers sort', Date.now() - startTime + ' ms'))
    normalSort(arr).then(r => console.log('normal sort', Date.now() - startTime + ' ms'))
}
main();

Surprisingly the normal sort function is way faster and is working in one thread. I am receiving for the workers function 101 ms for the normal sort function 53 ms Someone could explain me why these weird results?. Are workers not so fast or I am making a wrong implementation?.


Solution

  • https://nodejs.org/api/worker_threads.html#worker_threads_port_postmessage_value_transferlist and https://developer.mozilla.org/en-US/docs/Web/API/Worker/postMessage:

    postMessage(value[, transferList])
    node: transferList may be a list of ArrayBuffer and MessagePort objects. After transferring, they will not be usable on the sending side of the channel anymore (even if they are not contained in value). MDN: An optional array of Transferable objects to transfer ownership of. If the ownership of an object is transferred, it becomes unusable (neutered) in the context it was sent from and becomes available only to the worker it was sent to. Transferable objects are instances of classes like ArrayBuffer, MessagePort or ImageBitmap objects that can be transferred.

    Effect of types:

    let typ=prompt("Type: 0/1/2/3 (Array/Float64Array/Float32Array/Uint32Array)");
    let len=parseInt(prompt("Length"));
    let basearray;
    switch(typ){
      case "1":basearray=new Float64Array(len);break;
      case "2":basearray=new Float32Array(len);break;
      case "3":basearray=new Uint32Array(len);break;
      default: basearray=new Array(len);break;
    }
    for(let i=0;i<basearray.length;i++)
      basearray[i]=Math.random()*0x1000000;
    
    let cpus=4,
        chunksize=basearray.length/cpus,
        chunks=[],chunksw=[];
    for(let i=0;i<cpus;i++)
      chunksw[i]=(chunks[i]=basearray.slice(i*chunksize,(i+1)*chunksize)).slice();
    
    let start=Date.now();
    for(let i=0;i<cpus;i++)
      chunks[i].sort((a,b)=>a-b);
    console.log("Seq:",Date.now()-start);
    
    let code="onmessage=event=>postMessage(event.data.sort((a,b)=>a-b));";
    let ws=[],cnt=0;
    for(let i=0;i<cpus;i++){
      ws[i]=new Worker("data:text/plain,"+escape(code));
      let j=i;
      ws[i].onmessage=event=>{
        chunksw[j]=event.data;
        if(++cnt===cpus){
          console.log("Par:",Date.now()-start);
          if(len<=20)
            for(let i=0;i<cpus;i++)
              console.log(chunks[i],chunksw[i]);
        }
      };
    }
    start=Date.now();
    for(let i=0;i<cpus;i++)
      ws[i].postMessage(chunksw[i]);

    Specify a length divisible by 4. If length is 20 or less, the resulting sorted chunks are going to be logged too for verification purposes. JS Array-s are reliably slower for me when passed around (compared to the thread-less run), regardless of containing 20 or 6000000 elements (while a 6-million-element JS array runs for 8 seconds for me on an older laptop, it still may be safer to start with something less). The other types are faster when threaded, Uint being the fastest.
    Actually anything which is not 1/2/3 is going to result in a JS Array (the slowest one), including the empty string.

    Effect of transfer is not that spectacular, but already appears from the beginning (with 4 elements it is 59-69 ms vs 20-22 ms on my PC):

    let typ=prompt("Type: 0/1/2 (Float64Array/Float32Array/Uint32Array)");
    let len=parseInt(prompt("Length"));
    let basearray;
    switch(typ){
      case "1":basearray=new Float32Array(len);break;
      case "2":basearray=new Uint32Array(len);break;
      default:basearray=new Float64Array(len);
    }
    for(let i=0;i<basearray.length;i++)
      basearray[i]=Math.random()*0x1000000;
    
    let cpus=4,
        chunksize=basearray.length/cpus,
        chunksw=[],chunkswt=[];
    for(let i=0;i<cpus;i++)
      chunkswt[i]=(chunksw[i]=basearray.slice(i*chunksize,(i+1)*chunksize)).slice();
    
    let start;
    let code="onmessage=event=>postMessage(event.data.sort((a,b)=>a-b));";
    let ws=[],cnt=0;
    for(let i=0;i<cpus;i++){
      ws[i]=new Worker("data:text/plain,"+escape(code));
      let j=i;
      ws[i].onmessage=event=>{
        chunksw[j]=event.data;
        if(++cnt===cpus){
          console.log("Non-transfer:",Date.now()-start);
          // launch transfer measurement
          cnt=0;start=Date.now();
          for(let i=0;i<cpus;i++)
            wst[i].postMessage(chunkswt[i].buffer,[chunkswt[i].buffer]);    }
      };
    }
    
    let codet;
    switch(typ){
      case "1":
        codet="onmessage=event=>{"+
              "let arr=new Float32Array(event.data);"+
              "arr.sort((a,b)=>a-b);"+
              "postMessage(event.data,[event.data]);};";
        break;
      case "2":
        codet="onmessage=event=>{"+
              "let arr=new Uint32Array(event.data);"+
              "arr.sort((a,b)=>a-b);"+
              "postMessage(event.data,[event.data]);};";
        break;
      default:
        codet="onmessage=event=>{"+
              "let arr=new Float64Array(event.data);"+
              "arr.sort((a,b)=>a-b);"+
              "postMessage(event.data,[event.data]);};";
    }
    let wst=[];
    for(let i=0;i<cpus;i++){
      wst[i]=new Worker("data:text/plain,"+escape(codet));
      let j=i;
      wst[i].onmessage=event=>{
        switch(typ){
          case "1":chunkswt[j]=new Float32Array(event.data);break;
          case "2":chunkswt[j]=new Uint32Array(event.data);break;
          default:chunkswt[j]=new Float64Array(event.data);
        }
        if(++cnt===cpus){
          console.log("Transfer:",Date.now()-start);
          if(len<=20)
            for(let i=0;i<cpus;i++)
              console.log(chunksw[i],chunkswt[i]);
        }
      };
    }
    
    // launch non-transfer measurement
    start=Date.now();
    for(let i=0;i<cpus;i++)
      ws[i].postMessage(chunksw[i]);

    This code is a bit messy because it is the buffer which can be transferred, not the typed arrays themselves, and also, while the second measurement is initialized as a direct copy-paste (which already isn't that pretty), it is then launched from inside the completion function of the first one.

    (I do not wish to provide exact measurement results because my PC is doing some other things too. Just run the snippets a couple times with varied or even repeated parameters)