Search code examples
node.js discord.js opus

How to convert from prism-media(@discordjs/opus) opus stream to format suitable for picovoice/porcupine?


I have created a discord bot using discord.js and am attempting to implement basic voice capabilities using porcupine.

I have a stream of audio per user and am trying to use the process(frame) method in porcupine on each chunk of data.
To get single-channel audio at a 16 kHz sample rate, I am manually decoding the stream with the prism-media Opus decoder and then trying to pass the decoded chunks in:

execute(connection, user, args) {
        userHandlers[user] = new Porcupine([GRASSHOPPER, BUMBLEBEE], [0.5, 0.65]);
        if (!receiver) {
            receiver = connection.receiver;
        }

        userStreams[user] = receiver.createStream(user, {mode: 'opus', end: 'manual'});
        const decoder = new prism.opus.Decoder({frameSize: 640, channels: 1, rate: 16000});
        
        userStreams[user]
        .pipe(decoder);
        
        listeningToUsers[user] = true;

        try {
            console.log("Start utterance");
            decoder.on('data', (chunk) => {//Need to make stream single channel, frame size 512
                let keywordIndex = userHandlers[user].process(chunk);

                if (keywordIndex != -1) {
                    meme.execute(connection, null, args);
                }
            });
        } catch (error) {
            console.error(error);
        }
    },

My issue, however, is that the size of the chunk is 640 whereas it needs to be 512 for the method to work. Changing the frameSize that is passed to the decoder doesn't work due to reasons explained in the answer here.

If anyone knows the best way to convert my data to the correct chunk size, or just a better way of doing this altogether, I'd appreciate it.


Solution

  • I ended up getting this working by using some of the code included in this demo file.

    We include a chunkArray function:

    /**
     * Split `array` into consecutive slices of at most `size` elements.
     * Every slice has exactly `size` elements except possibly the last one,
     * which holds whatever remains.
     *
     * @param array anything exposing `length` and `slice(start, end)`
     * @param size number of elements per slice
     * @returns an array of slices, in original order
     */
    function chunkArray(array, size) {
        const chunkCount = Math.ceil(array.length / size);
        const sliceAt = (_unused, chunkIndex) => {
            const start = chunkIndex * size;
            return array.slice(start, start + size);
        };
        return Array.from({ length: chunkCount }, sliceAt);
    }
    

    and change the code posted before to look like this:

    execute(connection, user, args) {
            userHandlers[user] = new Porcupine([GRASSHOPPER, BLUEBERRY], [0.7, 0.85]);
            const frameLength = userHandlers[user].frameLength;
            if (!receiver) {
                receiver = connection.receiver;
            }
            userStreams[user] = receiver.createStream(user, {mode: 'opus', end: 'manual'});
            userDecoders[user] = new prism.opus.Decoder({frameSize: 640, channels: 1, rate: 16000});
            
            userStreams[user]
            .pipe(userDecoders[user]);
            
            listeningToUsers[user] = true;
            userFrameAccumulators[user] = [];
            try {
                userDecoders[user].on('data', (data) => {
                    // Two bytes per Int16 from the data buffer
                    let newFrames16 = new Array(data.length / 2);
                    for (let i = 0; i < data.length; i += 2) {
                        newFrames16[i / 2] = data.readInt16LE(i);
                    }
                    // Split the incoming PCM integer data into arrays of size Porcupine.frameLength. If there's insufficient frames, or a remainder,
                    // store it in 'frameAccumulator' for the next iteration, so that we don't miss any audio data
                    userFrameAccumulators[user] = userFrameAccumulators[user].concat(newFrames16);
                    let frames = chunkArray(userFrameAccumulators[user], frameLength);
    
                    if (frames[frames.length - 1].length !== frameLength) {
                        // store remainder from divisions of frameLength
                        userFrameAccumulators[user] = frames.pop();
                    } else {
                        userFrameAccumulators[user] = [];
                    }
                    for (let frame of frames) {
                        let index = userHandlers[user].process(frame);
                        if (index !== -1) {
                            if (index == 0) {//GRASSHOPPER
                                play.execute(connection, null, args);
                            } else if (index == 1) {//BLUEBERRY
                                play.skip();
                            }
                        }
                    }
                });
            } catch (error) {
                console.error(error);
            }
        }
    

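    This takes the decoded PCM stream, converts each chunk to 16-bit integer samples, and regroups them into frames of the length Porcupine expects, carrying any remainder over to the next chunk.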

    I imagine there are inefficiencies here and improvements to be had, but it is working well in the discord server and I wanted to post an answer in case anyone is trying to integrate discord.js with porcupine in the future.