Search code examples
javascriptweb-audio-apiopusweb-mediarecorderwebcodecs

Using WEB API AudioEncoder to output opus frames


I am interested in using the Web API AudioEncoder to produce audio chunks that are compatible with a standard Opus decoder, e.g. like this:

const stream = await navigator.mediaDevices.getUserMedia({audio: {}});
// Collect the recorded blobs; the original pushed into an undeclared global.
const audioChunks = [];
// Fix: MediaRecorder has no `rate` option (it was silently ignored). The
// capture sample rate is a track/AudioContext concern; MediaRecorder only
// accepts options such as `mimeType` and `audioBitsPerSecond`.
const rec = new MediaRecorder(stream);
rec.ondataavailable = e => {
  audioChunks.push(e.data);
  console.log(e.data);
};

This will usually give the data only when I stop recording; alternatively, I can do

const context = new AudioContext({sampleRate: 16000});
// Fix: the AudioContext factory method is createMediaStreamSource;
// `createMediaSource` does not exist.
const mediaSource = context.createMediaStreamSource(stream);
const bufferLength = 1280;
const node = context.createScriptProcessor(bufferLength, 2, 2);
node.onaudioprocess = (e) => {
  // Fix: the event property is `inputBuffer` (original typo: `inputBUffer`).
  const data = e.inputBuffer.getChannelData(0);
  console.log(data);
};
// Fix: connect the node this snippet actually created (`mediaSource`);
// the original referenced an undeclared `source`.
mediaSource.connect(node);
node.connect(context.destination);

That is more appropriate for my application because it gives a chunk of data every time the buffer fills; in this case 1280 samples at 16kHz corresponds to 80ms.

How can I record the input in an AudioEncoder-capable browser and, as I record, produce Opus frames that I can then decode using a standard Opus decoder?


Solution

  • Putting the pieces together

    If you are interested, here is a code snippet that (at the time I write this answer) will fail with DOMException: Invalid security origin unless it is served from a secure origin

    /**
     * Populate a dropdown with the available audio-input (microphone) devices.
     * Existing options are removed first, so the function is safe to re-run.
     * @param {HTMLSelectElement} select - dropdown to (re)fill
     * @returns {Promise<void>}
     */
    async function enumerateAudioDevices(select){
      while (select.firstChild) {
        select.removeChild(select.firstChild);
      }
      for(const deviceInfo of await navigator.mediaDevices.enumerateDevices()){
        // Fix: only build an <option> for audio inputs; the original created
        // one for every device and discarded the non-audio ones.
        if (deviceInfo.kind !== 'audioinput') continue;
        const option = document.createElement('option');
        option.value = deviceInfo.deviceId;
        // Labels are empty until the user has granted mic permission.
        option.text = deviceInfo.label || `microphone ${select.length + 1}`;
        select.appendChild(option);
      }
    }
    
    /**
     * Populate a dropdown with a fixed list of common audio sample rates.
     * Any options already present are removed first.
     * @param {HTMLSelectElement} select - dropdown to (re)fill
     */
    function fillCommonSampleRate(select){
      // Empty the dropdown before repopulating it.
      while (select.firstChild) {
        select.removeChild(select.firstChild);
      }
      const commonRates = [
        [16000, '16kHz'],
        [24000, '24kHz'],
        [32000, '32kHz'],
        [44100, '44.1kHz'],
        [48000, '48kHz'],
      ];
      commonRates.forEach(([value, label]) => {
        const option = document.createElement('option');
        option.value = value;
        option.text = label;
        select.appendChild(option);
      });
    }
    
    class CapturePipeline {
      /**
       * Capture microphone audio and encode it with a WebCodecs AudioEncoder.
       * @param {string} sourceId - deviceId of the microphone to use
       *   (falsy = browser default device)
       * @param {string} [codec='opus'] - WebCodecs audio codec string
       * @param {number} [sampleRate=16000] - capture/encode rate in Hz
       */
      constructor(sourceId, codec='opus', sampleRate=16000){
        this.sampleRate = sampleRate;
        this.codec = codec;
        this.sourceId = sourceId;
        /**
         * Observer invoked with every raw frame before it is encoded.
         * @type {?((audioData: AudioData) => any)}
         */
        this.onrawdata = null;
        /**
         * Observer invoked with every encoded chunk the encoder emits.
         * @type {?((chunk: EncodedAudioChunk, metadata: Object) => any)}
         */
        this.onencoded = null;
      }
      /**
       * Open the microphone, build the audio graph, configure the encoder
       * and start pumping raw frames into it.
       * @returns {Promise<void>}
       */
      async connect(){
        // Fix: the constructor stores `sourceId`, but the original read the
        // never-assigned `this.deviceId`, so device selection was silently
        // ignored. Also fixed the constraint shape: it is
        // {deviceId: {exact: id}}, not {exact: {deviceId: id}}.
        const mic = navigator.mediaDevices.getUserMedia(this.sourceId ? {
          audio: { deviceId: { exact: this.sourceId } }
        } : {audio: true})
        /**
         * @type {AudioContext}
         */
        // Fix: feature-detect via `window.*` -- a bare `AudioContext ||
        // webkitAudioContext` throws ReferenceError when the first name is
        // undefined. (Dropped `numberOfChannels`: it is not an AudioContext
        // option; channel count is forced to mono on the destination below.)
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
          sampleRate: this.sampleRate,
          latencyHint: 'interactive'
        })
        this.mic = await mic;
        /**
         * @type {MediaStreamAudioSourceNode}
         */
        this.source = this.audioContext.createMediaStreamSource(this.mic)
        /**
         * @type {MediaStreamAudioDestinationNode}
         */
        this.destination = this.audioContext.createMediaStreamDestination()
        this.destination.channelCount = 1; // downmix to mono for the encoder
        this.source.connect(this.destination)
        /**
         * @type {AudioEncoder}
         */
        this.encoder = new AudioEncoder({
          output: this.handleEncodedData.bind(this),
          error: this.handleEncodingError.bind(this)
        })
        this.encoder.configure({
          codec: this.codec,
          numberOfChannels: 1,
          sampleRate: this.sampleRate
        })
        /**
         * @type {MediaStreamTrackProcessor}
         */
        this.audioTrackProcessor = new MediaStreamTrackProcessor({
          track: this.destination.stream.getAudioTracks()[0]
        })
        // NOTE(review): pipeTo's promise is deliberately not awaited; errors
        // surface through handleEncodingError or the stream closing.
        this.audioTrackProcessor.readable.pipeTo(new WritableStream({
          write: this.handleRawData.bind(this)
        }))
      }
      /**
       * Stop capturing and release every resource acquired by connect().
       */
      disconnect(){
        this.source.disconnect();
        // Fix: the original only detached the graph node, leaving the
        // microphone track, the encoder and the AudioContext alive (the
        // browser's mic indicator stayed on). Stop/close them explicitly.
        this.mic.getTracks().forEach(track => track.stop());
        this.encoder.close();
        this.audioContext.close();
        delete this.audioTrackProcessor;
        delete this.encoder;
        delete this.destination;
        delete this.mic;
        delete this.source;
      }
      /**
       * AudioEncoder "output" callback; forwards the chunk to `onencoded`.
       * (Removed dead code: the original copied the chunk into a fresh
       * ArrayBuffer and immediately discarded it.)
       * @param {EncodedAudioChunk} chunk
       * @param {Object} metadata - per-chunk encoding metadata
       */
      handleEncodedData(chunk, metadata){
        if(this.onencoded){
          this.onencoded(chunk, metadata)
        }
      }
      /**
       * AudioEncoder "error" callback.
       * @param {DOMException} e
       */
      handleEncodingError(e){
        console.log(e);
      }
      /**
       * WritableStream sink: hand each raw frame to the observer and the
       * encoder, then release it (AudioData must be closed explicitly).
       * @param {AudioData} audioData 
       */
      handleRawData(audioData){
        if(this.onrawdata){
          this.onrawdata(audioData)
        }
        this.encoder.encode(audioData)
        audioData.close()
      }
    }
    
    
    //////////////////////////////////////////////////////////////////////
    
    window.addEventListener('load', setup)
    /**
     * Wire up the demo page: fill the dropdowns, start the bar-graph refresh
     * timer, and hook the start button up to a CapturePipeline.
     */
    function setup(){
      const audioSourceSelector = document.body.querySelector('select#audio-source')
      const audioCodecSelector = document.body.querySelector('select#audio-codec')
      const sampleFrequencySelector = document.body.querySelector('select#sample-frequency')
      const startRecordingBtn = document.querySelector('button#start-recording')
      const volumeBar = document.querySelector('div#volume-bar')
      const encodedLengthBar = document.body.querySelector('div#encoded-length')
      const rawLengthBar = document.body.querySelector('div#raw-length');
      /**
       * @type {?CapturePipeline}
       */
      let pipeline = null;
      let rawLength = 0;
      let encodedLength = 0;
      // Reusable scratch buffer for RMS computation; grown on demand.
      let audioSampleArray = new Float32Array(0)
      // Refresh the two byte-count bars four times a second.
      setInterval(() => {
        if(rawLength > 0 && encodedLength > 0){
          const ref = Math.max(encodedLength, rawLength)
          encodedLengthBar.style.width = (90 * encodedLength / ref).toFixed(2) + '%';
          encodedLengthBar.textContent = (encodedLength / 1024).toFixed(1) + 'kB'
          rawLengthBar.style.width = (90 * rawLength / ref).toFixed(2) + '%';
          rawLengthBar.textContent = (rawLength / 1024).toFixed(1) + 'kB'
        }
      }, 250)
    
      enumerateAudioDevices(audioSourceSelector)
      fillCommonSampleRate(sampleFrequencySelector)
      // Changing any capture parameter invalidates the running pipeline.
      for(const select of [
        audioSourceSelector, 
        audioCodecSelector, 
        sampleFrequencySelector]
      ) {
        select.addEventListener('change', e => {
          if(pipeline)pipeline.disconnect();
          pipeline = null;
        })
      }
      startRecordingBtn.addEventListener('click', async e => {
        // Fix: tear down any pipeline from a previous click so repeated
        // presses of "start recording" do not stack live microphones.
        if(pipeline)pipeline.disconnect();
        pipeline = new CapturePipeline(
          audioSourceSelector.value,
          audioCodecSelector.value,
          +sampleFrequencySelector.value
        )
        pipeline.onrawdata = (audioData) => {
          // 2 bytes per frame: the raw-size bar models s16 mono PCM.
          rawLength += audioData.numberOfFrames * 2;
          if(audioData.numberOfFrames > audioSampleArray.length){
            audioSampleArray = new Float32Array(audioData.numberOfFrames)
          }
          audioData.copyTo(audioSampleArray, {planeIndex: 0});
          // Fix: compute RMS over only the samples of THIS frame -- the
          // original averaged over the whole scratch buffer, including stale
          // tail samples from earlier, longer frames. Also give reduce an
          // initial value so a zero-length frame cannot throw.
          const frame = audioSampleArray.subarray(0, audioData.numberOfFrames);
          const sumSquares = frame.reduce((a, x) => a + x * x, 0);
          const rms = frame.length > 0 ? Math.sqrt(sumSquares / frame.length) : 0;
          volumeBar.style.width = (rms * 500) + 'px'
        }
        pipeline.onencoded = (chunk) => {
          encodedLength += chunk.byteLength
        }
        await pipeline.connect()
      })
    }
    // Unnecessary, but if it looks better, why not?
    
    
    /* Centered card used for status/banner text. */
    #banner-message {
      background: #fff;
      border-radius: 4px;
      padding: 20px;
      font-size: 25px;
      text-align: center;
      transition: all 0.2s;
      margin: 0 auto;
      width: 300px;
    }
    
    /* Primary action buttons (e.g. "start recording"). */
    button {
      background: #0084ff;
      border: none;
      border-radius: 5px;
      padding: 8px 14px;
      font-size: 15px;
      color: #fff;
    }
    
    /* Inverted (alternate) banner variant. */
    #banner-message.alt {
      background: #0084ff;
      color: #fff;
      margin-top: 40px;
      width: 200px;
    }
    
    #banner-message.alt button {
      background: #fff;
      color: #000;
    }
    
    /* Green-to-red gradient strip revealed by the volume mask below. */
    .horizontal-heat {
      width: 500px;
      height: 10px;
      background: linear-gradient(to right, green, yellow, red);
    }
    /* The JS sets this mask's width to show part of the heat gradient. */
    div.mask {
      width:500px; height: 10px; overflow:hidden;
    }
    
    /* Horizontal bars whose width tracks raw/encoded byte counts. */
    div.plot-bar {
      background-color:#40c4ff;
      border: 1px solid #404080;
      overflow: visible;
      white-space: pre;
    }
    <div>
      <!-- Capture settings: device, sample rate and codec dropdowns are
           filled/driven by setup() in the script above. -->
      <form>
        <fieldset><legend>RECORD AUDIO</legend>
        <select id="audio-source"></select>
        <select id="sample-frequency"></select>
        <select id="audio-codec">
          <option value="opus" selected>Opus</option>
          <option value="vorbis">Vorbis</option>
          <option value="mp3">MP3</option>
          <option value="alaw">A-law PCM</option>
          <option value="ulaw">&mu;-law PCM</option>
          <option value="pcm">Linear PCM</option>
        </select>
        </fieldset>
      </form>
    
      <button id="start-recording">start recording</button>
      <hr>
    
      <!-- Live RMS meter: the mask width is set from onrawdata. -->
      Volume
      <div id='volume-bar' class="mask">
      <div class="horizontal-heat">
      </div>
      </div>
      
      <!-- Byte-count bars comparing encoded size vs. raw s16 PCM size. -->
      <div>
        Encoded length
        <div id="encoded-length" class="plot-bar"></div>
        s16 PCM Wave length
        <div id="raw-length" class="plot-bar" style="width: 90%"></div>
      </div>
    </div>