Search code examples
javascriptweb-audio-apiopusweb-mediarecorderwebcodecs

Using WEB API AudioEncoder to output opus frames


I am interested in using the Web API AudioEncoder to produce audio chunks that are compatible with a standard Opus decoder, e.g. like this:

const stream = await navigator.mediaDevices.getUserMedia({audio: {}});
// Collect the recorded blobs; the original pushed into an undeclared global.
const audioChunks = [];
// Fix: MediaRecorder has no `rate` option (it was silently ignored). The
// capture sample rate is a track/AudioContext concern; MediaRecorder only
// accepts options such as `mimeType` and `audioBitsPerSecond`.
const rec = new MediaRecorder(stream);
rec.ondataavailable = e => {
  audioChunks.push(e.data);
  console.log(e.data);
};

This will usually give the data only when I stop recording; alternatively, I can do

const context = new AudioContext({sampleRate: 16000});
// Fix: the AudioContext factory method is createMediaStreamSource;
// `createMediaSource` does not exist.
const mediaSource = context.createMediaStreamSource(stream);
const bufferLength = 1280;
const node = context.createScriptProcessor(bufferLength, 2, 2);
node.onaudioprocess = (e) => {
  // Fix: the event property is `inputBuffer` (original typo: `inputBUffer`).
  const data = e.inputBuffer.getChannelData(0);
  console.log(data);
};
// Fix: connect the node this snippet actually created (`mediaSource`);
// the original referenced an undeclared `source`.
mediaSource.connect(node);
node.connect(context.destination);

That is more appropriate for my application because it gives a chunk of data every time the buffer fills; in this case 1280 samples at 16kHz corresponds to 80ms.

How can I record the input in an AudioEncoder-capable browser and, as I record, produce Opus frames that I can then decode using a standard Opus decoder?


Solution

  • Putting the pieces together

    If you are interested, here is a code snippet that (at the time I write this answer) will fail with DOMException: Invalid security origin unless it is served from a secure origin

    /**
     * Populate a dropdown with the available audio-input (microphone) devices.
     * Existing options are removed first, so the function is safe to re-run.
     * @param {HTMLSelectElement} select - dropdown to (re)fill
     * @returns {Promise<void>}
     */
    async function enumerateAudioDevices(select){
      while (select.firstChild) {
        select.removeChild(select.firstChild);
      }
      for(const deviceInfo of await navigator.mediaDevices.enumerateDevices()){
        // Fix: only build an <option> for audio inputs; the original created
        // one for every device and discarded the non-audio ones.
        if (deviceInfo.kind !== 'audioinput') continue;
        const option = document.createElement('option');
        option.value = deviceInfo.deviceId;
        // Labels are empty until the user has granted mic permission.
        option.text = deviceInfo.label || `microphone ${select.length + 1}`;
        select.appendChild(option);
      }
    }
    
    /**
     * Populate a dropdown with a fixed list of common audio sample rates.
     * Any options already present are removed first.
     * @param {HTMLSelectElement} select - dropdown to (re)fill
     */
    function fillCommonSampleRate(select){
      // Empty the dropdown before repopulating it.
      while (select.firstChild) {
        select.removeChild(select.firstChild);
      }
      const commonRates = [
        [16000, '16kHz'],
        [24000, '24kHz'],
        [32000, '32kHz'],
        [44100, '44.1kHz'],
        [48000, '48kHz'],
      ];
      commonRates.forEach(([value, label]) => {
        const option = document.createElement('option');
        option.value = value;
        option.text = label;
        select.appendChild(option);
      });
    }
    
    class CapturePipeline {
      /**
       * Capture microphone audio and encode it with a WebCodecs AudioEncoder.
       * @param {string} sourceId - deviceId of the microphone to use
       *   (falsy = browser default device)
       * @param {string} [codec='opus'] - WebCodecs audio codec string
       * @param {number} [sampleRate=16000] - capture/encode rate in Hz
       */
      constructor(sourceId, codec='opus', sampleRate=16000){
        this.sampleRate = sampleRate;
        this.codec = codec;
        this.sourceId = sourceId;
        /**
         * Observer invoked with every raw frame before it is encoded.
         * @type {?((audioData: AudioData) => any)}
         */
        this.onrawdata = null;
        /**
         * Observer invoked with every encoded chunk the encoder emits.
         * @type {?((chunk: EncodedAudioChunk, metadata: Object) => any)}
         */
        this.onencoded = null;
      }
      /**
       * Open the microphone, build the audio graph, configure the encoder
       * and start pumping raw frames into it.
       * @returns {Promise<void>}
       */
      async connect(){
        // Fix: the constructor stores `sourceId`, but the original read the
        // never-assigned `this.deviceId`, so device selection was silently
        // ignored. Also fixed the constraint shape: it is
        // {deviceId: {exact: id}}, not {exact: {deviceId: id}}.
        const mic = navigator.mediaDevices.getUserMedia(this.sourceId ? {
          audio: { deviceId: { exact: this.sourceId } }
        } : {audio: true})
        /**
         * @type {AudioContext}
         */
        // Fix: feature-detect via `window.*` -- a bare `AudioContext ||
        // webkitAudioContext` throws ReferenceError when the first name is
        // undefined. (Dropped `numberOfChannels`: it is not an AudioContext
        // option; channel count is forced to mono on the destination below.)
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
          sampleRate: this.sampleRate,
          latencyHint: 'interactive'
        })
        this.mic = await mic;
        /**
         * @type {MediaStreamAudioSourceNode}
         */
        this.source = this.audioContext.createMediaStreamSource(this.mic)
        /**
         * @type {MediaStreamAudioDestinationNode}
         */
        this.destination = this.audioContext.createMediaStreamDestination()
        this.destination.channelCount = 1; // downmix to mono for the encoder
        this.source.connect(this.destination)
        /**
         * @type {AudioEncoder}
         */
        this.encoder = new AudioEncoder({
          output: this.handleEncodedData.bind(this),
          error: this.handleEncodingError.bind(this)
        })
        this.encoder.configure({
          codec: this.codec,
          numberOfChannels: 1,
          sampleRate: this.sampleRate
        })
        /**
         * @type {MediaStreamTrackProcessor}
         */
        this.audioTrackProcessor = new MediaStreamTrackProcessor({
          track: this.destination.stream.getAudioTracks()[0]
        })
        // NOTE(review): pipeTo's promise is deliberately not awaited; errors
        // surface through handleEncodingError or the stream closing.
        this.audioTrackProcessor.readable.pipeTo(new WritableStream({
          write: this.handleRawData.bind(this)
        }))
      }
      /**
       * Stop capturing and release every resource acquired by connect().
       */
      disconnect(){
        this.source.disconnect();
        // Fix: the original only detached the graph node, leaving the
        // microphone track, the encoder and the AudioContext alive (the
        // browser's mic indicator stayed on). Stop/close them explicitly.
        this.mic.getTracks().forEach(track => track.stop());
        this.encoder.close();
        this.audioContext.close();
        delete this.audioTrackProcessor;
        delete this.encoder;
        delete this.destination;
        delete this.mic;
        delete this.source;
      }
      /**
       * AudioEncoder "output" callback; forwards the chunk to `onencoded`.
       * (Removed dead code: the original copied the chunk into a fresh
       * ArrayBuffer and immediately discarded it.)
       * @param {EncodedAudioChunk} chunk
       * @param {Object} metadata - per-chunk encoding metadata
       */
      handleEncodedData(chunk, metadata){
        if(this.onencoded){
          this.onencoded(chunk, metadata)
        }
      }
      /**
       * AudioEncoder "error" callback.
       * @param {DOMException} e
       */
      handleEncodingError(e){
        console.log(e);
      }
      /**
       * WritableStream sink: hand each raw frame to the observer and the
       * encoder, then release it (AudioData must be closed explicitly).
       * @param {AudioData} audioData 
       */
      handleRawData(audioData){
        if(this.onrawdata){
          this.onrawdata(audioData)
        }
        this.encoder.encode(audioData)
        audioData.close()
      }
    }
    
    
    //////////////////////////////////////////////////////////////////////
    
    window.addEventListener('load', setup)
    /**
     * Wire up the demo page: fill the dropdowns, start the bar-graph refresh
     * timer, and hook the start button up to a CapturePipeline.
     */
    function setup(){
      const audioSourceSelector = document.body.querySelector('select#audio-source')
      const audioCodecSelector = document.body.querySelector('select#audio-codec')
      const sampleFrequencySelector = document.body.querySelector('select#sample-frequency')
      const startRecordingBtn = document.querySelector('button#start-recording')
      const volumeBar = document.querySelector('div#volume-bar')
      const encodedLengthBar = document.body.querySelector('div#encoded-length')
      const rawLengthBar = document.body.querySelector('div#raw-length');
      /**
       * @type {?CapturePipeline}
       */
      let pipeline = null;
      let rawLength = 0;
      let encodedLength = 0;
      // Reusable scratch buffer for RMS computation; grown on demand.
      let audioSampleArray = new Float32Array(0)
      // Refresh the two byte-count bars four times a second.
      setInterval(() => {
        if(rawLength > 0 && encodedLength > 0){
          const ref = Math.max(encodedLength, rawLength)
          encodedLengthBar.style.width = (90 * encodedLength / ref).toFixed(2) + '%';
          encodedLengthBar.textContent = (encodedLength / 1024).toFixed(1) + 'kB'
          rawLengthBar.style.width = (90 * rawLength / ref).toFixed(2) + '%';
          rawLengthBar.textContent = (rawLength / 1024).toFixed(1) + 'kB'
        }
      }, 250)
    
      enumerateAudioDevices(audioSourceSelector)
      fillCommonSampleRate(sampleFrequencySelector)
      // Changing any capture parameter invalidates the running pipeline.
      for(const select of [
        audioSourceSelector, 
        audioCodecSelector, 
        sampleFrequencySelector]
      ) {
        select.addEventListener('change', e => {
          if(pipeline)pipeline.disconnect();
          pipeline = null;
        })
      }
      startRecordingBtn.addEventListener('click', async e => {
        // Fix: tear down any pipeline from a previous click so repeated
        // presses of "start recording" do not stack live microphones.
        if(pipeline)pipeline.disconnect();
        pipeline = new CapturePipeline(
          audioSourceSelector.value,
          audioCodecSelector.value,
          +sampleFrequencySelector.value
        )
        pipeline.onrawdata = (audioData) => {
          // 2 bytes per frame: the raw-size bar models s16 mono PCM.
          rawLength += audioData.numberOfFrames * 2;
          if(audioData.numberOfFrames > audioSampleArray.length){
            audioSampleArray = new Float32Array(audioData.numberOfFrames)
          }
          audioData.copyTo(audioSampleArray, {planeIndex: 0});
          // Fix: compute RMS over only the samples of THIS frame -- the
          // original averaged over the whole scratch buffer, including stale
          // tail samples from earlier, longer frames. Also give reduce an
          // initial value so a zero-length frame cannot throw.
          const frame = audioSampleArray.subarray(0, audioData.numberOfFrames);
          const sumSquares = frame.reduce((a, x) => a + x * x, 0);
          const rms = frame.length > 0 ? Math.sqrt(sumSquares / frame.length) : 0;
          volumeBar.style.width = (rms * 500) + 'px'
        }
        pipeline.onencoded = (chunk) => {
          encodedLength += chunk.byteLength
        }
        await pipeline.connect()
      })
    }
    // Unnecessary, but if it looks better, why not?
    
    
    /* Centered card used for status/banner text. */
    #banner-message {
      background: #fff;
      border-radius: 4px;
      padding: 20px;
      font-size: 25px;
      text-align: center;
      transition: all 0.2s;
      margin: 0 auto;
      width: 300px;
    }
    
    /* Primary action buttons (e.g. "start recording"). */
    button {
      background: #0084ff;
      border: none;
      border-radius: 5px;
      padding: 8px 14px;
      font-size: 15px;
      color: #fff;
    }
    
    /* Inverted (alternate) banner variant. */
    #banner-message.alt {
      background: #0084ff;
      color: #fff;
      margin-top: 40px;
      width: 200px;
    }
    
    #banner-message.alt button {
      background: #fff;
      color: #000;
    }
    
    /* Green-to-red gradient strip revealed by the volume mask below. */
    .horizontal-heat {
      width: 500px;
      height: 10px;
      background: linear-gradient(to right, green, yellow, red);
    }
    /* The JS sets this mask's width to show part of the heat gradient. */
    div.mask {
      width:500px; height: 10px; overflow:hidden;
    }
    
    /* Horizontal bars whose width tracks raw/encoded byte counts. */
    div.plot-bar {
      background-color:#40c4ff;
      border: 1px solid #404080;
      overflow: visible;
      white-space: pre;
    }
    <div>
      <!-- Capture settings: device, sample rate and codec dropdowns are
           filled/driven by setup() in the script above. -->
      <form>
        <fieldset><legend>RECORD AUDIO</legend>
        <select id="audio-source"></select>
        <select id="sample-frequency"></select>
        <select id="audio-codec">
          <option value="opus" selected>Opus</option>
          <option value="vorbis">Vorbis</option>
          <option value="mp3">MP3</option>
          <option value="alaw">A-law PCM</option>
          <option value="ulaw">&mu;-law PCM</option>
          <option value="pcm">Linear PCM</option>
        </select>
        </fieldset>
      </form>
    
      <button id="start-recording">start recording</button>
      <hr>
    
      <!-- Live RMS meter: the mask width is set from onrawdata. -->
      Volume
      <div id='volume-bar' class="mask">
      <div class="horizontal-heat">
      </div>
      </div>
      
      <!-- Byte-count bars comparing encoded size vs. raw s16 PCM size. -->
      <div>
        Encoded length
        <div id="encoded-length" class="plot-bar"></div>
        s16 PCM Wave length
        <div id="raw-length" class="plot-bar" style="width: 90%"></div>
      </div>
    </div>