Search code examples
pythonpyaudioopen-sesame

Voice detection and recording with PyAudio


I am using OpenSesame (an experiment builder based on Python) together with PyAudio to collect participants' voice responses to specific stimuli.

What I want to achieve is the following:

  1. Start recording sound for X seconds (timeout).
  2. Compute "loudness" of sound and compare to a pre-defined threshold.
  3. Get the time when the sound reaches the threshold if it does.
  4. Save the sound in a .wav file when recording is done.

It works quite well, except that the recording stops as soon as the threshold is reached.

How can I keep recording the sound (and keep computing the loudness) after the threshold is reached, for example until the timeout?

The code was adapted from [here][1]. Here is my code:

import pyaudio
import struct
import math 
import wave

# --- Response-detection settings -------------------------------------------
# Upper bound on the recording loop; compared against differences of
# clock.time() readings (presumably milliseconds -- confirm for the clock used).
timeout = 5000
# RMS loudness (as returned by get_rms) above which a response is detected.
sound_threshold = 0.001

# --- Audio capture settings --------------------------------------------------
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
# Passed as frames_per_buffer when opening the stream and as the size of
# every stream.read() call.
CHUNK = 1024
# Factor applied to each 16-bit sample in get_rms before squaring.
SHORT_NORMALIZE = 1.0 / 32768.0
# Not referenced by the code shown here; the loop is bounded by `timeout`.
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()

def get_rms(block):
    """Return the root mean square of a block of audio as a loudness measure.

    Args:
        block: Raw audio bytes, read as signed 16-bit samples in native byte
            order (struct format ``h``), two bytes per sample.

    Returns:
        The RMS of the samples after each is scaled by SHORT_NORMALIZE, or
        0.0 when the block is empty.

    Raises:
        struct.error: If the length of ``block`` is not a multiple of two.
    """
    if not block:
        # No samples to average; the division below would otherwise fail.
        return 0.0

    # Floor division keeps the sample count an integer on Python 2 and 3.
    count = len(block) // 2
    shorts = struct.unpack("%dh" % count, block)

    sum_squares = 0.0
    for sample in shorts:
        n = sample * SHORT_NORMALIZE
        sum_squares += n * n
    return math.sqrt(sum_squares / count)

# Open the input stream that the loop below reads from.
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)

print("* recording")
frames = []

start_time = clock.time()
while True:
    # Give up once `timeout` has elapsed without the threshold being crossed.
    if clock.time() - start_time >= timeout:
        var.response_time = timeout
        var.response = u'timeout'
        var.loudness = None
        var.in_clock_time = clock.time()
        var.start = start_time
        break
    try:
        block = stream.read(CHUNK)
    except IOError as e:
        # Nothing new was read: skip this iteration so a stale (or, on the
        # first pass, undefined) block is not measured.
        print(e)
        continue
    frames.append(block)
    loudness = get_rms(block)
    print(loudness)
    if loudness > sound_threshold:
        var.response_time = clock.time() - start_time
        var.response = u'detected'
        var.in_clock_time = clock.time()
        var.start = start_time
        var.loudness = loudness
        # Leaving the loop here also ends the recording: no further blocks
        # are appended to `frames` after the threshold is crossed.
        break

# The results are stored on `var`; bare `response` / `response_time` are
# never assigned in this script.
print(var.response)
print(var.response_time)
print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()

# Write every block that was read to a WAV file.
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
´´´

  [1]: https://forum.cogsci.nl/discussion/1772/detecting-voice-onsets-voicekey

Solution

  • Here is how I solved the issue (although I'm not sure it's the optimal way to do it):

    1. I made the `while` loop run on a timer, so that it keeps going until the timeout is reached

    2. I created two lists that collect the loudness and the `clock.time()` value at each iteration

    3. I extracted the first time at which the loudness threshold is reached with:

      var.response_time = (df.loc[df['loudness'] > 
      sound_threshold,'clock_time'].iloc[0])- start_time
      
    4. I used `try`/`except` to handle the case where the timeout is reached without a response

    Here is the full code:

    import pyaudio
    import struct
    import math 
    import wave
    import pandas as pd
    
    # --- Response-detection settings ---------------------------------------
    # Upper bound on the recording loop; compared against differences of
    # clock.time() readings (presumably milliseconds -- confirm).
    timeout = 5000
    # RMS loudness (as returned by get_rms) that counts as a response.
    sound_threshold = 0.001

    # --- Audio capture settings ----------------------------------------------
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    # Factor applied to each 16-bit sample in get_rms before squaring.
    SHORT_NORMALIZE = 1.0 / 32768.0
    # Passed as frames_per_buffer when the stream is opened.
    INPUT_BLOCK_TIME = 0.01
    INPUT_FRAMES_PER_BLOCK = int(RATE * INPUT_BLOCK_TIME)
    # Size handed to each stream.read() call in the recording loop.
    chunk = 1024
    filename = "output.wav"

    p = pyaudio.PyAudio()
    def get_rms(block):
        """Return the root mean square of a block of audio as a loudness measure.

        Args:
            block: Raw audio bytes, read as signed 16-bit samples in native
                byte order (struct format ``h``), two bytes per sample.

        Returns:
            The RMS of the samples after each is scaled by SHORT_NORMALIZE,
            or 0.0 when the block is empty.

        Raises:
            struct.error: If the length of ``block`` is not a multiple of two.
        """
        if not block:
            # No samples to average; the division below would otherwise fail.
            return 0.0

        # Floor division keeps the sample count an integer on Python 2 and 3.
        count = len(block) // 2
        shorts = struct.unpack("%dh" % count, block)

        sum_squares = 0.0
        for sample in shorts:
            n = sample * SHORT_NORMALIZE
            sum_squares += n * n
        return math.sqrt(sum_squares / count)
    
    # Open the mic
    stream = p.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True, input_device_index=0,
                    frames_per_buffer=INPUT_FRAMES_PER_BLOCK)

    # Record until the timeout. Crossing the loudness threshold does NOT end
    # the loop; the onset is looked up afterwards from the logged values.
    print("* recording")
    frames = []
    list_ldn = []
    list_ct = []

    # For every block read, log its loudness and the clock time after the read.
    start_time = clock.time()
    while clock.time() - start_time <= timeout:
        try:
            block = stream.read(chunk)
        except IOError as e:
            # Nothing new was read: skip this iteration so a stale (or, on
            # the first pass, undefined) block is not measured and logged.
            print(e)
            continue
        frames.append(block)
        list_ldn.append(get_rms(block))
        list_ct.append(clock.time())

    # Close the audio stream
    stream.stop_stream()
    stream.close()
    p.terminate()

    # Write every block that was read to a WAV file.
    wf = wave.open(filename, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

    # Merge the two lists into a DataFrame and derive response_time / response.
    df = pd.DataFrame({'clock_time': list_ct, 'loudness': list_ldn})
    try:
        # Clock time of the first block whose loudness exceeds the threshold.
        var.response_time = (df.loc[df['loudness'] > sound_threshold,
                                    'clock_time'].iloc[0]) - start_time
        var.response = u'detected'
    except IndexError:
        # No block exceeded the threshold, so .iloc[0] had nothing to return.
        var.response_time = timeout
        var.response = u'timeout'