I am trying to run a Colab notebook that trains OpenAI's Jukebox; however, when I try to run the function that loads the audio, I get this error:
File "/content/jukebox/jukebox/data/files_dataset.py", line 82, in get_song_chunk
data, sr = load_audio(filename, sr=self.sr, offset=offset, duration=self.sample_length)
File "/content/jukebox/jukebox/utils/io.py", line 48, in load_audio
frame = frame.to_ndarray(format='fltp') # Convert to floats and not int16
AttributeError: 'list' object has no attribute 'to_ndarray'
It seems to be interpreting the frame input as a list, which when printed looks like this:
[<av.AudioFrame 0, pts=None, 778 samples at 22050Hz, stereo, fltp at 0x7fd03dd64150>]
When I try to change to frame = resampler.resample(frame) I get this error:
TypeError: 'av.audio.frame.AudioFrame' object cannot be interpreted as an integer
I don't really know much about audio files, so I'm not sure how to debug this and would appreciate help here.
The full code to load the audio is below.
def load_audio(file, sr, offset, duration, resample=True, approx=False, time_base='samples', check_duration=True):
    if time_base == 'sec':
        offset = offset * sr
        duration = duration * sr
    # Loads at target sr, stereo channels, seeks from offset, and stops after duration
    container = av.open(file)
    audio = container.streams.get(audio=0)[0] # Only first audio stream
    audio_duration = audio.duration * float(audio.time_base)
    if approx:
        if offset + duration > audio_duration*sr:
            # Move back one window. Cap at audio_duration
            offset = np.min(audio_duration*sr - duration, offset - duration)
    else:
        if check_duration:
            assert offset + duration <= audio_duration*sr, f'End {offset + duration} beyond duration {audio_duration*sr}'
    if resample:
        resampler = av.AudioResampler(format='fltp',layout='stereo', rate=sr)
    else:
        assert sr == audio.sample_rate
    offset = int(offset / sr / float(audio.time_base)) #int(offset / float(audio.time_base)) # Use units of time_base for seeking
    duration = int(duration) #duration = int(duration * sr) # Use units of time_out ie 1/sr for returning
    sig = np.zeros((2, duration), dtype=np.float32)
    container.seek(offset, stream=audio)
    total_read = 0
    for frame in container.decode(audio=0): # Only first audio stream
        if resample:
            frame.pts = None
            frame = resampler.resample(frame)
        frame = frame.to_ndarray(format='fltp') # Convert to floats and not int16
        read = frame.shape[-1]
        if total_read + read > duration:
            read = duration - total_read
        sig[:, total_read:total_read + read] = frame[:, :read]
        total_read += read
        if total_read == duration:
            break
    assert total_read <= duration, f'Expected {duration} frames, got {total_read}'
    return sig, sr
If your variable frame is interpreted as a list, you could replace frame = resampler.resample(frame) with frame = resampler.resample(frame)[0]. Your code ran without errors once I made this edit.
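For context, this is the shape of the patched decode loop. Recent PyAV releases changed AudioResampler.resample() so that it returns a list of frames rather than a single frame, which would explain the error above (a minimal sketch of the edit, not an official Jukebox patch):
# Inside load_audio(), assuming a PyAV version where resample() returns a list of AudioFrames
for frame in container.decode(audio=0): # Only first audio stream
    if resample:
        frame.pts = None
        frame = resampler.resample(frame)[0] # take the single resampled frame out of the list
    frame = frame.to_ndarray(format='fltp') # now an AudioFrame again, so to_ndarray() works
    # ... rest of the loop unchanged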
Try replacing frame = frame.to_ndarray(format='fltp') with a direct conversion of the variable frame into an array:
import numpy as np
#frame = frame.to_ndarray(format='fltp') #Original line
frame = np.array(frame)
If you want a specific data type, you can set the dtype argument of the array function:
frame = np.array(frame, dtype=np.float32)
Try: frame = frame[0].to_ndarray(format='fltp')
I coded an experiment in which participants are presented with a series of visual stimuli (stim duration: 100ms, trial duration: 500ms). Simultaneously with the onset of the visual stimuli, there is a sound playing for 100 ms.
Some of the visual stimuli are targets and participants should press spacebar when they detect the target.
I want to know participants' reaction times to the target. So I store, using event.getKeys, the global time when the spacebar was pressed. I store a global time so I can compare the time of the onset of the trial with the time when the spacebar was pressed. I do that because my inter-trial interval is short and it can happen that participants respond to the target during the following trial.
The code seems to work when I comment out sd.play of the sound, but as soon as the sound is played, the reaction times seem off and the response is always stored in the trial following the target trial (even though I know I pressed the spacebar during the target trial).
Did anyone encounter this problem before?
Below is the code for the procedure:
def response_check(key):
    """
    Checks if a key was pressed.
    Keyword arguments:
    key -- containing either a keypress and a time or nothing (list)
    return:
    time -- nan if not pressed or time of press if pressed
    """
    if len(key) == 0:
        pressed = 0
    elif 'space' in key[0]:
        pressed = 1
    if pressed == 1:
        time = key[0][1]
    elif pressed == 0:
        time = 'nan'
    return str(time), pressed
for t in range(n_trials): # n_trials is the total amount of trials
    show_target_crosses(pauses, t, trial_paradigm[t], hi_targets, low_targets) # show target
    l_trial_start = globalClock.getTime()
    check4esc() # check for esc
    #set stimuli according to condition
    standing = visual.Rect(win=win, name='up_cross_hor', width=(dimentions[1]),
        height=(dimentions[0]), ori=0, pos=(0, 0), lineWidth=1,
        lineColor=colors[all_crosses[trial_paradigm[t]][t]],
        lineColorSpace='rgb', fillColor=colors[all_crosses[trial_paradigm[t]][t]],
        fillColorSpace='rgb', opacity=1, depth=0.0, interpolate=True)
    laying = visual.Rect(win=win, name='up_cross_hor', width=(dimentions[0]),
        height=(dimentions[1]), ori=0, pos=(0, position[all_crosses[trial_paradigm[t]][t]]), lineWidth=1,
        lineColor=colors[all_crosses[trial_paradigm[t]][t]],
        lineColorSpace='rgb', fillColor=colors[all_crosses[trial_paradigm[t]][t]],
        fillColorSpace='rgb', opacity=1, depth=0.0, interpolate=True)
    sd.play(all_sounds[all_paradigms[trial_paradigm[t]][t]], fs) # Play sound
    if first_seven[t] == 0:
        if all_responses[trial_paradigm[t]][t] == 0:
            trigger(trig_list[trial_paradigm[t]][all_paradigms[trial_paradigm[t]][t]],0.01) # send sound trigger
        elif all_responses[trial_paradigm[t]][t] == 1:
            trigger(trig_list_targets[trial_paradigm[t]][all_paradigms[trial_paradigm[t]][t]],0.01)
    core.wait(0.06) # adjust diode to sound delay
    standing.draw() # vertical bar
    laying.draw() # horizontal bar
    whiteOn.draw() # square
    win.flip() # show cross and white square for fotodiode
    core.wait(0.1) # show cross 100 ms
    win.flip() # turn visual stuff off
    core.wait(0.032) # adjust ITI
    l_fp = int(ok_data[0])
    l_block_nr = blocks[t]+1
    l_trial_nr = (range(367)*n_blocks)[t]+1
    l_condition = trial_paradigm[t]
    l_sound = all_sounds_names[all_paradigms[trial_paradigm[t]][t]]
    if first_seven[t] == 0:
        if all_responses[trial_paradigm[t]][t] == 0:
            l_trigger = trig_list[trial_paradigm[t]][all_paradigms[trial_paradigm[t]][t]] # send sound trigger
        elif all_responses[trial_paradigm[t]][t] == 1:
            l_trigger = trig_list_targets[trial_paradigm[t]][all_paradigms[trial_paradigm[t]][t]]
    elif first_seven[t] == 1:
        l_trigger = 999
    l_target = all_responses[trial_paradigm[t]][t]
    l_cross_condition = all_crosses[trial_paradigm[t]][t]
    key = event.getKeys(keyList = ['space'], timeStamped = globalClock)
    l_response_time = response_check(key)[0]
    # Save data to file
    #'fp\tblock_nr\ttrial_nr\tcondition\tsound\ttrigger\ttarget\tcross_cond\ttrial_start\tresponse_time\n'
    dataFile.write('%i\t%i\t%i\t%i\t%s\t%i\t%i\t%i\t%f\t%s\n' %(
        l_fp, l_block_nr, l_trial_nr, l_condition, l_sound, l_trigger,
        l_target, l_cross_condition, l_trial_start, l_response_time))
    paus(t, pauses, blocks, trig = 192) # check for pauses
=========== EDIT ============
Below I paste the MCVE version of the whole experiment:
from psychopy import visual
from psychopy import core, gui, data, event, parallel
import sounddevice as sd
import time, random, math, sys
import numpy as np
# Functions --------------------------------------------------------------------
def response_check(key):
    """
    Checks if a key was pressed.
    Keyword arguments:
    key -- containing either a keypress and a time or nothing (list)
    return:
    time -- nan if not pressed or time of press if pressed
    """
    if len(key) == 0:
        pressed = 0
    elif 'space' in key[0]:
        pressed = 1
    if pressed == 1:
        time = key[0][1]
    elif pressed == 0:
        time = 'nan'
    return str(time), pressed
def create_sinusoid (freq = 1000, phase = 0, fs = 48000, dur = 1):
    '''Create a sinusoid of specified length with amplitude -1 to 1. Use
    set_gain() and fade() to set amplitude and fade-in-out.
    Keyword arguments:
    frequency -- frequency in Hz (float)
    phase -- phase in radians (float)
    fs -- sampling frequency (int)
    duration -- duration of signal in seconds (float).
    Return:
    sinusoid -- monosignal of sinusoid (1xn numpy array)
    '''
    t = np.arange(0, dur, 1.0/fs) # Time vector
    sinusoid = np.sin(phase + 2*np.pi* freq * t) # Sinusoid (mono signal)
    return sinusoid
def fade(monosignal,samples):
    '''Apply a raised cosine to the start and end of a mono signal.
    Keyword arguments:
    monosignal -- vector (1xn numpy array).
    samples -- number of samples of the fade (integer). Make sure that:
               2*samples < len(monosignal)
    Return:
    out -- faded monosignal (1xn numpy array)
    '''
    ramps = 0.5*(1-np.cos(2*np.pi*(np.arange(2*samples))/(2*samples-1)))
    fadein = ramps[0:samples]
    fadeout = ramps[samples:len(ramps)+1]
    plateu = np.ones(len(monosignal)-2*samples)
    weight = np.concatenate((fadein,plateu,fadeout))
    out = weight*monosignal
    return out
def set_gain(mono, gaindb):
    ''' Set gain of mono signal, to get dB(rms) to specified gaindb
    Keyword arguments:
    mono -- vector (numpy array).
    gaindb -- gain of mono in dB re max = 0 dB (float).
    Return:
    gained -- monosignal (numpy array)
    '''
    rms = np.sqrt(np.mean(mono**2))
    adjust = gaindb - 20 * np.log10(rms)
    gained = 10**(adjust/20.0) * mono # don't forget to make 20 a float (20.0)
    # Print warning if overload, that is, if any abs(sample-value) > 1
    if (np.max(np.abs(gained)) > 1):
        message1 = "WARNING: set_gain() generated overloaded signal!"
        message2 = "max(abs(signal)) = " + str(np.max(np.abs(gained)))
        message3 = ("number of samples >1 = " +
                    str(np.sum(1 * (np.abs(gained) > 1))))
        print message1
        print message2
        print message3
    return gained
# Screen
win = visual.Window([800, 600], allowGUI = False, # [1920, 1080]
                    monitor = 'testMonitor', units = 'height', color = 'gray')
# ==============================================================================
# TONE ORDER AND RESPONSES ----------------------------------------------------
# 1 - 500 Hz
# 0 - 550 Hz
# 2 - 605 Hz
# 3 - 666 Hz
# 4 - 732 Hz
# 5 - 805 Hz
# 6 - 886 Hz
# 7 - 974 Hz
tone_order = np.random.choice([0,1,2,3,4,5,6,7], 20, replace = True)
targets = np.random.choice([1,0,0,0,0]*4, 20, replace = False)
# ==============================================================================
# CREATE SOUNDS ----------------------------------------------------------------
#sd.default.device = "ASIO Fireface USB"
print 'Sound device ------------------------------------------------------------'
print sd.query_devices()#device = "ASIO Fireface USB")
print '-------------------------------------------------------------------------'
# Set the gain and sampling frequency (fs)
gain = -30
fs = 44100
frequencies = [500, 550, 605, 666, 732, 805, 886, 974]
tones = [0]*8
for t in range(len(frequencies)):
    tones[t] = set_gain(fade(create_sinusoid(
        freq = frequencies[t], phase = 0, fs = fs, dur = 0.1),441),gain) # 100 ms, 10 ms fade in/out
f_500 = np.transpose(np.array([tones[0],tones[0]])) # deviant, control
f_550 = np.transpose(np.array([tones[1],tones[1]])) # standard
f_605 = np.transpose(np.array([tones[2],tones[2]]))
f_666 = np.transpose(np.array([tones[3],tones[3]]))
f_732 = np.transpose(np.array([tones[4],tones[4]]))
f_805 = np.transpose(np.array([tones[5],tones[5]]))
f_886 = np.transpose(np.array([tones[6],tones[6]]))
f_974 = np.transpose(np.array([tones[7],tones[7]]))
all_tones = [f_500, f_550, f_605, f_666, f_732, f_805, f_886, f_974]
# ==============================================================================
# CREATE VISUALS ---------------------------------------------------------------
stimulus = visual.TextStim(
    win, color = 'white', height = 0.03, pos = (0, 0), text = '')
# ==============================================================================
# Make a text file to save data ------------------------------------------------
fileName = 'test'
dataFile = open(fileName+'.txt', 'w')
dataFile.write('soundCond\ttarget\ttrial_start\tresponse_time\n')
# ==============================================================================
# Keep track of time -----------------------------------------------------------
globalClock = core.Clock()
respClock = core.Clock()
# ==============================================================================
# Experimental procedure -------------------------------------------------------
# Trial loop
for t in range(len(tone_order)):
    l_trial_start = globalClock.getTime()
    #set stimuli according to condition
    if targets[t] == 0:
        stimulus.text = '+'
    else:
        stimulus.text = 'o'
    sd.play(all_tones[tone_order[t]], fs) # Play sound for current trial
    core.wait(0.08) # adjust visual to sound delay
    stimulus.draw() # vertical bar
    win.flip() # show cross and white
    core.wait(0.1) # show cross 100 ms
    win.flip() # turn visual stuff off
    core.wait(0.26) # adjust ITI
    l_sound = tone_order[t]
    l_target = targets[t]
    key = event.getKeys(keyList = ['space'], timeStamped = globalClock)
    l_response_time = response_check(key)[0]
    # Save data to file
    #'soundCond\ttarget\ttrial_start\tresponse_time\n'
    dataFile.write('%i\t%i\t%f\t%s\n' %(
        l_sound, l_target, l_trial_start, l_response_time))
dataFile.close()
Your second code example shows that you are using PsychoPy.
Why are you not using its audio capabilities?
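At its simplest, that could look something like the sketch below (not the poster's full paradigm; it assumes a reasonably recent PsychoPy, where the sounddevice backend can be selected in the preferences):
from psychopy import core, event, sound, visual

win = visual.Window([800, 600], units='height', color='gray')
stimulus = visual.TextStim(win, text='+', color='white', height=0.03)
beep = sound.Sound(value=500, secs=0.1)  # 500 Hz, 100 ms tone, created once before the trial loop

globalClock = core.Clock()
for t in range(5):
    l_trial_start = globalClock.getTime()
    stimulus.draw()
    beep.play()     # non-blocking; playback is handled by the audio backend
    win.flip()      # visual onset roughly simultaneous with the sound onset
    core.wait(0.5)  # trial duration
    key = event.getKeys(keyList=['space'], timeStamped=globalClock)
win.close()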
Incidentally, the sounddevice module can be used as an audio backend in PsychoPy, and internally it uses an sd.OutputStream with a callback function, just as I suggested.
But if you use PsychoPy's audio functions, you don't really have to worry about that.
BTW, the PsychoPy community is really helpful, check out their forum: https://discourse.psychopy.org/.
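If you do want to keep driving sounddevice yourself, the OutputStream-plus-callback approach mentioned above looks roughly like this (a sketch, assuming a mono float32 tone at the stream's sample rate):
import numpy as np
import sounddevice as sd

fs = 44100
tone = 0.1 * np.sin(2 * np.pi * 500 * np.arange(int(0.1 * fs)) / fs)  # 100 ms, 500 Hz
tone = tone.reshape(-1, 1).astype('float32')
pos = 0

def callback(outdata, frames, time, status):
    # Feed the output buffer from `tone`; zero-pad once the tone is exhausted.
    global pos
    chunk = tone[pos:pos + frames]
    outdata[:len(chunk)] = chunk
    outdata[len(chunk):] = 0
    pos += frames

stream = sd.OutputStream(samplerate=fs, channels=1, callback=callback)
stream.start()  # the callback now runs on the audio thread, without blocking the experiment loop
sd.sleep(200)
stream.stop()
stream.close()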
Regarding this comment:
Our program is extremely simple
Playing audio with exact timing is never simple.
There are big platform-dependent differences and you should always measure if you want to make sure the timing is right.
I need to do some real time audio signal processing with Python, i.e. analyze the signal in the frequency domain by framing, windowing and computing the FFT, and then apply some filters depending on the analysis results. I've been using PyAudio for audio acquisition and PyQtGraph for waveform and FFT visualization, as suggested in this and this code.
For now my code only detects the N power-spectrum bins with the highest values and highlights them by drawing vertical lines on the FFT plot. Here is what it looks like:
import pyaudio
import numpy as np
from scipy.signal import argrelextrema
import pyqtgraph as pg
from pyqtgraph.Qt import QtCore
##Some settings
FORMAT = pyaudio.paFloat32
CHANNELS = 1
FS = 44100
CHUNK = 256
NFFT = 2048
OVERLAP = 0.5
PLOTSIZE = 32*CHUNK
N = 4
freq_range = np.linspace(10, FS/2, NFFT//2 + 1)
df = FS/NFFT
HOP = NFFT*(1-OVERLAP)
##Some preliminary functions
def db_spectrum(data) : #computes positive frequency power spectrum
    fft_input = data*np.hanning(NFFT)
    spectrum = abs(np.fft.rfft(fft_input))/NFFT
    spectrum[1:-1] *= 2
    return 20*np.log10(spectrum)
def highest_peaks(spectrum) : #finds peaks (local maxima) and picks the N highest ones
    peak_indices = argrelextrema(spectrum, np.greater)[0]
    peak_values = spectrum[peak_indices]
    highest_peak_indices = np.argpartition(peak_values, -N)[-N:]
    return peak_indices[(highest_peak_indices)]
def detection_plot(peaks) : #formats data for vertical line plotting
    x = []
    y = []
    for peak in peaks :
        x.append(peak*df)
        x.append(peak*df)
        y.append(-200)
        y.append(0)
    return x, y
##Main class containing loop and UI
class SpectrumAnalyzer(pg.GraphicsWindow) :
    def __init__(self) :
        super().__init__()
        self.initUI()
        self.initTimer()
        self.initData()
        self.pa = pyaudio.PyAudio()
        self.stream = self.pa.open(format = FORMAT,
                                   channels = CHANNELS,
                                   rate = FS,
                                   input = True,
                                   output = True,
                                   frames_per_buffer = CHUNK)

    def initUI(self) :
        self.setWindowTitle("Microphone Audio Data")
        audio_plot = self.addPlot(title="Waveform")
        audio_plot.showGrid(True, True)
        audio_plot.addLegend()
        audio_plot.setYRange(-1,1)
        self.time_curve = audio_plot.plot()
        self.nextRow()
        fft_plot = self.addPlot(title="FFT")
        fft_plot.showGrid(True, True)
        fft_plot.addLegend()
        fft_plot.setLogMode(True, False)
        fft_plot.setYRange(-140,0) #may be adjusted depending on your input
        self.fft_curve = fft_plot.plot(pen='y')
        self.detection = fft_plot.plot(pen='r')

    def initTimer(self) :
        self.timer = QtCore.QTimer()
        self.timer.timeout.connect(self.update)
        self.timer.start(0)

    def initData(self) :
        self.waveform_data = np.zeros(PLOTSIZE)
        self.fft_data = np.zeros(NFFT)
        self.fft_counter = 0

    def closeEvent(self, event) :
        self.timer.stop()
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    def update(self) :
        raw_data = self.stream.read(CHUNK)
        self.stream.write(raw_data, CHUNK)
        self.fft_counter += CHUNK
        sample_data = np.fromstring(raw_data, dtype=np.float32)
        self.waveform_data = np.concatenate([self.waveform_data, sample_data]) #update plot data
        self.waveform_data = self.waveform_data[CHUNK:]
        self.time_curve.setData(self.waveform_data)
        self.fft_data = np.concatenate([self.fft_data, sample_data]) #update fft input
        self.fft_data = self.fft_data[CHUNK:]
        if self.fft_counter == HOP :
            spectrum = db_spectrum(self.fft_data)
            peaks = highest_peaks(spectrum)
            x, y = detection_plot(peaks)
            self.detection.setData(x, y, connect = 'pairs')
            self.fft_curve.setData(freq_range, spectrum)
            self.fft_counter = 0

if __name__ == '__main__':
    spec = SpectrumAnalyzer()
The code works fine, but I still have some questions:
I understand that by calling timer.start() with 0 as the argument, the update method is called again as soon as possible. How does my script know that the update method needs to be called only when the next audio chunk is received, and not before?
In the code examples I linked above, the closeEvent method is not modified to stop the timer and the stream when the PyQtGraph window is closed. What used to happen for me is that even after closing the window, the update method kept being called and my audio kept being recorded. Was that normal behavior?
I've read that when using PyQt GUIs, I should always start by creating a QtGui.QApplication instance and calling its exec method (see the sketch after this list). Why is that, and why does my code work even though I'm not doing it?
In the future I will need to implement analysis that is much more demanding than just detecting the N highest peaks. Given the current structure of my code, if I add such analysis in the update method, I understand that the CPU will have to compute everything before the next audio chunk is received, whereas it could instead wait until the next FFT input is ready. Since the hop size is larger than the chunk size, this would give the CPU more time to compute everything. How can I achieve this? Multi-threading?
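For reference, the startup pattern mentioned in the third point is the usual Qt boilerplate, roughly the following (a sketch using pyqtgraph's Qt wrapper and reusing the SpectrumAnalyzer class from above):
import sys
from pyqtgraph.Qt import QtGui

if __name__ == '__main__':
    app = QtGui.QApplication(sys.argv)  # create the application object before any widgets
    spec = SpectrumAnalyzer()           # the window is a widget and needs the application to exist
    sys.exit(app.exec_())               # hand control to the Qt event loop until the window closes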
I'm trying to record audio and simultaneously print the amplitude of the recorded signal. So I'm saving all the data from stream.read. But when I try to print it, I get a string of bytes rather than integers. I would like to know how to convert these bytes in order to get the amplitude.
This is my code:
import pyaudio
import wave
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data) # 2 bytes(16 bits) per channel
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
for data in frames:
    print(data)
And this is what I obtain :
����# ����
!$
�� ���� ��������������������������
������ �� ��
��
�� ������ ����������������������������
��
����
������������������������������������������������������������������
���������
���������������
% �� ��(��)����,����.����%����#��
�� �� �����������������������
You can certainly take inspiration from the following code:
#!/usr/bin/python
# open a microphone in pyAudio and listen for taps
import pyaudio
import struct
import math
INITIAL_TAP_THRESHOLD = 0.010
FORMAT = pyaudio.paInt16
SHORT_NORMALIZE = (1.0/32768.0)
CHANNELS = 2
RATE = 44100
INPUT_BLOCK_TIME = 0.05
INPUT_FRAMES_PER_BLOCK = int(RATE*INPUT_BLOCK_TIME)
# if we get this many noisy blocks in a row, increase the threshold
OVERSENSITIVE = 15.0/INPUT_BLOCK_TIME
# if we get this many quiet blocks in a row, decrease the threshold
UNDERSENSITIVE = 120.0/INPUT_BLOCK_TIME
# if the noise was longer than this many blocks, it's not a 'tap'
MAX_TAP_BLOCKS = 0.15/INPUT_BLOCK_TIME
def get_rms( block ):
    # RMS amplitude is defined as the square root of the
    # mean over time of the square of the amplitude.
    # so we need to convert this string of bytes into
    # a string of 16-bit samples...
    # we will get one short out for each
    # two chars in the string.
    count = len(block)/2
    format = "%dh"%(count)
    shorts = struct.unpack( format, block )
    # iterate over the block.
    sum_squares = 0.0
    for sample in shorts:
        # sample is a signed short in +/- 32768.
        # normalize it to 1.0
        n = sample * SHORT_NORMALIZE
        sum_squares += n*n
    return math.sqrt( sum_squares / count )
class TapTester(object):
    def __init__(self):
        self.pa = pyaudio.PyAudio()
        self.stream = self.open_mic_stream()
        self.tap_threshold = INITIAL_TAP_THRESHOLD
        self.noisycount = MAX_TAP_BLOCKS+1
        self.quietcount = 0
        self.errorcount = 0

    def stop(self):
        self.stream.close()

    def find_input_device(self):
        device_index = None
        for i in range( self.pa.get_device_count() ):
            devinfo = self.pa.get_device_info_by_index(i)
            print( "Device %d: %s"%(i,devinfo["name"]) )
            for keyword in ["mic","input"]:
                if keyword in devinfo["name"].lower():
                    print( "Found an input: device %d - %s"% (i,devinfo["name"]) )
                    device_index = i
                    return device_index
        if device_index == None:
            print( "No preferred input found; using default input device." )
        return device_index

    def open_mic_stream( self ):
        device_index = self.find_input_device()
        stream = self.pa.open( format = FORMAT,
                               channels = CHANNELS,
                               rate = RATE,
                               input = True,
                               input_device_index = device_index,
                               frames_per_buffer = INPUT_FRAMES_PER_BLOCK)
        return stream

    def tapDetected(self):
        print "Tap!"

    def listen(self):
        try:
            block = self.stream.read(INPUT_FRAMES_PER_BLOCK)
        except IOError, e:
            # dammit.
            self.errorcount += 1
            print( "(%d) Error recording: %s"%(self.errorcount,e) )
            self.noisycount = 1
            return
        amplitude = get_rms( block )
        if amplitude > self.tap_threshold:
            # noisy block
            self.quietcount = 0
            self.noisycount += 1
            if self.noisycount > OVERSENSITIVE:
                # turn down the sensitivity
                self.tap_threshold *= 1.1
        else:
            # quiet block.
            if 1 <= self.noisycount <= MAX_TAP_BLOCKS:
                self.tapDetected()
            self.noisycount = 0
            self.quietcount += 1
            if self.quietcount > UNDERSENSITIVE:
                # turn up the sensitivity
                self.tap_threshold *= 0.9

if __name__ == "__main__":
    tt = TapTester()
    for i in range(1000):
        tt.listen()
It comes from this post: Detect tap with pyaudio from live mic.
You can easily adapt it to put the RMS values in a table and plot the table.
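For example, something along these lines (a sketch that reuses get_rms() and TapTester from the code above and assumes matplotlib is installed):
import matplotlib.pyplot as plt

tt = TapTester()
rms_values = []
for i in range(200):  # roughly 10 s at 0.05 s per block
    block = tt.stream.read(INPUT_FRAMES_PER_BLOCK)
    rms_values.append(get_rms(block))

plt.plot(rms_values)
plt.xlabel("block (0.05 s each)")
plt.ylabel("RMS amplitude (normalized)")
plt.show()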
PyAudio is giving you binary-encoded audio frames as bytes in a string. See the answer to this question for how to print a human-readable representation of your frames:
Get an audio sample as float number from pyaudio-stream
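In short, each chunk has to be decoded from raw bytes into sample values before printing it makes sense, for example (a sketch assuming the paInt16, mono settings from the question):
import numpy as np

data = stream.read(CHUNK)                      # raw bytes, 2 bytes per 16-bit sample
samples = np.frombuffer(data, dtype=np.int16)  # signed sample values in [-32768, 32767]
print(samples.min(), samples.max())            # rough look at the amplitude range of the chunk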
I guess the question is old and I stumbled over it looking for other answers, but in my project I use something like this.
#Lets assume the constants are defined somewhere
import struct
import pyaudio
import numpy as np
self.input = pyaudio.PyAudio().open(
    format=pyaudio.paInt16,
    channels=1,
    rate=44100,
    input=True,
    output=False,
    frames_per_buffer=1024,
)
wf_data = self.input.read(self.CHUNK)
wf_data = struct.unpack(str(self.CHUNK) + 'h', wf_data)
wf_data = np.array(wf_data)
The paInt16 format and the 'h' correspond. You can figure out which letter matches your PyAudio format here:
https://docs.python.org/3/library/struct.html
Credit goes to:
https://www.youtube.com/channel/UC2W0aQEPNpU6XrkFCYifRFQ
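For reference, the common pairings are roughly these (taken from the struct documentation linked above; the letter's byte width has to match the sample width PyAudio delivers):
import pyaudio

STRUCT_LETTER = {
    pyaudio.paInt16:   'h',  # signed 16-bit
    pyaudio.paInt32:   'i',  # signed 32-bit
    pyaudio.paFloat32: 'f',  # 32-bit float
    pyaudio.paInt8:    'b',  # signed 8-bit
    pyaudio.paUInt8:   'B',  # unsigned 8-bit
}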
I think you could do this
data = stream.read(CHUNK)
for each in data:
    print(each)
When dealing with audio you probably want the RMS (root mean square) value of the signal's buffer. I believe it offers a better 'view' of the overall power in an audio signal.
The Python standard library has a module called audioop, and the module has a function called rms.
import pyaudio
import time
import audioop
def get_rms():
    # Creates a generator that can iterate rms values
    CHUNK = 8
    WIDTH = 2
    CHANNELS = 1
    RATE = 44100

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=p.get_format_from_width(WIDTH),
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        output=False,
                        frames_per_buffer=CHUNK)
        # wait a second to allow the stream to be setup
        time.sleep(1)
        while True:
            # read the data
            data = stream.read(CHUNK, exception_on_overflow = False)
            rms = audioop.rms(data, WIDTH)  # sample width in bytes must match the 16-bit format
            yield rms
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
You can use the function like this
rms_values = get_rms()
for rms in rms_values:
    print(rms)
I need to capture audio clips as WAV files that I can then pass to another bit of Python for processing. The problem is that I need to determine when there is audio present, then record it, stop when it goes silent, and then pass that file to the processing module.
I'm thinking it should be possible with the wave module to detect when there is pure silence and discard it; then, as soon as something other than silence is detected, start recording, and when the line goes silent again, stop the recording.
I just can't quite get my head around it. Can anyone get me started with a basic example?
As a follow up to Nick Fortescue's answer, here's a more complete example of how to record from the microphone and process the resulting data:
from sys import byteorder
from array import array
from struct import pack
import pyaudio
import wave
THRESHOLD = 500
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 44100
def is_silent(snd_data):
    "Returns 'True' if below the 'silent' threshold"
    return max(snd_data) < THRESHOLD

def normalize(snd_data):
    "Average the volume out"
    MAXIMUM = 16384
    times = float(MAXIMUM)/max(abs(i) for i in snd_data)

    r = array('h')
    for i in snd_data:
        r.append(int(i*times))
    return r

def trim(snd_data):
    "Trim the blank spots at the start and end"
    def _trim(snd_data):
        snd_started = False
        r = array('h')

        for i in snd_data:
            if not snd_started and abs(i)>THRESHOLD:
                snd_started = True
                r.append(i)
            elif snd_started:
                r.append(i)
        return r

    # Trim to the left
    snd_data = _trim(snd_data)

    # Trim to the right
    snd_data.reverse()
    snd_data = _trim(snd_data)
    snd_data.reverse()
    return snd_data

def add_silence(snd_data, seconds):
    "Add silence to the start and end of 'snd_data' of length 'seconds' (float)"
    silence = [0] * int(seconds * RATE)
    r = array('h', silence)
    r.extend(snd_data)
    r.extend(silence)
    return r

def record():
    """
    Record a word or words from the microphone and
    return the data as an array of signed shorts.

    Normalizes the audio, trims silence from the
    start and end, and pads with 0.5 seconds of
    blank sound to make sure VLC et al can play
    it without getting chopped off.
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=1, rate=RATE,
                    input=True, output=True,
                    frames_per_buffer=CHUNK_SIZE)

    num_silent = 0
    snd_started = False

    r = array('h')

    while 1:
        # little endian, signed short
        snd_data = array('h', stream.read(CHUNK_SIZE))
        if byteorder == 'big':
            snd_data.byteswap()
        r.extend(snd_data)

        silent = is_silent(snd_data)

        if silent and snd_started:
            num_silent += 1
        elif not silent and not snd_started:
            snd_started = True

        if snd_started and num_silent > 30:
            break

    sample_width = p.get_sample_size(FORMAT)
    stream.stop_stream()
    stream.close()
    p.terminate()

    r = normalize(r)
    r = trim(r)
    r = add_silence(r, 0.5)
    return sample_width, r

def record_to_file(path):
    "Records from the microphone and outputs the resulting data to 'path'"
    sample_width, data = record()
    data = pack('<' + ('h'*len(data)), *data)

    wf = wave.open(path, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()

if __name__ == '__main__':
    print("please speak a word into the microphone")
    record_to_file('demo.wav')
    print("done - result written to demo.wav")
I believe the WAVE module does not support recording, just processing existing files. You might want to look at PyAudio for actually recording.
WAV is about the world's simplest file format. In paInt16 you just get a signed integer representing a level, and closer to 0 is quieter. I can't remember if WAV files are high byte first or low byte, but something like this ought to work (sorry, I'm not really a Python programmer):
from array import array
# you'll probably want to experiment on threshold
# depends how noisy the signal
threshold = 10
max_value = 0
as_ints = array('h', data)
max_value = max(as_ints)
if max_value > threshold:
    # not silence
PyAudio code for recording kept for reference:
import pyaudio
import sys
chunk = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 5
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                output=True,
                frames_per_buffer=chunk)
print "* recording"
for i in range(0, 44100 / chunk * RECORD_SECONDS):
    data = stream.read(chunk)
    # check for silence here by comparing the level with 0 (or some threshold) for
    # the contents of data.
    # then write data or not to a file
print "* done"
stream.stop_stream()
stream.close()
p.terminate()
Thanks to cryo for the improved version that I based my tested code below on:
#Instead of adding silence at the start and end of the recording (values=0) I add the original audio. This makes the audio sound more natural as the volume is >0. See trim()
#I also fixed an issue with the previous code - the accumulated silence counter needs to be cleared once recording is resumed.
from array import array
from struct import pack
from sys import byteorder
import copy
import pyaudio
import wave
THRESHOLD = 500 # audio levels not normalised.
CHUNK_SIZE = 1024
SILENT_CHUNKS = 3 * 44100 / 1024 # about 3sec
FORMAT = pyaudio.paInt16
FRAME_MAX_VALUE = 2 ** 15 - 1
NORMALIZE_MINUS_ONE_dB = 10 ** (-1.0 / 20)
RATE = 44100
CHANNELS = 1
TRIM_APPEND = RATE / 4
def is_silent(data_chunk):
    """Returns 'True' if below the 'silent' threshold"""
    return max(data_chunk) < THRESHOLD

def normalize(data_all):
    """Amplify the volume out to max -1dB"""
    # MAXIMUM = 16384
    normalize_factor = (float(NORMALIZE_MINUS_ONE_dB * FRAME_MAX_VALUE)
                        / max(abs(i) for i in data_all))

    r = array('h')
    for i in data_all:
        r.append(int(i * normalize_factor))
    return r

def trim(data_all):
    _from = 0
    _to = len(data_all) - 1
    for i, b in enumerate(data_all):
        if abs(b) > THRESHOLD:
            _from = max(0, i - TRIM_APPEND)
            break

    for i, b in enumerate(reversed(data_all)):
        if abs(b) > THRESHOLD:
            _to = min(len(data_all) - 1, len(data_all) - 1 - i + TRIM_APPEND)
            break

    return copy.deepcopy(data_all[_from:(_to + 1)])

def record():
    """Record a word or words from the microphone and
    return the data as an array of signed shorts."""

    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, output=True, frames_per_buffer=CHUNK_SIZE)

    silent_chunks = 0
    audio_started = False
    data_all = array('h')

    while True:
        # little endian, signed short
        data_chunk = array('h', stream.read(CHUNK_SIZE))
        if byteorder == 'big':
            data_chunk.byteswap()
        data_all.extend(data_chunk)

        silent = is_silent(data_chunk)

        if audio_started:
            if silent:
                silent_chunks += 1
                if silent_chunks > SILENT_CHUNKS:
                    break
            else:
                silent_chunks = 0
        elif not silent:
            audio_started = True

    sample_width = p.get_sample_size(FORMAT)
    stream.stop_stream()
    stream.close()
    p.terminate()

    data_all = trim(data_all)  # we trim before normalize as threshhold applies to un-normalized wave (as well as is_silent() function)
    data_all = normalize(data_all)
    return sample_width, data_all

def record_to_file(path):
    "Records from the microphone and outputs the resulting data to 'path'"
    sample_width, data = record()
    data = pack('<' + ('h' * len(data)), *data)

    wave_file = wave.open(path, 'wb')
    wave_file.setnchannels(CHANNELS)
    wave_file.setsampwidth(sample_width)
    wave_file.setframerate(RATE)
    wave_file.writeframes(data)
    wave_file.close()

if __name__ == '__main__':
    print("Wait in silence to begin recording; wait in silence to terminate")
    record_to_file('demo.wav')
    print("done - result written to demo.wav")
import pyaudio
import wave
from array import array
FORMAT=pyaudio.paInt16
CHANNELS=2
RATE=44100
CHUNK=1024
RECORD_SECONDS=15
FILE_NAME="RECORDING.wav"
audio=pyaudio.PyAudio() #instantiate the pyaudio
#recording prerequisites
stream=audio.open(format=FORMAT,channels=CHANNELS,
                  rate=RATE,
                  input=True,
                  frames_per_buffer=CHUNK)
#starting recording
frames=[]
for i in range(0,int(RATE/CHUNK*RECORD_SECONDS)):
    data=stream.read(CHUNK)
    data_chunk=array('h',data)
    vol=max(data_chunk)
    if(vol>=500):
        print("something said")
        frames.append(data)
    else:
        print("nothing")
    print("\n")
#end of recording
stream.stop_stream()
stream.close()
audio.terminate()
#writing to file
wavfile=wave.open(FILE_NAME,'wb')
wavfile.setnchannels(CHANNELS)
wavfile.setsampwidth(audio.get_sample_size(FORMAT))
wavfile.setframerate(RATE)
wavfile.writeframes(b''.join(frames))#append frames recorded to file
wavfile.close()
I think this will help. It is a simple script which will check whether there is silence or not. If silence is detected it will not record; otherwise it will record.
The pyaudio website has many examples that are pretty short and clear:
http://people.csail.mit.edu/hubert/pyaudio/
Update 14th of December 2019 - Main example from the above linked website from 2017:
"""PyAudio Example: Play a WAVE file."""
import pyaudio
import wave
import sys
CHUNK = 1024
if len(sys.argv) < 2:
    print("Plays a wave file.\n\nUsage: %s filename.wav" % sys.argv[0])
    sys.exit(-1)
wf = wave.open(sys.argv[1], 'rb')
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True)
data = wf.readframes(CHUNK)
while data != b'':  # wf.readframes() returns bytes, so compare against b'' (empty at end of file)
    stream.write(data)
    data = wf.readframes(CHUNK)
stream.stop_stream()
stream.close()
p.terminate()
You might want to look at Csound, also. It has several APIs, including Python. It might be able to interact with an A-D interface and gather sound samples.