I am trying to run a colab file training openAI's jukebox, however when I try to run the function code which loads the audio, I am getting an error:
File "/content/jukebox/jukebox/data/files_dataset.py", line 82, in get_song_chunk
data, sr = load_audio(filename, sr=self.sr, offset=offset, duration=self.sample_length)
File "/content/jukebox/jukebox/utils/io.py", line 48, in load_audio
frame = frame.to_ndarray(format='fltp') # Convert to floats and not int16
AttributeError: 'list' object has no attribute 'to_ndarray'
It seems to be interpreting the frame input as a list, which when printed looks like this:
[<av.AudioFrame 0, pts=None, 778 samples at 22050Hz, stereo, fltp at
0x7fd03dd64150>]
When I try to change to frame = resampler.resample(frame) I get this error:
TypeError: 'av.audio.frame.AudioFrame' object cannot be interpreted as
an integer
I don't really know much about audio files so i'm not sure how to debug and would appreciate help here.
the full code to load the audio is below.
def load_audio(file, sr, offset, duration, resample=True, approx=False, time_base='samples', check_duration=True):
if time_base == 'sec':
offset = offset * sr
duration = duration * sr
# Loads at target sr, stereo channels, seeks from offset, and stops after duration
container = av.open(file)
audio = container.streams.get(audio=0)[0] # Only first audio stream
audio_duration = audio.duration * float(audio.time_base)
if approx:
if offset + duration > audio_duration*sr:
# Move back one window. Cap at audio_duration
offset = np.min(audio_duration*sr - duration, offset - duration)
else:
if check_duration:
assert offset + duration <= audio_duration*sr, f'End {offset + duration} beyond duration {audio_duration*sr}'
if resample:
resampler = av.AudioResampler(format='fltp',layout='stereo', rate=sr)
else:
assert sr == audio.sample_rate
offset = int(offset / sr / float(audio.time_base)) #int(offset / float(audio.time_base)) # Use units of time_base for seeking
duration = int(duration) #duration = int(duration * sr) # Use units of time_out ie 1/sr for returning
sig = np.zeros((2, duration), dtype=np.float32)
container.seek(offset, stream=audio)
total_read = 0
for frame in container.decode(audio=0): # Only first audio stream
if resample:
frame.pts = None
frame = resampler.resample(frame)
frame = frame.to_ndarray(format='fltp') # Convert to floats and not int16
read = frame.shape[-1]
if total_read + read > duration:
read = duration - total_read
sig[:, total_read:total_read + read] = frame[:, :read]
total_read += read
if total_read == duration:
break
assert total_read <= duration, f'Expected {duration} frames, got {total_read}'
return sig, sr
If your variable frame is interpreted as a list, you could replace frame = resampler.resample(frame) with frame = resampler.resample(frame)[0]. Your code ran without errors once I made this edit.
Try replacing frame = frame.to_ndarray(format='fltp') by a direct assignation of the variable frame:
import numpy as np
#frame = frame.to_ndarray(format='fltp') #Original line
frame = np.ndarray(frame)
If you want it to be a specific data type, you can change the dtype argument of the ndarray function:
frame = np.ndarray(frame, dtype=np.float32)
Try: frame = frame[0].to_ndarray(format='fltp')
Related
I am able to read the audio but I am getting an error message while passing it to VAD(Voice Activity Detector). I think the error message is because the frames is in bytes, when feeding it to vad.is_speech(frame, sample_rate), should this frame be in bytes?
Here is the code below:
frame_duration_ms=10
duration_in_ms = (frame_duration_ms / 1000) #duration in 10ms
frame_size = int(sample_rate * duration_in_ms) #frame size of 160
frame_bytes = frame_size * 2
def frame_generator(buffer, frame_bytes):
# repeatedly store 320 length array to the frame_stored when the frame_bytes is less than the size of the buffer
while offset+frame_bytes < len(buffer):
frame_stored = buffer[offset : offset+frame_bytes]
offset = offset + frame_bytes
return frame_stored
num_padding_frames = int(padding_duration_ms / frame_duration_ms)
# use deque for the sliding window
ring_buffer = deque(maxlen=num_padding_frames)
# we have two states TRIGGERED and NOTTRIGGERED state
triggered = True #NOTTRIGGERED state
frames = frame_generator(buffer, frame_bytes)
speech_frame = []
for frame in frames:
is_speech = vad.is_speech(frame, sample_rate)
Here is the error message:
TypeError Traceback (most recent call
last) in
16 speech_frame = []
17 for frame in frames:
---> 18 is_speech = vad.is_speech(frame, sample_rate)
19 #print(frames)
C:\Program Files\Python38\lib\site-packages\webrtcvad.py in
is_speech(self, buf, sample_rate, length)
20
21 def is_speech(self, buf, sample_rate, length=None):
---> 22 length = length or int(len(buf) / 2)
23 if length * 2 > len(buf):
24 raise IndexError(
TypeError: object of type 'int' has no len()
I have solved it, you know vad.is_speech(buf=frame, sample_rate), it takes the buf and calculates it length, but an integer value does not posses the len() attributes in python.
This throws an error for example:
num = 1
print(len(num))
Use this instead:
data = [1,2,3,4]
print(len(data))
So here is the correction to the code below:
frame_duration_ms=10
duration_in_ms = (frame_duration_ms / 1000) #duration in 10ms
frame_size = int(sample_rate * duration_in_ms) #frame size of 160
frame_bytes = frame_size * 2
values = []
def frame_generator(buffer, frame_bytes):
# repeatedly store 320 length array to the frame_stored when the frame_bytes is less than the size of the buffer
while offset+frame_bytes < len(buffer):
frame_stored = buffer[offset : offset+frame_bytes]
offset = offset + frame_bytes
values.append(frame_stored)
return values
num_padding_frames = int(padding_duration_ms / frame_duration_ms)
# use deque for the sliding window
ring_buffer = deque(maxlen=num_padding_frames)
# we have two states TRIGGERED and NOTTRIGGERED state
triggered = True #NOTTRIGGERED state
frames = frame_generator(buffer, frame_bytes)
frame = []
for frame in frames:
is_speech = vad.is_speech(frame, sample_rate)
import wave
import webrtcvad
# Initialize a vad object
audioFile = wave.open('ENG_M.wav')
framesAudio = audioFile.readframes(800)
#print(fraud.frames)
vad = webrtcvad.Vad()
# Run the VAD on 10 ms of silence and 16000 sampling rate
sample_rate = 16000
frame_duration = 10 # in ms
for f in framesAudio :
# Detecting speech
final_frame = f.to_bytes(2,"big")* int(sample_rate * frame_duration / 1000)
print(f'Contains speech: {vad.is_speech(final_frame, sample_rate)}')
I have been trying to understand the error reported when I run an example supplied with some hardware i purchased.
I have tried googling around but every answer I get is a bit beyond my comprehension. I think what is going wrong is that the script, or one of the imported scripts is written for Python 2 and i am trying to run it in python 3.
When I try and run it in Python 2 i get a whole host of other problems so I have been trying to make it work with 3.
The hardware I purchased is the Enviro+ sensor suite for the raspberry pi sold by Pimoroni
Hardware Link
Github Library
Pimoroni Tutorial
#!/usr/bin/env python
import time
import colorsys
import os
import sys
import ST7735
import ltr559
from bme280 import BME280
from pms5003 import PMS5003
from enviroplus import gas
from subprocess import PIPE, Popen
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
print("""all-in-one.py - Displays readings from all of Enviro plus' sensors
Press Ctrl+C to exit!
""")
# BME280 temperature/pressure/humidity sensor
bme280 = BME280()
# PMS5003 particulate sensor
pms5003 = PMS5003()
# Create ST7735 LCD display class
st7735 = ST7735.ST7735(
port=0,
cs=1,
dc=9,
backlight=12,
rotation=270,
spi_speed_hz=10000000
)
# Initialize display
st7735.begin()
WIDTH = st7735.width
HEIGHT = st7735.height
# Set up canvas and font
img = Image.new('RGB', (WIDTH, HEIGHT), color=(0, 0, 0))
draw = ImageDraw.Draw(img)
path = os.path.dirname(os.path.realpath(__file__))
font = ImageFont.truetype(path + "/fonts/Asap/Asap-Bold.ttf", 20)
message = ""
# The position of the top bar
top_pos = 25
# Displays data and text on the 0.96" LCD
def display_text(variable, data, unit):
# Maintain length of list
values[variable] = values[variable][1:] + [data]
# Scale the values for the variable between 0 and 1
colours = [(v - min(values[variable]) + 1) / (max(values[variable])
- min(values[variable]) + 1) for v in values[variable]]
# Format the variable name and value
message = "{}: {:.1f} {}".format(variable[:4], data, unit)
print(message)
draw.rectangle((0, 0, WIDTH, HEIGHT), (255, 255, 255))
for i in range(len(colours)):
# Convert the values to colours from red to blue
colour = (1.0 - colours[i]) * 0.6
r, g, b = [int(x * 255.0) for x in colorsys.hsv_to_rgb(colour,
1.0, 1.0)]
# Draw a 1-pixel wide rectangle of colour
draw.rectangle((i, top_pos, i+1, HEIGHT), (r, g, b))
# Draw a line graph in black
line_y = HEIGHT - (top_pos + (colours[i] * (HEIGHT - top_pos)))\
+ top_pos
draw.rectangle((i, line_y, i+1, line_y+1), (0, 0, 0))
# Write the text at the top in black
draw.text((0, 0), message, font=font, fill=(0, 0, 0))
st7735.display(img)
# Get the temperature of the CPU for compensation
def get_cpu_temperature():
process = Popen(['vcgencmd', 'measure_temp'], stdout=PIPE)
output, _error = process.communicate()
return float(output[output.index('=') + 1:output.rindex("'")])
# Tuning factor for compensation. Decrease this number to adjust the
# temperature down, and increase to adjust up
factor = 0.8
cpu_temps = [0] * 5
delay = 0.5 # Debounce the proximity tap
mode = 0 # The starting mode
last_page = 0
light = 1
# Create a values dict to store the data
variables = ["temperature",
"pressure",
"humidity",
"light",
"oxidised",
"reduced",
"nh3",
"pm1",
"pm25",
"pm10"]
values = {}
for v in variables:
values[v] = [1] * WIDTH
# The main loop
try:
while True:
proximity = ltr559.get_proximity()
# If the proximity crosses the threshold, toggle the mode
if proximity > 1500 and time.time() - last_page > delay:
mode += 1
mode %= len(variables)
last_page = time.time()
# One mode for each variable
if mode == 0:
variable = "temperature"
unit = "C"
cpu_temp = get_cpu_temperature()
# Smooth out with some averaging to decrease jitter
cpu_temps = cpu_temps[1:] + [cpu_temp]
avg_cpu_temp = sum(cpu_temps) / float(len(cpu_temps))
raw_temp = bme280.get_temperature()
data = raw_temp - ((avg_cpu_temp - raw_temp) / factor)
display_text(variable, data, unit)
if mode == 1:
variable = "pressure"
unit = "hPa"
data = bme280.get_pressure()
display_text(variable, data, unit)
if mode == 2:
variable = "humidity"
unit = "%"
data = bme280.get_humidity()
display_text(variable, data, unit)
if mode == 3:
variable = "light"
unit = "Lux"
if proximity < 10:
data = ltr559.get_lux()
else:
data = 1
display_text(variable, data, unit)
if mode == 4:
variable = "oxidised"
unit = "kO"
data = gas.read_all()
data = data.oxidising / 1000
display_text(variable, data, unit)
if mode == 5:
variable = "reduced"
unit = "kO"
data = gas.read_all()
data = data.reducing / 1000
display_text(variable, data, unit)
if mode == 6:
variable = "nh3"
unit = "kO"
data = gas.read_all()
data = data.nh3 / 1000
display_text(variable, data, unit)
if mode == 7:
variable = "pm1"
unit = "ug/m3"
data = pms5003.read()
data = data.pm_ug_per_m3(1.0)
display_text(variable, data, unit)
if mode == 8:
variable = "pm25"
unit = "ug/m3"
data = pms5003.read()
data = data.pm_ug_per_m3(2.5)
display_text(variable, data, unit)
if mode == 9:
variable = "pm10"
unit = "ug/m3"
data = pms5003.read()
data = data.pm_ug_per_m3(10)
display_text(variable, data, unit)
# Exit cleanly
except KeyboardInterrupt:
sys.exit(0)
When I try and run the code i get the following results:
Traceback (most recent call last):
File "all-in-one.py", line 135, in <module>
cpu_temp = get_cpu_temperature()
File "all-in-one.py", line 89, in get_cpu_temperature
return float(output[output.index('=') + 1:output.rindex("'")])
TypeError: argument should be integer or bytes-like object, not 'str'
Please forgive me if I have not filled this help request out correctly - i am very new to forums (I hardly ever post in them, although i read them a lot for help), and i am also very new to Python and Linux.
Any help and support from the community would be massively appreciated - thank you in advance...
SW
According to Python 3 whitepages on subprocess.communicate(), the type of output and _error can be either strings (what you want) OR bytes. If you were getting strings back, you wouldn't have this problem, but the TypeError message you're getting is exactly what you get when you try to call index() on a bytes object with a string argument.
Demonstrably:
>>> output = "temperature = '88 C'".encode('utf-8') #this is of type bytes
>>> output
b"temperature = '88 C'"
>>> output.index('=')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: argument should be integer or bytes-like object, not 'str'
>>> output.index(ord('='))
12
So you should replace output.index('=') with output.index(ord('=')) and output.rindex("'") with output.rindex(ord("'")).
EDIT
I realized this much later, but you can circumvent using ord() by prepending your search string with a b.
output.index(b'=')
I am trying to use aubio and python for a school project, here's the goal : detect when someone emit two sounds, each with a length of 2s, and with an interval between them of max 3s. The second one need to be higher than the first one. When these conditions are met, the program send a Wake-On-Lan package (not implemented in current code).
import alsaaudio
import numpy as np
import aubio
import time
import threading
class Audio_watcher:
# constants
samplerate = 44100
win_s = 2048
hop_s = win_s // 2
framesize = hop_s
nb_samples = 20
tone_duration = 2.0
per_sampling = tone_duration / nb_samples
tone_max_interval = 3.0
tone_diff_ratio = 2
def __init__(self):
self.last_frequencies = np.zeros(Audio_watcher.nb_samples)
self.last_energies = np.zeros(Audio_watcher.nb_samples)
self.detected_tone = 0
# set up audio input
recorder = alsaaudio.PCM(type=alsaaudio.PCM_CAPTURE)
recorder.setperiodsize(Audio_watcher.framesize)
recorder.setrate(Audio_watcher.samplerate)
recorder.setformat(alsaaudio.PCM_FORMAT_FLOAT_LE)
recorder.setchannels(1)
self.recorder = recorder
pitcher = aubio.pitch("default", Audio_watcher.win_s, Audio_watcher.hop_s, Audio_watcher.samplerate)
pitcher.set_unit("Hz")
pitcher.set_silence(-40)
self.pitcher = pitcher
# A filter
f = aubio.digital_filter(7)
f.set_a_weighting(Audio_watcher.samplerate)
self.f = f
def get_audio(self):
# read and convert data from audio input
_, data = self.recorder.read()
samples = np.fromstring(data, dtype=aubio.float_type)
filtered_samples = self.f(samples)
print(filtered_samples)
# pitch and energy of current frame
freq = self.pitcher(filtered_samples)[0]
print(freq)
self.last_frequencies = np.roll(self.last_frequencies, 1)
self.last_frequencies[0] = freq
self.last_energies = np.roll(self.last_energies, 1)
self.last_energies[0] = np.sum(filtered_samples**2)/len(filtered_samples)
threading.Timer(Audio_watcher.per_sampling, self.get_audio).start()
def reset_detected_tone():
self.detected_tone = 0
def detect_tone(self):
std_last = np.std(self.last_frequencies)
if std_last <= 200 and std_last > 0:
mean_freq = np.mean(self.last_frequencies)
if self.detected_tone == 0:
self.detected_tone = mean_freq
threading.Timer(Audio_watcher.tone_max_interval, self.reset_detected_tone).start()
elif mean_freq > Audio_watcher.tone_diff_ratio * self.detected_tone:
print('wol')
threading.Timer(Audio_watcher.tone_duration, self.detect_tone).start()
aw = Audio_watcher()
aw.get_audio()
aw.detect_tone()
However with this code I get a great delay between the sounds and their detection, I think it has to do with the recorder being called only one time every 0.1s, but I can't find how to give correct parameters to aubio.
Does anyone knows how to configure the constants so it works ?
Thanks a lot !
Found out what was causing this error, I needed to put the code that sets up the audio input in the get_audio function so it renewed everytime
I'm trying to record audio and simultaneously print the amplitude of the recorded signal. So I'm saving all datas in stream.read. But when I try to print them, I have a string of bytes and no integers. I would like to know how to convert these signs in order to get amplitude.
This is my code :
import pyaudio
import wave
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
data = stream.read(CHUNK)
frames.append(data) # 2 bytes(16 bits) per channel
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
for data in frames:
print(data)
And this is what I obtain :
����# ����
!$
�� ���� ��������������������������
������ �� ��
��
�� ������ ����������������������������
��
����
������������������������������������������������������������������
���������
���������������
% �� ��(��)����,����.����%����#��
�� �� �����������������������
You can certainly inspire yourself by the following code :
#!/usr/bin/python
# open a microphone in pyAudio and listen for taps
import pyaudio
import struct
import math
INITIAL_TAP_THRESHOLD = 0.010
FORMAT = pyaudio.paInt16
SHORT_NORMALIZE = (1.0/32768.0)
CHANNELS = 2
RATE = 44100
INPUT_BLOCK_TIME = 0.05
INPUT_FRAMES_PER_BLOCK = int(RATE*INPUT_BLOCK_TIME)
# if we get this many noisy blocks in a row, increase the threshold
OVERSENSITIVE = 15.0/INPUT_BLOCK_TIME
# if we get this many quiet blocks in a row, decrease the threshold
UNDERSENSITIVE = 120.0/INPUT_BLOCK_TIME
# if the noise was longer than this many blocks, it's not a 'tap'
MAX_TAP_BLOCKS = 0.15/INPUT_BLOCK_TIME
def get_rms( block ):
# RMS amplitude is defined as the square root of the
# mean over time of the square of the amplitude.
# so we need to convert this string of bytes into
# a string of 16-bit samples...
# we will get one short out for each
# two chars in the string.
count = len(block)/2
format = "%dh"%(count)
shorts = struct.unpack( format, block )
# iterate over the block.
sum_squares = 0.0
for sample in shorts:
# sample is a signed short in +/- 32768.
# normalize it to 1.0
n = sample * SHORT_NORMALIZE
sum_squares += n*n
return math.sqrt( sum_squares / count )
class TapTester(object):
def __init__(self):
self.pa = pyaudio.PyAudio()
self.stream = self.open_mic_stream()
self.tap_threshold = INITIAL_TAP_THRESHOLD
self.noisycount = MAX_TAP_BLOCKS+1
self.quietcount = 0
self.errorcount = 0
def stop(self):
self.stream.close()
def find_input_device(self):
device_index = None
for i in range( self.pa.get_device_count() ):
devinfo = self.pa.get_device_info_by_index(i)
print( "Device %d: %s"%(i,devinfo["name"]) )
for keyword in ["mic","input"]:
if keyword in devinfo["name"].lower():
print( "Found an input: device %d - %s"% (i,devinfo["name"]) )
device_index = i
return device_index
if device_index == None:
print( "No preferred input found; using default input device." )
return device_index
def open_mic_stream( self ):
device_index = self.find_input_device()
stream = self.pa.open( format = FORMAT,
channels = CHANNELS,
rate = RATE,
input = True,
input_device_index = device_index,
frames_per_buffer = INPUT_FRAMES_PER_BLOCK)
return stream
def tapDetected(self):
print "Tap!"
def listen(self):
try:
block = self.stream.read(INPUT_FRAMES_PER_BLOCK)
except IOError, e:
# dammit.
self.errorcount += 1
print( "(%d) Error recording: %s"%(self.errorcount,e) )
self.noisycount = 1
return
amplitude = get_rms( block )
if amplitude > self.tap_threshold:
# noisy block
self.quietcount = 0
self.noisycount += 1
if self.noisycount > OVERSENSITIVE:
# turn down the sensitivity
self.tap_threshold *= 1.1
else:
# quiet block.
if 1 <= self.noisycount <= MAX_TAP_BLOCKS:
self.tapDetected()
self.noisycount = 0
self.quietcount += 1
if self.quietcount > UNDERSENSITIVE:
# turn up the sensitivity
self.tap_threshold *= 0.9
if __name__ == "__main__":
tt = TapTester()
for i in range(1000):
tt.listen()
It come from this post: [Detect tap with pyaudio from live mic
You can easyly adapt it to put the RMS in a table and plot the table.
PyAudio is giving you binary-encoded audio frames as bytes in a string. See the answer to this question for how to print a human-readable representation of your frames:
Get an audio sample as float number from pyaudio-stream
I guess the question is old and I stumpled over it looking for other answers, but in my project I use something like this.
#Lets assume the constants are defined somewhere
import struct
import pyaudio
import numpy as np
self.input = pyaudio.PyAudio().open(
format=pyaudio.paInt16,
channels=1,
rate=44100,
input=True,
output=False,
frames_per_buffer=1024,
)
wf_data = self.input.read(self.CHUNK)
wf_data = struct.unpack(str(self.CHUNK) + 'h', wf_data)
wf_data = np.array(wf_data)
the paInt16 and the 'h' correspond. You can figure out what letter matches your pyaudio format here.
https://docs.python.org/3/library/struct.html
Credit goes to:
https://www.youtube.com/channel/UC2W0aQEPNpU6XrkFCYifRFQ
I think you could do this
data = stream.read(CHUNK)
for each in data:
print(each)
When dealing with audio you probably want the RMS (root mean squared) value of the signals buffer. I believe it offers a better 'view' of the overall power in an audio signal.
The python standard library as a module called audioop the module has a function called rms.
import pyaudio
import time
import audioop
def get_rms():
# Creates a generator that can iterate rms values
CHUNK = 8
WIDTH = 2
CHANNELS = 1
RATE = 44100
p = pyaudio.PyAudio()
try:
stream = p.open(format=p.get_format_from_width(WIDTH),
channels=CHANNELS,
rate=RATE,
input=True,
output=False,
frames_per_buffer=CHUNK)
# wait a second to allow the stream to be setup
time.sleep(1)
while True:
# read the data
data = stream.read(CHUNK, exception_on_overflow = False)
rms = audioop.rms(data, 1)
yield rms_scaled
finally:
p.terminate()
stream.stop_stream()
stream.close()
You can use the function like this
rms_values = get_rms()
for rms in rms_values:
print(rms)
I'm trying to process an audio signal and divide an audio signal into N discrete samples and then I want to play that samples independently.
How can I do this using python?
import wave
import pygame
import time
def slice(infile, outfilename, start_ms, end_ms):
width = infile.getsampwidth() #Returns sample width in bytes
rate = infile.getframerate() #Returns sampling frequency
fpms = rate / 1000 # frames per ms
length = (end_ms - start_ms) * fpms
start_index = start_ms * fpms
out = wave.open(outfilename, "w")
out.setparams((infile.getnchannels(), width, rate, length, infile.getcomptype(), infile.getcompname()))
infile.rewind() #Rewind the file pointer to the beginning of the audio stream
anchor = infile.tell() #Return current file pointer position
infile.setpos(anchor + start_index) #Set the file pointer to the specified position
out.writeframes(infile.readframes(length)) #Write audio frames and make sure nframes is correct
if __name__ == "__main__":
slice(wave.open("song1.wav", "r"), "out.wav", 500, 5000)
pygame.mixer.init()
pygame.mixer.music.load("out.wav")
pygame.mixer.music.play()
while pygame.mixer.music.get_busy() == True:
continue