I'm trying to modify my first Python program. I'm trying to use this repository to do some rudimentary text-to-speech. It does fine, but I want to improve it.
From the looks of it, there is a 0.145 second delay between samples played. Not all the samples of my voice will be 0.145 seconds, however, and I want to have each sample play one after the other with no delays or skips.
import re
import wave
import pyaudio
import _thread
import time
class TextToSpeech:
    """Looks up each word of the input in a CMU-style pronouncing dictionary
    and plays the matching phoneme WAV clips, one background thread per clip,
    each staggered by a fixed 0.145 s delay (the timing problem described in
    the question).
    """

    CHUNK = 1024  # frames read from each wav file per write to the stream

    def __init__(self, words_pron_dict: str = 'cmudict-0.7b.txt'):
        # _l maps WORD -> list of phoneme symbols (stress digits stripped)
        self._l = {}
        self._load_words(words_pron_dict)

    def _load_words(self, words_pron_dict: str):
        """Parse the pronunciation dictionary, skipping ';;;' comment lines."""
        with open(words_pron_dict, 'r') as file:
            for line in file:
                if not line.startswith(';;;'):
                    # NOTE(review): maxsplit=2 yields up to three fields, so a
                    # line containing two or more spaces (the standard cmudict
                    # "WORD  PH1 PH2 ..." layout) makes this two-name
                    # unpacking raise ValueError — confirm against the actual
                    # dictionary file in use.
                    key, val = line.split(' ', 2)
                    # keep only the letters of each phoneme (drops stress digits)
                    self._l[key] = re.findall(r"[A-Z]+", val)

    def get_pronunciation(self, str_input):
        """Collect the phoneme list for str_input and schedule playback."""
        list_pron = []
        for word in re.findall(r"[\w']+", str_input.upper()):
            if word in self._l:
                list_pron += self._l[word]
        print(list_pron)
        delay = 0.0
        # One thread per clip, each sleeping `delay` before playing. Clips
        # overlap or leave gaps whenever a clip is not exactly 0.145 s long.
        for pron in list_pron:
            _thread.start_new_thread(TextToSpeech._play_audio, (pron, delay,))
            delay += 0.145

    # NOTE(review): invoked via the class (TextToSpeech._play_audio), so
    # `sound` receives the first positional argument; a @staticmethod
    # decorator would make that intent explicit.
    def _play_audio(sound, delay):
        """Sleep `delay` seconds, then play sounds/<sound>.wav to completion."""
        try:
            time.sleep(delay)
            wf = wave.open("sounds/" + sound + ".wav", 'rb')
            p = pyaudio.PyAudio()
            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                            channels=wf.getnchannels(),
                            rate=wf.getframerate(),
                            output=True)
            data = wf.readframes(TextToSpeech.CHUNK)
            while data:
                stream.write(data)
                data = wf.readframes(TextToSpeech.CHUNK)
            stream.stop_stream()
            stream.close()
            p.terminate()
        except:
            # NOTE(review): bare except silently drops missing-file and audio
            # errors, making playback failures invisible.
            pass
if __name__ == '__main__':
    # Interactive loop: speak back every phrase the user types.
    engine = TextToSpeech()
    while True:
        phrase = input('Enter a word or phrase: ')
        engine.get_pronunciation(phrase)
I've tried getting rid of the threading and delay, but there is some delay still between samples. I'm thinking that I should, instead of incrementing delay by 0.145, increment it by the length of the sample in seconds, but I've looked at the pyaudio documentation, and I have no idea how to do that.
Can someone help?
Here is a modified code that plays wav files continuously.
import re
import wave
import pyaudio
class TextToSpeech:
    """Plays back phoneme WAV clips for typed text using a CMU-style
    pronouncing dictionary, streaming every clip through one shared PyAudio
    output stream so clips follow each other without gaps.
    """

    CHUNK = 1024  # frames pushed to the output stream per write

    def __init__(self, words_pron_dict='cmudict-0.7b.txt'):
        # _l maps WORD -> list of phoneme symbols (stress digits stripped)
        self._l = {}
        self._load_words(words_pron_dict)

    def _load_words(self, words_pron_dict: str):
        """Parse the dictionary file into self._l, skipping ';;;' comments
        and blank lines."""
        with open(words_pron_dict, 'r') as file:
            for line in file:
                if line.strip() and not line.startswith(';;;'):
                    # Split the head word from the pronunciation at the FIRST
                    # space only. The previous maxsplit=2 produced three
                    # fields for standard cmudict lines ("WORD  PH1 PH2 ...")
                    # and made this two-name unpacking raise ValueError.
                    key, val = line.split(' ', 1)
                    # keep only the letters of each phoneme (drops stress digits)
                    self._l[key] = re.findall(r"[A-Z]+", val)

    def get_pronunciation(self, str_input):
        """Look up every word of str_input and play its phoneme clips
        back to back on a single output stream."""
        list_pron = []
        for word in re.findall(r"[\w']+", str_input.upper()):
            if word in self._l:
                list_pron += self._l[word]
        print(list_pron)
        # pyaudio set up.
        # This open call assumes all wave files share the same format
        # (16-bit stereo at 44.1 kHz).
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(2),
                        channels=2,
                        rate=44100,
                        output=True,
                        frames_per_buffer=self.CHUNK)
        try:
            # Play each wav file continuously: feeding one open stream avoids
            # the per-clip open/close gap that caused audible delays.
            for pron in list_pron:
                with wave.open("sounds/"+pron+".wav", 'rb') as wf:
                    data = wf.readframes(TextToSpeech.CHUNK)
                    while data:
                        stream.write(data)
                        data = wf.readframes(TextToSpeech.CHUNK)
        finally:
            # Release the audio device even if a clip file is missing/corrupt.
            stream.stop_stream()
            stream.close()
            p.terminate()
if __name__ == '__main__':
    # Prompt forever; each entry is looked up and spoken.
    speech = TextToSpeech()
    prompt = 'Enter a word or phrase: '
    while True:
        speech.get_pronunciation(input(prompt))
Related
I'm trying to catch chunks of an mp3 webstream and decoding them into PCM samples for signal processing. I tried to catch the audio via requests and io.BytesIO to save the data as .wav file.
I have to convert the mp3 data to wav data, but I don't know how. (My goal is not to record a .wav file, i am just doing this to test the algorithm.)
I found the pymedia lib, but it is very old (last commit in 2006), using python 2.7 and for me not installable.
Maybe it is possible with ffmpeg-python, but I have just seen examples using files as input and output.
Here's my code:
import requests
import io
import soundfile as sf
import struct
import wave
import numpy as np
def main():
    """Capture bytes from an MP3 webstream and dump the collected samples to
    a WAV file when the user presses Ctrl+C.

    NOTE(review): the stream bytes are never actually decoded — sf.read() is
    told the raw MP3 bytes are already float32 PCM ("RAW"/"FLOAT"), so the
    saved file is noise, not audio. An MP3 decoder is needed first (this is
    exactly what the answers below address).
    """
    stream_url = r'http://dg-wdr-http-dus-dtag-cdn.cast.addradio.de/wdr/1live/diggi/mp3/128/stream.mp3'
    r = requests.get(stream_url, stream=True)
    sample_array = []
    try:
        # Accumulate "samples" until the user interrupts with Ctrl+C.
        for block in r.iter_content(1024):
            data, samplerate = sf.read(io.BytesIO(block), format="RAW", channels=2, samplerate=44100, subtype='FLOAT',
                                       dtype='float32')
            sample_array = np.append(sample_array, data)
    except KeyboardInterrupt:
        print("...saving")
        obj = wave.open('sounds/stream1.wav', 'w')
        obj.setnchannels(1)  # mono
        obj.setsampwidth(2)  # bytes
        obj.setframerate(44100)
        data_max = np.nanmax(abs(sample_array))
        # fill WAV with samples from sample_array
        for sample in sample_array:
            # NOTE(review): np.isnan returns numpy.bool_, which is never the
            # `True` singleton, so this `is True` test can never pass; NaN
            # samples fall through and are caught as ValueError below (int()
            # of NaN raises) and written as 1 instead of being skipped.
            if (np.isnan(sample) or np.isnan(32760 * sample / data_max)) is True:
                continue
            try:
                value = int(32760 * sample / data_max)  # normalization INT16
            except ValueError:
                value = 1
            finally:
                # pack as little-endian signed 16-bit and append to the WAV
                data = struct.pack('<h', value)
                obj.writeframesraw(data)
        obj.close()
        print("end")


if __name__ == '__main__':
    main()
Do you have an idea how to handle this problem?
You are missing the decoding of mp3 stream. You are just saving mp3 file as wav.
You first need to decode mp3 audio. Which will give you PCM samples + audio info.
With the help from Irmen and his "miniaudio" and "synthesizer" library, I could solve the problem.
The problem was, that most radio webstreams uses the ICECAST protocol, which includes interleaved metadata information, so you can't decode it directly.
With the example script https://github.com/irmen/synthesizer/blob/master/examples/internetradio.py as template, I could write a script, which records a webstream until KeyboardInterrupt and saves it as a .wav file.
Here's the main part I edited:
...
def _audio_playback(self, pcm_stream):
    """Accumulate decoded PCM frames from pcm_stream until self.decode_flag
    is cleared, then write the whole capture to self.file_location as a WAV.

    NOTE(review): `Output`, `Sample`, `wavf` and `np` are defined elsewhere
    in the original script (synthesizer / scipy.io.wavfile / numpy) — not
    visible in this excerpt.
    """
    sample_array = None
    with Output(mixing="sequential", frames_per_chunk=44100 // 4) as output:
        print("begin recording")
        while self.decode_flag:
            try:
                # 44100 * 2 * 2 // 20 bytes = 2205 stereo 16-bit frames,
                # i.e. roughly 50 ms of audio per read
                audio = pcm_stream.read(44100 * 2 * 2 // 20)
                if not audio:
                    break
            except (IOError, ValueError):
                break
            else:
                # presumably (data, channels, rate, sample width) — confirm
                # against the synthesizer library's Sample.from_raw_frames
                sample = Sample.from_raw_frames(audio, 2, 44100, 2)
                if sample_array is None:
                    sample_array = sample.get_frames_numpy_float()
                else:
                    # grow the capture buffer; np.append copies each time
                    sample_array = np.append(sample_array, sample.get_frames_numpy_float(), axis=0)
    print("...saving")
    wavf.write(self.file_location, 44100, sample_array)
    print("saved")
...
Based on Bendzko answer here is my code:
pip install pyaudio miniaudio
import threading
import urllib.request
import time
try:
import miniaudio
except ImportError:
miniaudio = None
import pyaudio
import ctypes
import sys
CHUNK = 4096  # PCM frames pulled from the decoder per loop iteration

# Shared output stream; the format is hard-coded (16-bit stereo, 44.1 kHz)
# and must match what the miniaudio decoder produces.
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,channels=2,rate=44100,output=True)


class RadioThread(threading.Thread):
    """Background thread: connects to an ICEcast stream and plays it until
    the module-level stop_peradio_thread flag is set."""

    def run(self):
        self.url = "https://impradio.bytemasters.gr/8002/stream"
        #run in threading
        client = miniaudio.IceCastClient(self.url)
        pcm_stream = MiniaudioDecoderPcmStream(client.audio_format,client)
        self.audio_playback(pcm_stream)

    def audio_playback(self,pcm_stream):
        """Pump decoded PCM chunks into the shared output stream."""
        global stop_peradio_thread
        while stop_peradio_thread==False:
            try:
                audio = pcm_stream.read(CHUNK)
                stream.write(audio.tobytes())
            except:
                # NOTE(review): bare except hides decode/playback errors and
                # turns any persistent failure into a busy loop — narrow it.
                pass


class MiniaudioDecoderPcmStream(miniaudio.StreamableSource):
    """Adapts miniaudio's generator-based decoder to a read(size) interface."""

    def __init__(self, fmt, stream):
        # stream_any() returns a generator that yields PCM chunks via send()
        self.pcm_stream = miniaudio.stream_any(stream, fmt, dither=miniaudio.DitherMode.TRIANGLE)

    def read(self, size):
        try:
            return self.pcm_stream.send(size)
        except StopIteration:
            # decoder exhausted: signal end-of-stream with empty bytes
            return b""


def main():
    """Start the radio thread and idle until Ctrl+C, then shut down."""
    global stop_peradio_thread
    stop_peradio_thread = False
    t1 = RadioThread()
    t1.start()
    while True:
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            # flag the playback loop to finish, wait for it, then exit
            stop_peradio_thread = True
            t1.join()
            sys.exit()


main()
I want to record a audio track and save it 2 diffrent .wav files. The audio tracks should be saved with a delay of ca. 6 seconds and each .wav should be 12 seconds long.
I tried to do it with multiprocessing and pyaudio, but I can't manage to get it working.
Please note that I am a beginner in Python and that this is my first post on Stack Overflow!
# NOTE(review): illustrative skeleton from the question — the function bodies
# are comments only, so this does not run as-is (each def needs at least a
# `pass`), and Process is presumably multiprocessing.Process (import not shown).
def func1():
    #Record and save a 12 seconds long .wav
def func2():
    #Record and save a 12 seconds long .wav
if __name__ == '__main__':
    p1 = Process(target=func1)
    p1.start()
    p2 = Process(target=func2)
    p2.start()
    p1.join()
    p2.join()
    #start func2 6 seconds after func1
I would expect a data structure like this:
|---1.wav---|---1.wav---|---1.wav---|
|---2.wav---|---2.wav---|---2.wav---|
6sec 12sec 18sec 24sec 30sec 36sec 42sec
EDIT:
I came up with a bit of code that seems to work kind of well. It has a delay of .144 seconds. I would be happy about any improvement of this code. This code uses threading instead of multiprocessing.
import pyaudio
import wave
from threading import Thread
import time
from datetime import datetime
FORMAT = pyaudio.paInt16   # 16-bit samples
CHANNELS = 2               # stereo capture
RATE = 44100               # sample rate in Hz
CHUNK = 1024               # frames per read in func1
CHUNK1 = 1024              # frames per read in func2 (same value, duplicated)
RECORD_SECONDS = 12        # length of each saved clip
# NOTE(review): ':' from this strftime format is not a legal filename
# character on Windows; both prefixes are also rebuilt inside the functions,
# so these initial values are never used for the actual files.
WAVE_OUTPUT_FILENAME1 = name = "outputs/output_1"+datetime.now().strftime("%m:%d:%Y-")
WAVE_OUTPUT_FILENAME2 = name = "outputs/output_2"+datetime.now().strftime("%m:%d:%Y-")
def func1():
    """Record 12-second clips forever, saving each one to a timestamped WAV.

    NOTE(review): only WAVE_OUTPUT_FILENAME1 is rebound, so it is the only
    name that needs `global`; the constants are read-only here. The PyAudio
    instance and stream are re-created for every clip, which is presumably
    where the ~0.145 s gap between files comes from — confirm by keeping one
    stream open (as the answer below does).
    """
    while 1==1:
        global FORMAT
        global CHANNELS
        global RATE
        global CHUNK
        global RECORD_SECONDS
        global WAVE_OUTPUT_FILENAME1
        # reset the prefix each pass (the date part is commented out)
        WAVE_OUTPUT_FILENAME1 = name = "outputs/output1_"#+datetime.now().strftime("%m:%d:%Y-")
        audio = pyaudio.PyAudio()
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            frames_per_buffer=CHUNK)
        print("recording...")
        frames = []
        # append the start timestamp to the file name
        WAVE_OUTPUT_FILENAME1 = WAVE_OUTPUT_FILENAME1+datetime.now().strftime("%H;%M;%S.%f--")
        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            frames.append(data)
        # append the end timestamp and extension
        WAVE_OUTPUT_FILENAME1 = WAVE_OUTPUT_FILENAME1 + datetime.now().strftime("%H;%M;%S.%f")+".wav"
        print("finished recording")
        # stop Recording
        stream.stop_stream()
        stream.close()
        audio.terminate()
        waveFile = wave.open(WAVE_OUTPUT_FILENAME1, 'wb')
        waveFile.setnchannels(CHANNELS)
        waveFile.setsampwidth(audio.get_sample_size(FORMAT))
        waveFile.setframerate(RATE)
        waveFile.writeframes(b''.join(frames))
        waveFile.close()
def func2():
    """Same as func1 (12-second clips in a loop) except for the 6-second
    initial sleep that offsets the two recordings and the '2'-suffixed names.
    The duplicated body could be shared via a prefix parameter.
    """
    time.sleep(6)  # offset this recorder by half a clip length
    while 1==1:
        global FORMAT
        global CHANNELS
        global RATE
        global CHUNK1
        global RECORD_SECONDS
        global WAVE_OUTPUT_FILENAME2
        # reset the prefix each pass (the date part is commented out)
        WAVE_OUTPUT_FILENAME2 = name = "outputs/output2_"#+datetime.now().strftime("%m:%d:%Y-")
        audio = pyaudio.PyAudio()
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            frames_per_buffer=CHUNK1)
        print("recording...")
        frames = []
        # append the start timestamp to the file name
        WAVE_OUTPUT_FILENAME2 = WAVE_OUTPUT_FILENAME2+datetime.now().strftime("%H;%M;%S.%f--")
        for i in range(0, int(RATE / CHUNK1 * RECORD_SECONDS)):
            data = stream.read(CHUNK1)
            frames.append(data)
        # append the end timestamp and extension
        WAVE_OUTPUT_FILENAME2 = WAVE_OUTPUT_FILENAME2 + datetime.now().strftime("%H;%M;%S.%f")+".wav"
        print("finished recording")
        # stop Recording
        stream.stop_stream()
        stream.close()
        audio.terminate()
        waveFile = wave.open(WAVE_OUTPUT_FILENAME2, 'wb')
        waveFile.setnchannels(CHANNELS)
        waveFile.setsampwidth(audio.get_sample_size(FORMAT))
        waveFile.setframerate(RATE)
        waveFile.writeframes(b''.join(frames))
        waveFile.close()


if __name__ == '__main__':
    # two recorder threads; func2 delays itself 6 s for the overlap offset
    Thread(target = func1).start()
    Thread(target = func2).start()
why do you think you need multiprocessing? I think it just complicates things
how about just recording in 6 second (or smaller) chunks/frames and write the correct frames to each file.
I've got a bit carried away, and written a nice class to do this:
import pyaudio
import wave
import time
class OverlappedRecorder:
    """Records overlapping WAV files from the default input device.

    A new file is begun every `secs_between_file` seconds and each file is
    closed once it holds `secs_per_file` seconds of audio, so incoming
    frames are generally written to several open files at once. Use as a
    context manager: recording runs while the `with` block is active.
    """

    def __init__(
            self, secs_per_file, secs_between_file, *,
            num_channels=2, sample_rate=48000,
            sample_format=pyaudio.paInt16,
    ):
        # various constants needed later
        self.num_channels = num_channels
        self.sample_width = pyaudio.get_sample_size(sample_format)
        self.sample_rate = sample_rate
        self.frames_between_start = int(secs_between_file * sample_rate)
        self.frames_per_file = int(secs_per_file * sample_rate)
        # mutable state needed to keep everything going
        self.files = []  # wave writers currently receiving frames
        self.frames_till_next_file = 0  # 0 => first callback opens a file
        self.pa = pyaudio.PyAudio()
        # start=False: the stream only runs between __enter__ and __exit__
        self.stream = self.pa.open(
            format=sample_format, channels=num_channels,
            rate=sample_rate, frames_per_buffer=1024,
            input=True, start=False,
            stream_callback=self._callback,
        )

    def sleep_while_active(self):
        """Block the calling thread until the stream stops."""
        while self.stream.is_active():
            time.sleep(0.2)

    def begin_wave_file(self):
        "internal function to start a new WAV file"
        path = time.strftime(
            'recording-%Y-%m-%d-%H.%M.%S.wav',
            time.localtime()
        )
        file = wave.open(path, 'wb')
        file.setnchannels(self.num_channels)
        file.setsampwidth(self.sample_width)
        file.setframerate(self.sample_rate)
        self.files.append(file)

    # context manager stuff, recording starts when entered using "with"
    def __enter__(self):
        self.stream.start_stream()
        return self

    # exiting shuts everything down
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()
        # close any partially-filled files still open
        for file in self.files:
            file.close()

    # called by pyaudio when a new set of frames are ready
    def _callback(self, data, frame_count, time_info, status):
        self.frames_till_next_file -= frame_count
        # see if we need to start a new file
        if self.frames_till_next_file < 0:
            self.frames_till_next_file += self.frames_between_start
            self.begin_wave_file()
        # can't remove from lists while iterating
        # keep a list of files to close and remove later
        done = []
        for file in self.files:
            # frames this file still needs before reaching its full length
            remain = self.frames_per_file - file.getnframes()
            # add appropriate amount of data to all open files
            if frame_count < remain:
                file.writeframesraw(data)
            else:
                # convert the remaining frame count to a byte count in `data`
                remain *= self.sample_width * self.num_channels
                file.writeframesraw(data[:remain])
                done.append(file)
        # close anything that finished
        for file in done:
            file.close()
            self.files.remove(file)
        # tell pyaudio to keep going
        return (None, pyaudio.paContinue)
basic usage is: create an object, enter it using with and it'll start recording, and when you exit it'll stop and clean up.
# 12-second files, a new one started every 6 seconds; recording runs for
# the duration of the `with` block (30 s here).
rec = OverlappedRecorder(12, 6)
with rec:
    time.sleep(30)
will let it run for 30 seconds, or you could do:
# Same parameters, but block until the stream stops (e.g. Ctrl+C).
with OverlappedRecorder(12, 6) as rec:
    rec.sleep_while_active()
to let it run until you hit Ctrl+C to kill the program, or you could put a call to input() in there to make it stop when you press enter, or whatever else you like.
a few comments on the code you posted:
you only need to declare global variables if you're going to modify them
why do you have seperate functions? why not just have a single function, and just delay start()ing the second Thread
why are you setting WAVE_OUTPUT_FILENAME1 so many times? just save the start_time and end_time, then format the string in one go
you don't have to read() in chunks, if you know it's going to fit in memory just read everything all in one go
you shouldn't need to keep starting and stopping recording, just open it once in each thread and if you're lucky samples will accumulate in the buffer while you're writing the wav file to disk
something like:
import pyaudio
import wave
import time
from datetime import datetime
from threading import Thread
FORMAT = pyaudio.paInt16   # 16-bit samples
CHANNELS = 2               # stereo capture
RATE = 44100               # sample rate in Hz
RECORD_SECONDS = 12        # length of each saved clip
def recorder(prefix):
    """Record back-to-back RECORD_SECONDS clips forever, saving each one as
    a WAV file named with *prefix* plus start/end timestamps."""
    audio = pyaudio.PyAudio()
    stream = audio.open(
        format=FORMAT, channels=CHANNELS,
        rate=RATE, input=True,
    )
    try:
        while True:
            started = datetime.now()
            print("recording started", started)
            # one blocking read grabs the whole clip; False disables the
            # overflow exception so a slow disk write can't kill the loop
            data = stream.read(RATE * RECORD_SECONDS, False)
            finished = datetime.now()
            print("finished", finished)
            name = f'{prefix}{started:%Y-%m-%d-%H-%M-%S.%f}-{finished:%H-%M-%S.%f}.wav'
            print("writing", name)
            with wave.open(name, 'wb') as wav_out:
                wav_out.setnchannels(CHANNELS)
                wav_out.setsampwidth(audio.get_sample_size(FORMAT))
                wav_out.setframerate(RATE)
                wav_out.writeframes(data)
    finally:
        # release the device on any exit path
        stream.stop_stream()
        stream.close()
        audio.terminate()
if __name__ == '__main__':
    # second recorder starts 6 s later so the 12 s clips overlap by half
    Thread(target=recorder, args=('outputs/output_1-',)).start()
    time.sleep(6)
    Thread(target=recorder, args=('outputs/output_2-',)).start()
a few differences:
the version using threading is much less code!
my version allows an arbitrary number of files without using up multiple OS threads for each file (there's the Python thread and pyaudio has an internal thread looking after audio buffers)
my version saves partial files
hope all this helps / makes sense!
I am trying to write a Python script that will record 5-second segments of speech on a loop for as long as the user is speaking, and will stop after three wave files of pure silence. How would I go about this?
import pyaudio
import wave
# Capture parameters for a single fixed-length recording.
CHUNK = 1024                  # frames per read
FORMAT = pyaudio.paInt16      # 16-bit samples
CHANNELS = 2                  # stereo
RATE = 44100                  # sample rate in Hz
RECORD_SECONDS = 5            # clip length
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()
stream = p.open(format = FORMAT,
                channels = CHANNELS,
                rate = RATE,
                input = True,
                frames_per_buffer = CHUNK)

print("* recording")
frames = []
# RATE / CHUNK reads per second, for RECORD_SECONDS seconds total
for j in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)
print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()

# write the captured frames out as a single WAV file
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
I think you can use https://github.com/rhasspy/rhasspy-silence library
Here is a little piece of code for you (not mine, but it works):
from rhasspysilence import WebRtcVadRecorder,VoiceCommand, VoiceCommandResult
import threading
import dataclasses
import typing
from queue import Queue
import json
import io
import os
from pathlib import Path
import shlex
import time
import wave
import sys
import subprocess
import pyaudio
pa = pyaudio.PyAudio()

# you can change the options (these are default settings)
vad_mode = 3             # webrtcvad aggressiveness passed to the recorder
sample_rate = 16000      # Hz
min_seconds = 1
max_seconds = 30
speech_seconds = 0.3
silence_seconds = 0.5
before_seconds = 0.2
chunk_size = 960         # bytes read from the mic per VAD step
skip_seconds = 0
audio_source = None
channels = 1


def SpeechToText():
    """Record from the microphone until rhasspysilence decides the voice
    command has ended, then save the captured audio as a WAV file."""
    recorder = WebRtcVadRecorder(vad_mode=vad_mode,)
    recorder.start()
    # file directory
    wav_sink = 'wavs/'
    wav_dir = None
    # file name
    wav_filename = 'tester'
    if wav_sink:
        wav_sink_path = Path(wav_sink)
        if wav_sink_path.is_dir():
            # Directory to write WAV files
            wav_dir = wav_sink_path
        else:
            # Single WAV file to write
            wav_sink = open(wav_sink, "wb")
    voice_command: typing.Optional[VoiceCommand] = None
    audio_source = pa.open(rate=sample_rate,format=pyaudio.paInt16,channels=channels,input=True,frames_per_buffer=chunk_size)
    audio_source.start_stream()
    print("Ready", file=sys.stderr)

    def buffer_to_wav(buffer: bytes) -> bytes:
        """Wraps a buffer of raw audio data in a WAV"""
        rate = int(sample_rate)
        width = int(2)
        channels = int(1)
        with io.BytesIO() as wav_buffer:
            wav_file: wave.Wave_write = wave.open(wav_buffer, mode="wb")
            with wav_file:
                wav_file.setframerate(rate)
                wav_file.setsampwidth(width)
                wav_file.setnchannels(channels)
                wav_file.writeframesraw(buffer)
            return wav_buffer.getvalue()

    try:
        chunk = audio_source.read(chunk_size)
        while chunk:
            # Look for speech/silence
            voice_command = recorder.process_chunk(chunk)
            if voice_command:
                is_timeout = voice_command.result == VoiceCommandResult.FAILURE
                # Reset
                audio_data = recorder.stop()
                if wav_dir:
                    # Write WAV to directory
                    wav_path = (wav_dir / time.strftime(wav_filename)).with_suffix(
                        ".wav"
                    )
                    wav_bytes = buffer_to_wav(audio_data)
                    wav_path.write_bytes(wav_bytes)
                    print(wav_path)
                    print('file saved')
                    break
                elif wav_sink:
                    # Write to WAV file
                    # NOTE(review): `core` is not defined anywhere in this
                    # script (presumably this should call the local
                    # buffer_to_wav), so this branch raises NameError as
                    # written.
                    wav_bytes = core.buffer_to_wav(audio_data)
                    wav_sink.write(wav_bytes)
            # Next audio chunk
            chunk = audio_source.read(chunk_size)
    finally:
        try:
            # NOTE(review): PyAudio streams expose close()/stop_stream();
            # close_stream() presumably raises AttributeError, which the
            # except below silently swallows — confirm against PyAudio docs.
            audio_source.close_stream()
        except Exception:
            pass


#execute command
SpeechToText()
I've been tinkering around with pyaudio for a while now, trying to reverse a simple wave file with no success.
In (my) theory I would only have to iterate from end to beginning through the file with every callback of pyaudio (1024 frames) fetch the audio data from the according index in the file, reverse the resulting string and play it.
Here is my code (only pyaudio callback and file handling, the rest is untouched from the example code):
import pyaudio
import wave
import time
import sys
# guard: require the wav file path on the command line
if len(sys.argv) < 2:
    print("Plays a wave file.\n\nUsage: %s filename.wav" % sys.argv[0])
    sys.exit(-1)

# start 40 buffers (40*1024 frames) into the file and walk backwards
index = 40*1024

wf = wave.open(sys.argv[1], 'rb')
wf.setpos(index)

p = pyaudio.PyAudio()


def callback(in_data, frame_count, time_info, status):
    """PyAudio callback: read one buffer, reverse it, step the read
    position back one buffer.

    NOTE(review): data[::-1] reverses *bytes*, not frames — for 16-bit or
    multi-channel audio it also swaps the bytes inside each sample, so the
    output is corrupted rather than cleanly reversed; reverse in units of
    sampwidth*channels bytes instead. Also, once index goes negative the
    setpos() below fails (the crash at file start mentioned in the post).
    """
    global index
    data = wf.readframes(frame_count)
    data = data[::-1]
    index-=1024
    wf.setpos(index)
    return (data, pyaudio.paContinue)


stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True,
                stream_callback=callback)

stream.start_stream()
# keep the main thread alive while the callback drives playback
while stream.is_active():
    time.sleep(0.1)

stream.stop_stream()
stream.close()
wf.close()
p.terminate()
I know this will crash when it reaches the file beginning, but it should play 40 × 1024 frames of reversed audio...
If the file you want reversed is small enough to fit in memory, your best bet would be loading it entirely and reversing the data, then streaming it:
import pyaudio
import wave
# Load an entire WAV file, reverse it, and stream it back out.
wavefile_name = 'wavefile.wav'
wf = wave.open(wavefile_name, 'rb')
p = pyaudio.PyAudio()
stream = p.open(format =
                p.get_format_from_width(wf.getsampwidth()),
                channels = wf.getnchannels(),
                rate = wf.getframerate(),
                output = True)

# read the whole file into memory
full_data = []
data = wf.readframes(1024)
while data:
    full_data.append(data)
    data = wf.readframes(1024)

# readframes() yields bytes, so join with b'' — ''.join(...) raises
# TypeError on Python 3. Reverse whole frames, not raw bytes: a plain
# byte-wise [::-1] would also flip the byte order inside each multi-byte
# sample (and swap channels), turning 16-bit audio into noise.
audio = b''.join(full_data)
frame_size = wf.getsampwidth() * wf.getnchannels()
frames = [audio[i:i + frame_size] for i in range(0, len(audio), frame_size)]
data = b''.join(reversed(frames))

# stream the reversed audio out in 1024-byte slices
for i in range(0, len(data), 1024):
    stream.write(data[i:i + 1024])
However, if the file is too big to fit in memory, you'll need some way of reading the file backwards and feed into the sound system. That's an entirely different problem, because it involves some low level I/O programming to handle the backward reading.
Edit: after seeing your full code, I don't see any errors. The code runs properly in my computer, only to fail at the ending of the playback. However, you can do two things. First, you can go to the end of the wavefile to play the entire sound backwards, using this:
# start at the last full buffer so the entire file plays reversed
wf = wave.open(sys.argv[1], 'rb')
index = wf.getnframes() - 1024
wf.setpos(index)
Second, you have to modify the callback so it doesn't fail when the seek head goes beyond the beginning of the file:
def callback(in_data, frame_count, time_info, status):
    """Reversed-playback callback that stops cleanly at the file start.

    NOTE(review): the byte-level [::-1] reverses the raw byte stream, which
    corrupts multi-byte samples (16-bit, stereo); reverse per frame of
    sampwidth*channels bytes for clean audio.
    """
    global index
    data = wf.readframes(frame_count)
    data = data[::-1]
    index-=1024
    if index < 0:
        # reached the beginning: hand back the final buffer and tell
        # pyaudio to abort the stream instead of seeking negative
        return (data, pyaudio.paAbort)
    else:
        wf.setpos(max(index,0))
        return (data, pyaudio.paContinue)
Other than that, it works pretty ok.
I'm having some problems and I cannot seem to get my head around the concept.
What I am trying to do is this:
Have the microphone "listen" for voiced (above a particular threshold) and then start recording to a .wav file until the person has stopped speaking / the signal is no longer there. For example:
begin:
listen() -> nothing is being said
listen() -> nothing is being said
listen() -> VOICED - _BEGIN RECORDING_
listen() -> VOICED - _BEGIN RECORDING_
listen() -> UNVOICED - _END RECORDING_
end
I want to do this also using "threading" so a thread would be created that "listens" to the file constantly, and, another thread will begin when there is voiced data.. But, I cannot for the life of me figure out how I should go about it.. Here is my code so far:
import wave
import sys
import threading
from array import array
from sys import byteorder
try:
import pyaudio
CHECK_PYLIB = True
except ImportError:
CHECK_PYLIB = False
class Audio:
    """Question code: wraps PyAudio capture with a threaded listen loop.

    NOTE(review): several issues as posted — `self.format = pyaudio.paInt16,`
    has a trailing comma and therefore stores a 1-tuple; is_silence() is an
    instance method but lacks `self` and references THRESHOLD without it;
    listen() returns `sample_width`, which is never assigned (and the post's
    flattened indentation makes it unclear whether the return sits inside or
    after the infinite loop); open() ignores the constructor parameters and
    hard-codes the stream settings.
    """

    # class-level defaults, overwritten per instance in __init__
    _chunk = 0.0
    _format = 0.0
    _channels = 0.0
    _rate = 0.0
    record_for = 0.0
    stream = None
    p = None
    sample_width = None
    THRESHOLD = 500  # peak-sample level below which a chunk counts as silence

    # initial constructor to accept params
    def __init__(self, chunk, format, channels, rate):
        #### set data-types
        self._chunk = chunk
        self.format = pyaudio.paInt16,
        self.channels = channels
        self.rate = rate
        self.p = pyaudio.PyAudio();

    def open(self):
        # print "opened"
        self.stream = self.p.open(format=pyaudio.paInt16,
                                  channels=2,
                                  rate=44100,
                                  input=True,
                                  frames_per_buffer=1024);
        return True

    def record(self):
        # create a new instance/thread to record the sound
        threading.Thread(target=self.listen).start();

    def is_silence(snd_data):
        # True when the loudest sample in the chunk is below the threshold
        return max(snd_data) < THRESHOLD

    def listen(self):
        r = array('h')
        while True:
            # 'h' = signed 16-bit samples
            snd_data = array('h', self.stream.read(self._chunk))
            if byteorder == 'big':
                # presumably the stream delivers little-endian bytes;
                # swapped on big-endian hosts — confirm
                snd_data.byteswap()
            r.extend(snd_data)
        return sample_width, r
I'm guessing that I could record "5" second blocks, and, then if the block is deemed as "voiced" then it the thread should be started until all the voice data has been captured. However, because at current it's at while True: i don't want to capture all of the audio up until there are voiced commands, so e.g. "no voice", "no voice", "voice", "voice", "no voice", "no voice" i just want the "voice" inside the wav file.. Anyone have any suggestions?
Thank you
EDIT:
import wave
import sys
import time
import threading
from array import array
from sys import byteorder
from Queue import Queue, Full
import pyaudio
# NOTE(review): Python 2 code (print statements, Queue module).
CHUNK_SIZE = 1024          # frames per mic read
MIN_VOLUME = 500           # peak-sample threshold for "words"
BUF_MAX_SIZE = 1024 * 10   # queue capacity, in bytes (10 chunks)

# NOTE(review): main() below assigns a *local* process_g (no `global`
# statement), so listen() keeps printing this module-level 0, not the
# Thread object.
process_g = 0


def main():
    """Start the listen and process threads and wait for Ctrl+C."""
    stopped = threading.Event()
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK_SIZE)))

    listen_t = threading.Thread(target=listen, args=(stopped, q))
    listen_t.start()
    process_g = threading.Thread(target=process, args=(stopped, q))
    process_g.start()

    try:
        while True:
            # short joins keep the main thread responsive to KeyboardInterrupt
            listen_t.join(0.1)
            process_g.join(0.1)
    except KeyboardInterrupt:
        stopped.set()
        listen_t.join()
        process_g.join()


def process(stopped, q):
    """Placeholder consumer: prints, then sleeps 300 s (5 minutes) per pass."""
    while True:
        if stopped.wait(timeout = 0):
            break
        print "I'm processing.."
        time.sleep(300)


def listen(stopped, q):
    """Read ~5-second batches from the mic and print whether each chunk
    crosses the volume threshold. The queue `q` is accepted but never
    written to in this version."""
    stream = pyaudio.PyAudio().open(
        format = pyaudio.paInt16,
        channels = 2,
        rate = 44100,
        input = True,
        frames_per_buffer = 1024
    )
    while True:
        if stopped and stopped.wait(timeout=0):
            break
        try:
            print process_g
            # 44100/1024 reads per second, for 5 seconds
            for i in range(0, int(44100 / 1024 * 5)):
                data_chunk = array('h', stream.read(CHUNK_SIZE))
                vol = max(data_chunk)
                if(vol >= MIN_VOLUME):
                    print "WORDS.."
                else:
                    print "Nothing.."
        except Full:
            pass


if __name__ == '__main__':
    main()
Now, after every 5 seconds, I need the "process" function to execute and then process the data (time.sleep(10) while it does this), and then start the recording back up.
Having spent some time on it, I've come up with the following code that seems to be doing what you need, except writing to file:
import threading
from array import array
from Queue import Queue, Full
import pyaudio
# NOTE(review): Python 2 code (print statements, Queue module).
CHUNK_SIZE = 1024   # frames per mic read / per queue item
MIN_VOLUME = 500    # peak-sample threshold separating speech from silence
# if the recording thread can't consume fast enough, the listener will start discarding
BUF_MAX_SIZE = CHUNK_SIZE * 10


def main():
    """Wire up the producer (listen) and consumer (record) threads and run
    until KeyboardInterrupt, then signal both to stop."""
    stopped = threading.Event()
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK_SIZE)))

    listen_t = threading.Thread(target=listen, args=(stopped, q))
    listen_t.start()
    record_t = threading.Thread(target=record, args=(stopped, q))
    record_t.start()

    try:
        while True:
            # brief joins keep the main thread interruptible with Ctrl+C
            listen_t.join(0.1)
            record_t.join(0.1)
    except KeyboardInterrupt:
        stopped.set()
        listen_t.join()
        record_t.join()


def record(stopped, q):
    """Consumer: pull chunks off the queue and classify loud vs. quiet.

    NOTE(review): q.get() blocks, so after stop is signalled this thread
    only exits once one more chunk arrives.
    """
    while True:
        if stopped.wait(timeout=0):
            break
        chunk = q.get()
        vol = max(chunk)
        if vol >= MIN_VOLUME:
            # TODO: write to file
            print "O",
        else:
            print "-",


def listen(stopped, q):
    """Producer: read mic chunks and enqueue them for the recorder."""
    stream = pyaudio.PyAudio().open(
        format=pyaudio.paInt16,
        channels=2,
        rate=44100,
        input=True,
        frames_per_buffer=1024,
    )
    while True:
        if stopped.wait(timeout=0):
            break
        try:
            # 'h' = signed 16-bit samples, matching paInt16
            # NOTE(review): Queue.put() blocks by default and only raises
            # Full when called with block=False — as written this except
            # presumably never fires; confirm against the Queue docs.
            q.put(array('h', stream.read(CHUNK_SIZE)))
        except Full:
            pass  # discard
Look here:
https://github.com/jeysonmc/python-google-speech-scripts/blob/master/stt_google.py
It even converts Wav to flac and sends it to the google Speech api , just delete the stt_google_wav function if you dont need it ;)