I am trying to write a Python script that will record 5-second segments of speech on a loop for as long as the user is speaking, and will stop after three wave files of pure silence. How would I go about this?
import pyaudio
import wave
# Settings for one fixed-length recording.
CHUNK = 1024                         # frames requested from the stream per read
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

print("* recording")
frames = []
# Number of CHUNK-sized reads that make up RECORD_SECONDS of audio.
num_reads = int(RATE / CHUNK * RECORD_SECONDS)
for j in range(num_reads):
    data = stream.read(CHUNK)
    frames.append(data)
print("* done recording")

# Release the audio device before touching the file system.
stream.stop_stream()
stream.close()
p.terminate()

# Store everything that was captured as a single WAV file.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
I think you can use the rhasspy-silence library (https://github.com/rhasspy/rhasspy-silence).
Here is some example code for you (it is not mine, but it works):
from rhasspysilence import WebRtcVadRecorder,VoiceCommand, VoiceCommandResult
import threading
import dataclasses
import typing
from queue import Queue
import json
import io
import os
from pathlib import Path
import shlex
import time
import wave
import sys
import subprocess
import pyaudio
# Shared PyAudio instance used to open the microphone stream.
pa = pyaudio.PyAudio()

# Recording options -- change them as needed (these are the default settings).
vad_mode = 3
sample_rate = 16000
min_seconds = 1
max_seconds = 30
speech_seconds = 0.3
silence_seconds = 0.5
before_seconds = 0.2
chunk_size = 960
skip_seconds = 0
audio_source = None
channels = 1
def SpeechToText():
    """Record one voice command from the microphone and save it as a WAV file.

    Audio is read from a PyAudio input stream in pieces of ``chunk_size``
    frames and passed to a ``WebRtcVadRecorder``.  As soon as the recorder
    reports a voice command, the recorded audio is written to
    ``wavs/<wav_filename>.wav`` (the name is expanded with
    ``time.strftime``), the path is printed and recording stops.

    The module-level options (``vad_mode``, ``sample_rate``, ``min_seconds``,
    ...) and the shared PyAudio instance ``pa`` are read from the enclosing
    module.

    Returns:
        The ``pathlib.Path`` of the WAV file that was written, or ``None``
        if the audio source ran dry before a voice command was detected.
    """
    # Pass the module-level options on; previously only vad_mode was used, so
    # editing the other settings had no effect.
    # NOTE(review): keyword names follow the rhasspy-silence constructor --
    # confirm against the installed version.
    recorder = WebRtcVadRecorder(
        vad_mode=vad_mode,
        sample_rate=sample_rate,
        skip_seconds=skip_seconds,
        min_seconds=min_seconds,
        max_seconds=max_seconds,
        speech_seconds=speech_seconds,
        silence_seconds=silence_seconds,
        before_seconds=before_seconds,
    )
    recorder.start()

    # Directory and base name of the output file.  The directory is created
    # on demand; the old fallback tried to open() the directory path itself
    # as a file, which can only fail.
    wav_dir = Path('wavs/')
    wav_dir.mkdir(parents=True, exist_ok=True)
    wav_filename = 'tester'

    def buffer_to_wav(buffer: bytes) -> bytes:
        """Wraps a buffer of raw audio data in a WAV"""
        with io.BytesIO() as wav_buffer:
            wav_file: wave.Wave_write = wave.open(wav_buffer, mode="wb")
            with wav_file:
                wav_file.setframerate(int(sample_rate))
                wav_file.setsampwidth(2)   # paInt16 -> 2 bytes per sample
                wav_file.setnchannels(1)
                wav_file.writeframesraw(buffer)
            return wav_buffer.getvalue()

    audio_source = pa.open(
        rate=sample_rate,
        format=pyaudio.paInt16,
        channels=channels,
        input=True,
        frames_per_buffer=chunk_size,
    )
    audio_source.start_stream()
    print("Ready", file=sys.stderr)

    wav_path = None
    try:
        chunk = audio_source.read(chunk_size)
        while chunk:
            # Look for speech/silence
            voice_command = recorder.process_chunk(chunk)
            if voice_command:
                if voice_command.result == VoiceCommandResult.FAILURE:
                    # The original computed this flag (is_timeout) but never
                    # reported it.
                    print("Recording timed out", file=sys.stderr)

                # Reset the recorder and collect what it captured.
                audio_data = recorder.stop()

                # Write WAV to directory
                wav_path = (wav_dir / time.strftime(wav_filename)).with_suffix(
                    ".wav"
                )
                wav_path.write_bytes(buffer_to_wav(audio_data))
                print(wav_path)
                print('file saved')
                break

            # Next audio chunk
            chunk = audio_source.read(chunk_size)
    finally:
        # PyAudio streams are released with stop_stream()/close(); the old
        # close_stream() call does not exist and its error was swallowed,
        # leaving the device open.
        audio_source.stop_stream()
        audio_source.close()

    return wav_path


# Record a single voice command.
SpeechToText()
Related
I want to play a WAV file using PyAudio in callback mode. I can play the file once, but then playback stops. For example, if I want to play a WAV file (a trumpet sound, etc.) every five seconds, how should I implement this? Also, please let me know how to play an array (WAV data) in callback mode, just in case.
The code that I've implemented so far in the following...
import pyaudio
import numpy as np
import wave
import time
FORMAT = pyaudio.paInt16
CHANNEL = 2
CHUNK = 1024
RATE = 44100
flag = True   # set by the main loop to ask the callback to restart playback
cur_dt = 0

# reading wav file here
wf = wave.open('test_3.wav', 'rb')

# instantiate PyAudio (1)
p = pyaudio.PyAudio()


# define callback (2)
def callback(in_data, frame_count, time_info, status):
    """Supply the next ``frame_count`` frames of the wave file to PyAudio.

    When ``flag`` is set the file is rewound, so playback starts over from
    the beginning.  Reads that come back short (end of file) are padded with
    silence: PyAudio treats a buffer shorter than ``frame_count`` as the end
    of the stream, which is why playback used to stop after the first pass.

    Returns:
        ``(audio_bytes, pyaudio.paContinue)``.
    """
    global flag, cur_dt, wf
    if flag:
        # Rewind instead of reopening, so no file handle leaks per restart.
        wf.rewind()
        flag = False

    data = wf.readframes(frame_count)
    expected_bytes = frame_count * wf.getnchannels() * wf.getsampwidth()
    if len(data) < expected_bytes:
        data += b'\x00' * (expected_bytes - len(data))

    # convert bytes to int
    data_int = np.frombuffer(data, dtype=np.int16)
    # convert int to float
    data_float = data_int.astype(np.float32) / 32768.0

    # ur processing...
    f1 = np.fft.fft(data_float)
    f2 = np.fft.ifft(f1)
    # The inverse FFT is complex; keep the real part, round and clip before
    # converting back to 16-bit samples.
    f3 = np.clip(np.rint(f2.real * 32768.0), -32768, 32767).astype(np.int16)

    # make it sound!
    return (f3.tobytes(), pyaudio.paContinue)


# open stream using callback (3)
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True,
                stream_callback=callback)

stream.start_stream()

try:
    # request a restart of the sound every five seconds (5)
    while stream.is_active():
        time.sleep(5)
        flag = True
except KeyboardInterrupt:
    # Ctrl+C is the normal way to end the endless replay loop.
    pass
finally:
    # stop stream (6)
    stream.stop_stream()
    stream.close()
    wf.close()
    # close PyAudio (7)
    p.terminate()
I have a normal stream in Pyaudio that plays a wav file but I can not find any way to change the volume of the stream. I tried multiplying the data by some number but that did not work.
import pyaudio
import wave
# A single PyAudio instance is enough; both original names refer to it.
audio = pyaudio.PyAudio()
pa = audio

chunk = 1024   # frames read from the file per write
af = wave.open("filename.wav", 'rb')
stream = pa.open(format=pa.get_format_from_width(af.getsampwidth()),
                 channels=af.getnchannels(),
                 rate=af.getframerate(),
                 output_device_index=9,
                 output=True)
try:
    rd_data = af.readframes(chunk)
    # readframes() returns bytes, so test truthiness: comparing against the
    # str '' is always unequal on Python 3 and the loop never terminated.
    while rd_data:
        # do some magic to change the volume
        stream.write(rd_data)
        rd_data = af.readframes(chunk)
finally:
    # Release the output device and the file even if playback fails.
    stream.stop_stream()
    stream.close()
    pa.terminate()
    af.close()
You can use numpy for this:
import numpy
def audio_datalist_set_volume(datalist, volume):
    """Scale the volume of every audio chunk in ``datalist`` in place.

    Args:
        datalist: Mutable list of chunks, each a bytes-like object holding
            signed 16-bit samples.  Every entry is replaced by a
            ``numpy.int16`` array with the scaled samples.
        volume: Volume in percent; 100 leaves the samples unchanged.

    Returns:
        None.  ``datalist`` is modified in place.
    """
    sound_level = volume / 100.0
    limits = numpy.iinfo(numpy.int16)
    for i, raw in enumerate(datalist):
        # frombuffer replaces the deprecated binary mode of numpy.fromstring.
        samples = numpy.frombuffer(raw, dtype=numpy.int16)
        # Clip before casting so loud samples saturate instead of wrapping
        # around to the opposite sign.
        scaled = numpy.clip(samples * sound_level, limits.min, limits.max)
        datalist[i] = scaled.astype(numpy.int16)
With the Python script shown below I try to play a wav file from the internet but I'm getting the error message OSError: [Errno 22] Invalid argument: 'https://file-examples-com.github.io/uploads/2017/11/file_example_WAV_1MG.wav'.
How can I play a wav file from the internet?
import pyaudio
import wave
chunk = 1024   # frames handed to the output stream per write

# NOTE: the URL is passed straight to wave.open(); this is the call that
# raises the OSError quoted in the question.
f = wave.open(
    "https://file-examples-com.github.io/uploads/2017/11/file_example_WAV_1MG.wav",
    "rb",
)

p = pyaudio.PyAudio()
stream = p.open(
    format=p.get_format_from_width(f.getsampwidth()),
    channels=f.getnchannels(),
    rate=f.getframerate(),
    output=True,
)

# Play chunk by chunk until readframes() comes back empty.
data = f.readframes(chunk)
while len(data) > 0:
    stream.write(data)
    data = f.readframes(chunk)

# Shut the stream down and release PyAudio.
stream.stop_stream()
stream.close()
p.terminate()
You can also get the content of website, store it in a variable, and play it. There is no need to store it on the disk for a short file like this. Here is an example of how to do this:
import io
import logging
import wave

import requests
import simpleaudio

logging.basicConfig(level=logging.INFO)

audio_url = "https://file-examples-com.github.io/uploads/2017/11/file_example_WAV_1MG.wav"
logging.info("Downloading audio file from: %s", audio_url)

response = requests.get(audio_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to play an error page.
response.raise_for_status()
content = response.content

# Parse the WAV container in memory: the format is taken from the header
# instead of being hard-coded, and only the audio frames (not the header
# bytes) are handed to the player.
with wave.open(io.BytesIO(content), "rb") as wav_file:
    sample_rate = wav_file.getframerate()
    num_channels = wav_file.getnchannels()
    bytes_per_sample = wav_file.getsampwidth()
    audio_data = wav_file.readframes(wav_file.getnframes())

# Named wave_obj so that the stdlib ``wave`` module is not shadowed.
wave_obj = simpleaudio.WaveObject(audio_data=audio_data,
                                  sample_rate=sample_rate,
                                  num_channels=num_channels,
                                  bytes_per_sample=bytes_per_sample)
control = wave_obj.play()
control.wait_done()
Here is a demonstration of what @larsks suggests: download the file to disk first, then open it.
import wave

import requests

audio_url = "https://file-examples-com.github.io/uploads/2017/11/file_example_WAV_1MG.wav"
audio_file = "file_example_WAV_1MG.wav"   # local path the download is saved to

# Request first and only create the local file once the download succeeded,
# so a failed request does not leave an empty file behind.
resp = requests.get(audio_url)
if resp.status_code != 200:
    print(resp.reason)
    raise SystemExit(1)

with open(audio_file, 'wb') as a:
    a.write(resp.content)
print('downloaded')

f = wave.open(audio_file, "rb")
# the remaining lines are the same
I also suggest another great Python library, python-mpv, which is based on mpv. It can handle many more codecs and can also play online streams.
I made a program that records audio and saves it to files in a directory, and the files are created properly. However, when I open one to hear what was recorded, it contains no audio data. I am not sure what I am doing wrong. Please take a look and let me know.
from playsound import playsound
from random import randrange
import pyttsx3
from datetime import datetime
import pyaudio
import speech_recognition as sr
import requests
import wave
import numpy as np
import sounddevice as sd
import math
import time
import os
import sys
import sounddevice as sd
from scipy.io.wavfile import write
import struct
def voiceDetection():
    """Record from the microphone until it stays quiet, then save one WAV file.

    Audio is read in chunks; every chunk whose RMS level reaches
    ``SoundThreshHold`` pushes the deadline ``TimeoutLength`` seconds into
    the future.  When the deadline passes, all recorded chunks are written to
    ``<n>.wav`` in ``f_name_directory``, where ``<n>`` is the number of
    entries already in that directory.
    """
    SoundThreshHold = 50
    TimeoutLength = 5
    chunk = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 2   # number of channels captured from the input device
    RATE = 16000   # Rate at which you sample
    f_name_directory = r"C:\Users\x\OneDrive\Desktop\Record"

    def rms(data):
        """Return the RMS level of 16-bit sample data, scaled by 1000."""
        count = len(data) // 2
        if count == 0:
            return 0.0
        shorts = struct.unpack("%dh" % count, data[:count * 2])
        sum_squares = 0.0
        for sample in shorts:
            n = sample * (1.0 / 32768)
            sum_squares += n * n
        return math.sqrt(sum_squares / count) * 1000

    p = pyaudio.PyAudio()
    sample_width = p.get_sample_size(FORMAT)
    # Input only: nothing is ever written to the stream, so the previous
    # output=True is not needed.
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=chunk)

    frames = []
    end = time.time() + TimeoutLength
    try:
        while time.time() < end:
            data = stream.read(chunk)
            if rms(data) >= SoundThreshHold:
                # Sound detected: extend the deadline.
                end = time.time() + TimeoutLength
            frames.append(data)
    finally:
        # Always release the audio device, even if reading fails.
        stream.stop_stream()
        stream.close()
        p.terminate()

    os.makedirs(f_name_directory, exist_ok=True)
    n_files = len(os.listdir(f_name_directory))
    filename = os.path.join(f_name_directory, '{}.wav'.format(n_files))
    # Write the whole recording once; writing only the last chunk (``data``)
    # is what produced files without usable audio.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print('Written to file: {}'.format(filename))


voiceDetection()
The current code writes a separate WAV file once per chunk, and always with the same name, so each file overwrites the WAV written for the previous chunk. You probably intended to call wave.open once before the loop and wf.close once after the loop, so that one WAV is written for the whole session.
Edit: Interspersing file IO during the audio recording might be too much overhead to record properly without dropping samples. You could try instead buffering up all the samples in memory and then writing the WAV all at once afterward. On the pyaudio homepage, there is a "record" example to record several seconds of audio and write it as a WAV file:
"""PyAudio example: Record a few seconds of audio and save to a WAVE file."""
import pyaudio
import wave
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()

# Open an input stream that is read CHUNK frames at a time.
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

print("* recording")

# Buffer the whole recording in memory; the file is written afterwards.
frames = []
total_reads = int(RATE / CHUNK * RECORD_SECONDS)
for i in range(total_reads):
    data = stream.read(CHUNK)
    frames.append(data)

print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()

# Dump the buffered chunks into a single WAV file.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
As a general note, beware that unfortunately the (standard CPython) Python interpreter has limited ability to execute threads truly simultaneously, which makes Python a poor language for real time audio applications (see also Does python support multiprocessor/multicore programming?). Depending on your project goals, you might want to switch to C++ and use the portaudio C library (on which pyaudio is based).
I'm trying to modify my first Python program. I'm trying to use this repository to do some rudimentary text-to-speech. It does fine, but I want to improve it.
From the looks of it, there is a 0.145 second delay between samples played. Not all the samples of my voice will be 0.145 seconds, however, and I want to have each sample play one after the other with no delays or skips.
import re
import wave
import pyaudio
import _thread
import time
class TextToSpeech:
    """Rudimentary text-to-speech that plays one WAV file per phoneme.

    Words are looked up in a pronunciation dictionary file and each phoneme
    is played from ``sounds/<PHONEME>.wav`` on its own thread, with the
    threads started 0.145 seconds apart.
    """

    CHUNK = 1024   # frames written to the output stream per call

    def __init__(self, words_pron_dict: str = 'cmudict-0.7b.txt'):
        """Load the pronunciation dictionary from ``words_pron_dict``."""
        self._l = {}
        self._load_words(words_pron_dict)

    def _load_words(self, words_pron_dict: str):
        """Fill ``self._l`` with ``WORD -> [PHONEME, ...]`` entries.

        Lines starting with ``;;;`` and lines without a pronunciation are
        skipped.  Only runs of capital letters are kept for each phoneme.
        """
        with open(words_pron_dict, 'r') as file:
            for line in file:
                if line.startswith(';;;'):
                    continue
                # Split on the first run of whitespace only.  The previous
                # split(' ', 2) produced three fields for any entry with more
                # than one phoneme and failed to unpack into two names.
                parts = line.split(None, 1)
                if len(parts) != 2:
                    continue
                key, val = parts
                self._l[key] = re.findall(r"[A-Z]+", val)

    def get_pronunciation(self, str_input):
        """Print the phonemes of ``str_input`` and schedule their playback.

        Words that are not in the dictionary are ignored.
        """
        list_pron = []
        for word in re.findall(r"[\w']+", str_input.upper()):
            if word in self._l:
                list_pron += self._l[word]
        print(list_pron)

        delay = 0.0
        for pron in list_pron:
            _thread.start_new_thread(TextToSpeech._play_audio, (pron, delay,))
            delay += 0.145

    @staticmethod
    def _play_audio(sound, delay):
        """Wait ``delay`` seconds, then play ``sounds/<sound>.wav``.

        Playback problems are reported on stdout instead of being silently
        discarded; the audio resources are released in every case.
        """
        path = "sounds/" + sound + ".wav"
        time.sleep(delay)
        try:
            with wave.open(path, 'rb') as wf:
                p = pyaudio.PyAudio()
                try:
                    stream = p.open(
                        format=p.get_format_from_width(wf.getsampwidth()),
                        channels=wf.getnchannels(),
                        rate=wf.getframerate(),
                        output=True)
                    try:
                        data = wf.readframes(TextToSpeech.CHUNK)
                        while data:
                            stream.write(data)
                            data = wf.readframes(TextToSpeech.CHUNK)
                    finally:
                        stream.stop_stream()
                        stream.close()
                finally:
                    p.terminate()
        except (OSError, EOFError, wave.Error) as err:
            # Best effort: one missing or broken sample must not stop the
            # remaining phonemes from playing.
            print("Could not play {}: {}".format(path, err))


if __name__ == '__main__':
    tts = TextToSpeech()
    while True:
        tts.get_pronunciation(input('Enter a word or phrase: '))
I've tried getting rid of the threading and the delay, but there is still some delay between samples. I think that, instead of incrementing the delay by 0.145, I should increment it by the length of the sample in seconds, but I've looked at the PyAudio documentation and I have no idea how to do that.
Can someone help?
Here is a modified code that plays wav files continuously.
import re
import wave
import pyaudio
class TextToSpeech:
    """Rudimentary text-to-speech that plays phoneme WAV files back to back.

    Words are looked up in a pronunciation dictionary file and the matching
    ``sounds/<PHONEME>.wav`` files are written to one output stream in
    sequence.
    """

    CHUNK = 1024   # frames written to the output stream per call

    def __init__(self, words_pron_dict='cmudict-0.7b.txt'):
        """Load the pronunciation dictionary from ``words_pron_dict``."""
        self._l = {}
        self._load_words(words_pron_dict)

    def _load_words(self, words_pron_dict: str):
        """Fill ``self._l`` with ``WORD -> [PHONEME, ...]`` entries.

        Lines starting with ``;;;`` and lines without a pronunciation are
        skipped.  Only runs of capital letters are kept for each phoneme.
        """
        with open(words_pron_dict, 'r') as file:
            for line in file:
                if line.startswith(';;;'):
                    continue
                # Split on the first run of whitespace only.  The previous
                # split(' ', 2) produced three fields for any entry with more
                # than one phoneme and failed to unpack into two names.
                parts = line.split(None, 1)
                if len(parts) != 2:
                    continue
                key, val = parts
                self._l[key] = re.findall(r"[A-Z]+", val)

    def get_pronunciation(self, str_input):
        """Print the phonemes of ``str_input`` and play them without gaps.

        Words that are not in the dictionary are ignored.  The output stream
        is opened with the format of the first WAV file, so all files are
        assumed to share that format.
        """
        list_pron = []
        for word in re.findall(r"[\w']+", str_input.upper()):
            if word in self._l:
                list_pron += self._l[word]
        print(list_pron)
        if not list_pron:
            # Nothing to play: do not open the audio device at all.
            return

        p = pyaudio.PyAudio()
        stream = None
        try:
            # play each wav file continuously
            for pron in list_pron:
                with wave.open("sounds/" + pron + ".wav", 'rb') as wf:
                    if stream is None:
                        # Take the stream format from the first file instead
                        # of hard-coding 16-bit / stereo / 44.1 kHz.
                        stream = p.open(
                            format=p.get_format_from_width(wf.getsampwidth()),
                            channels=wf.getnchannels(),
                            rate=wf.getframerate(),
                            output=True,
                            frames_per_buffer=self.CHUNK)
                    data = wf.readframes(TextToSpeech.CHUNK)
                    while data:
                        stream.write(data)
                        data = wf.readframes(TextToSpeech.CHUNK)
        finally:
            # Release the device even when a sample is missing or broken.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()


if __name__ == '__main__':
    tts = TextToSpeech()
    while True:
        tts.get_pronunciation(input('Enter a word or phrase: '))