I have a function that connects to my microphone and recognizes speech as a string with a phrase.
It looks like this:
import json, pyaudio
from Analysis import Voice_tag, nlp
from vosk import Model, KaldiRecognizer
class VoiseRecorder():
def __init__(self):
model = Model("model1")
self.rec = KaldiRecognizer(model, 128000)
p = pyaudio.PyAudio()
self.stream = p.open(format=pyaudio.paInt16,
channels=1,
rate = 128000,
input=True,
frames_per_buffer=64000)
self.stream.start_stream()
self.Voise
for text in self.CamVoise():
Voice_tag(text)
print(" ")
def Voise(self):
print('F on')
while True:
data = self.stream.read(32000, exception_on_overflow=False)
if (self.rec.AcceptWaveform(data)) and (len(data)>0):
out = json.loads(self.rec.Result())
if out['text']:
yield out['text']
VoiseRecord = VoiseRecorder()
Can I somehow transfer the sound from the rtsp stream to the processing using this method ?
Related
I'm trying to build an 'old radio' with raspberry pi 4 and python 3. I have a button which records my sound as long as I'm holding the button. When I release it, it replays the recorded sound and also plays an other wav file as background noise. My problem is that my voice is too clear and doesn't sound like an 'old radio' with noises and crackling.
I have tried to change CHUNK and nothing happened. Changing the RATE only makes my sound deeper/higher.
I'm using an USB converter for the microphone and the headset aswell.
Basically, I want to add 'noise' or any effect to my voice that makes it sound like it's from an old radio.
import time
import pyaudio
import wave
import sched
from pygame import mixer
import audiosegment
import numpy as np
from gpiozero import Button
CHUNK = 8192
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 45000
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"
button = Button(2)
p = pyaudio.PyAudio()
frames = []
def callback(in_data, frame_count, time_info, status):
frames.append(in_data)
return in_data, pyaudio.paContinue
started = False
stream = None
playing = None;
def recorder():
global started, p, stream, frames, button, playing
if button.is_pressed and not started and not playing:
# Start the recording
try:
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
stream_callback=callback)
print("Stream active:", stream.is_active())
started = True
print("start Stream")
except:
raise
elif not button.is_pressed and started:
try:
started = False
print("Stop recording")
stream.stop_stream()
stream.close()
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
mixer.init()
mixer.Channel(0).set_volume(0.1)
tada = mixer.Sound('old_radio.wav')
channel = tada.play()
s = audiosegment.from_file(WAVE_OUTPUT_FILENAME)
main = mixer.Sound(WAVE_OUTPUT_FILENAME)
channel1 = main.play()
if 'channel1' in locals() and channel1 is not None:
while channel1.get_busy():
playing = True
channel.unpause()
channel.pause()
channel.stop()
channel1.stop()
playing = False
frames = []
except:
raise
# Reschedule the recorder function in 100 ms.
task.enter(0.1, 1, recorder, ())
print("Press and hold the button to begin recording")
print("Release the button to end recording")
task = sched.scheduler(time.time, time.sleep)
task.enter(0.1, 1, recorder, ())
task.run()
import pyaudio
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
from threading import Thread
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
try:
from Queue import Queue, Full
except ImportError:
from queue import Queue, Full
###############################################
#### Initalize queue to store the recordings ##
###############################################
CHUNK = 1024
# Note: It will discard if the websocket client can't consumme fast enough
# So, increase the max size as per your choice
BUF_MAX_SIZE = CHUNK * 10
# Buffer to store audio
q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
# Create an instance of AudioSource
audio_source = AudioSource(q, True, True)
###############################################
#### Prepare Speech to Text Service ########
###############################################
# initialize speech to text service
authenticator = IAMAuthenticator('apikey')
speech_to_text = SpeechToTextV1(authenticator=authenticator)
#speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/62a2f19f-959f-4c3c-a276-27ab0e458341/v1/recognize')
speech_to_text.set_service_url('https://stream.watsonplatform.net/speech-to-text/api')
# define callback for the speech to text service
class MyRecognizeCallback(RecognizeCallback):
def __init__(self):
RecognizeCallback.__init__(self)
def on_transcription(self, transcript):
print(transcript)
def on_connected(self):
print('Connection was successful')
def on_error(self, error):
print('Error received: {}'.format(error))
def on_inactivity_timeout(self, error):
print('Inactivity timeout: {}'.format(error))
def on_listening(self):
print('Service is listening')
def on_hypothesis(self, hypothesis):
print(hypothesis)
def on_data(self, data):
print(data)
def on_close(self):
print("Connection closed")
# this function will initiate the recognize service and pass in the AudioSource
def recognize_using_weboscket(*args):
mycallback = MyRecognizeCallback()
speech_to_text.recognize_using_websocket(audio=audio_source,
content_type='audio/l16; rate=44100',
recognize_callback=mycallback,
interim_results=True)
###############################################
#### Prepare the for recording using Pyaudio ##
###############################################
# Variables for recording the speech
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
# define callback for pyaudio to store the recording in queue
def pyaudio_callback(in_data, frame_count, time_info, status):
try:
q.put(in_data)
except Full:
pass # discard
return (None, pyaudio.paContinue)
# instantiate pyaudio
audio = pyaudio.PyAudio()
# open stream using callback
stream = audio.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
stream_callback=pyaudio_callback,
start=False
)
#########################################################################
#### Start the recording and start service to recognize the stream ######
#########################################################################
print("Enter CTRL+C to end recording...")
stream.start_stream()
try:
recognize_thread = Thread(target=recognize_using_weboscket, args=())
recognize_thread.start()
while True:
pass
except KeyboardInterrupt:
# stop recording
stream.stop_stream()
stream.close()
audio.terminate()
audio_source.completed_recording()
This is the code for IBM's Speech-To-Text service using a mic as input. May I know what the output of the program is? This is the output im getting:
Enter CTRL+C to end recording...
Connection was successful
Service is listening
File "C:\Users\---\AppData\Local\Programs\Python\Python38-32\lib\site-packages\websocket\_app.py", line 320, in _callback
callback(self, *args)
File "C:\Users\---\AppData\Local\Programs\Python\Python38-32\lib\site-packages\ibm_watson\websocket\recognize_listener.py", line 199, in on_data
hypothesis = json_object['results'][0]['alternatives'][0][
Connection closed
It suddenly works when I tested with my wireless headset mic. Not sure why though as both devices are functioning well. The output is the transcript in the console.
This is happening to me too and I think the cause of your problem is the audio that you sent to the websocket was probably difficult to recognize, so the websocket's response was none / null, and when the hypothesis function tries to get the answer this the error occurs because the result does not exist.
The output on hypotesis function (def hypotesis ) will be a string with the result of transcript audio file and on data function (def data) will be a json like that:
{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'hello ', 'confidence': 0.66}], 'keywords_result': {}}]}
I'm trying to modify my first Python program. I'm trying to use this repository to do some rudimentary text-to-speech. It does fine, but I want to improve it.
From the looks of it, there is a 0.145 second delay between samples played. Not all the samples of my voice will be 0.145 seconds, however, and I want to have each sample play one after the other with no delays or skips.
import re
import wave
import pyaudio
import _thread
import time
class TextToSpeech:
CHUNK = 1024
def __init__(self, words_pron_dict:str = 'cmudict-0.7b.txt'):
self._l = {}
self._load_words(words_pron_dict)
def _load_words(self, words_pron_dict:str):
with open(words_pron_dict, 'r') as file:
for line in file:
if not line.startswith(';;;'):
key, val = line.split(' ',2)
self._l[key] = re.findall(r"[A-Z]+",val)
def get_pronunciation(self, str_input):
list_pron = []
for word in re.findall(r"[\w']+",str_input.upper()):
if word in self._l:
list_pron += self._l[word]
print(list_pron)
delay = 0.0
for pron in list_pron:
_thread.start_new_thread( TextToSpeech._play_audio, (pron,delay,))
delay += 0.145
def _play_audio(sound, delay):
try:
time.sleep(delay)
wf = wave.open("sounds/"+sound+".wav", 'rb')
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True)
data = wf.readframes(TextToSpeech.CHUNK)
while data:
stream.write(data)
data = wf.readframes(TextToSpeech.CHUNK)
stream.stop_stream()
stream.close()
p.terminate()
except:
pass
if __name__ == '__main__':
tts = TextToSpeech()
while True:
tts.get_pronunciation(input('Enter a word or phrase: '))
I've tried getting rid of the threading and delay, but there is some delay still between samples. I'm thinking that I should, instead of incrementing delay by 0.145, increment it by the length of the sample in seconds, but I've looked at the pyaudio documentation, and I have no idea how to do that.
Can someone help?
Here is a modified code that plays wav files continuously.
import re
import wave
import pyaudio
class TextToSpeech:
CHUNK = 1024
def __init__(self, words_pron_dict='cmudict-0.7b.txt'):
self._l = {}
self._load_words(words_pron_dict)
def _load_words(self, words_pron_dict: str):
with open(words_pron_dict, 'r') as file:
for line in file:
if not line.startswith(';;;'):
key, val = line.split(' ', 2)
self._l[key] = re.findall(r"[A-Z]+", val)
def get_pronunciation(self, str_input):
list_pron = []
for word in re.findall(r"[\w']+", str_input.upper()):
if word in self._l:
list_pron += self._l[word]
print(list_pron)
# pyaudio set up.
# This open method assume all wave files have the same format.
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(2),
channels=2,
rate=44100,
output=True,
frames_per_buffer=self.CHUNK)
# play each wav file contineuously
for pron in list_pron:
with wave.open("sounds/"+pron+".wav", 'rb') as wf:
data = wf.readframes(TextToSpeech.CHUNK)
while data:
stream.write(data)
data = wf.readframes(TextToSpeech.CHUNK)
stream.stop_stream()
stream.close()
p.terminate()
if __name__ == '__main__':
tts = TextToSpeech()
while True:
tts.get_pronunciation(input('Enter a word or phrase: '))
I'm trying to create a small Python program that would let me get text in real time using my mic from the Watson server similar to how it works here.
This is the code I have came up with but it gets the text after I finish recording:
import pyaudio
import json
from watson_developer_cloud import SpeechToTextV1
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 10
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
data = stream.read(CHUNK)
frames.append(data)
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
data_feed = b''.join(frames)
speech_to_text = SpeechToTextV1(
username='secret',
password='secret too',
x_watson_learning_opt_out=False
)
result = speech_to_text.recognize(data_feed,
content_type="audio/l16;rate=44100;channels=2",
word_confidence=True,
max_alternatives=4,
word_alternatives_threshold=0.5,
model="en-US_BroadbandModel",
continuous=True)
j = json.dumps(result, indent=2)
print(j)
I went ahead and created a program from scratch to connect to the Watson server using websockets. It still isn't doing exactly what I expect but it is very close.
The audio is being sent to the server in real time but I am getting the transcript after the recording finishes.
import asyncio
import websockets
import json
import requests
import pyaudio
import time
# Variables to use for recording audio
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 16000
p = pyaudio.PyAudio()
# This is the language model to use to transcribe the audio
model = "en-US_BroadbandModel"
# These are the urls we will be using to communicate with Watson
default_url = "https://stream.watsonplatform.net/speech-to-text/api"
token_url = "https://stream.watsonplatform.net/authorization/api/v1/token?" \
"url=https://stream.watsonplatform.net/speech-to-text/api"
url = "wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize?model=en-US_BroadbandModel"
# BlueMix app credentials
username = "" # Your Bluemix App username
password = "" # Your Bluemix App password
# Send a request to get an authorization key
r = requests.get(token_url, auth=(username, password))
auth_token = r.text
token_header = {"X-Watson-Authorization-Token": auth_token}
# Params to use for Watson API
params = {
"word_confidence": True,
"content_type": "audio/l16;rate=16000;channels=2",
"action": "start",
"interim_results": True
}
# Opens the stream to start recording from the default microphone
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
output=True,
frames_per_buffer=CHUNK)
async def send_audio(ws):
# Starts recording of microphone
print("* READY *")
start = time.time()
while True:
try:
print(".")
data = stream.read(CHUNK)
await ws.send(data)
if time.time() - start > 20: # Records for n seconds
await ws.send(json.dumps({'action': 'stop'}))
return False
except Exception as e:
print(e)
return False
# Stop the stream and terminate the recording
stream.stop_stream()
stream.close()
p.terminate()
async def speech_to_text():
async with websockets.connect(url, extra_headers=token_header) as conn:
# Send request to watson and waits for the listening response
send = await conn.send(json.dumps(params))
rec = await conn.recv()
print(rec)
asyncio.ensure_future(send_audio(conn))
# Keeps receiving transcript until we have the final transcript
while True:
try:
rec = await conn.recv()
parsed = json.loads(rec)
transcript = parsed["results"][0]["alternatives"][0]["transcript"]
print(transcript)
#print(parsed)
if "results" in parsed:
if len(parsed["results"]) > 0:
if "final" in parsed["results"][0]:
if parsed["results"][0]["final"]:
#conn.close()
#return False
pass
except KeyError:
conn.close()
return False
# Starts the application loop
loop = asyncio.get_event_loop()
loop.run_until_complete(speech_to_text())
loop.close()
So all I want now is to get the transcript while I am recording through the microphone.
With PyAudio it is fairly simple to open an input stream comming from a physical sound input device.
I wonder if there is a way of open an existing sound file as a stream, which has the same properties as the sound-device-stream?
Instead of
self.p = pyaudio.PyAudio()
self.inStream = self.p.open(format=pyaudio.paInt16,
channels=1,
rate=self.RATE,
input=True,
frames_per_buffer=self.BUFFERSIZE)
I'd like to do
self.p = pyaudio.PyAudio()
self.inStream = self.p.open('testfile.wav',
input=True,
frames_per_buffer=self.BUFFERSIZE)
Regards,
Torsten
As far as I know you can't use PyAudio to open file directly, however it's possible to combine pyaudio and wave modules in case your app reroutes the processed sound to SoundFlower for example.
import pyaudio, wave
def open_stream(self):
self.p_audio = pyaudio.PyAudio()
self.file_source = wave.open('input.wav', 'rb')
s_width = self.file_source.getsampwidth()
format = self.p_audio.get_format_from_width(s_width)
channels = self.file_source.getnchannels()
rate = self.file_source.getframerate()
self.stream = self.p_audio.open(format=format,
channels=channels,
rate=rate,
frames_per_buffer=1024,
input=False,
output=True,
output_device_index=detect_build_in_output_device_idx(self.p_audio),
stream_callback=self.__recording_callback)
def __recording_callback(self, in_data, frame_count, time_info, status):
# read frames
in_data = self.file_source.readframes(frame_count)
# decode frames
left, right = audio_decode(in_data, self.file_source.getnchannels())
# process frames
left, right = self.core.process(left, right)
# encode back
processed_data = audio_encode((left, right))
return processed_data, pyaudio.paContinue