Continuous Real Time Speech to Text with Watson for Python

Continuous Real Time Speech to Text with Watson for Python - python

I'm trying to create a small Python program that would let me get text in real time using my mic from the Watson server similar to how it works here.
This is the code I have came up with but it gets the text after I finish recording:
import pyaudio
import json
from watson_developer_cloud import SpeechToTextV1
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 10
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
data = stream.read(CHUNK)
frames.append(data)
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
data_feed = b''.join(frames)
speech_to_text = SpeechToTextV1(
username='secret',
password='secret too',
x_watson_learning_opt_out=False
)
result = speech_to_text.recognize(data_feed,
content_type="audio/l16;rate=44100;channels=2",
word_confidence=True,
max_alternatives=4,
word_alternatives_threshold=0.5,
model="en-US_BroadbandModel",
continuous=True)
j = json.dumps(result, indent=2)
print(j)

I went ahead and created a program from scratch to connect to the Watson server using websockets. It still isn't doing exactly what I expect but it is very close.
The audio is being sent to the server in real time but I am getting the transcript after the recording finishes.
import asyncio
import websockets
import json
import requests
import pyaudio
import time
# Variables to use for recording audio
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 16000
p = pyaudio.PyAudio()
# This is the language model to use to transcribe the audio
model = "en-US_BroadbandModel"
# These are the urls we will be using to communicate with Watson
default_url = "https://stream.watsonplatform.net/speech-to-text/api"
token_url = "https://stream.watsonplatform.net/authorization/api/v1/token?" \
"url=https://stream.watsonplatform.net/speech-to-text/api"
url = "wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize?model=en-US_BroadbandModel"
# BlueMix app credentials
username = "" # Your Bluemix App username
password = "" # Your Bluemix App password
# Send a request to get an authorization key
r = requests.get(token_url, auth=(username, password))
auth_token = r.text
token_header = {"X-Watson-Authorization-Token": auth_token}
# Params to use for Watson API
params = {
"word_confidence": True,
"content_type": "audio/l16;rate=16000;channels=2",
"action": "start",
"interim_results": True
}
# Opens the stream to start recording from the default microphone
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
output=True,
frames_per_buffer=CHUNK)
async def send_audio(ws):
# Starts recording of microphone
print("* READY *")
start = time.time()
while True:
try:
print(".")
data = stream.read(CHUNK)
await ws.send(data)
if time.time() - start > 20: # Records for n seconds
await ws.send(json.dumps({'action': 'stop'}))
return False
except Exception as e:
print(e)
return False
# Stop the stream and terminate the recording
stream.stop_stream()
stream.close()
p.terminate()
async def speech_to_text():
async with websockets.connect(url, extra_headers=token_header) as conn:
# Send request to watson and waits for the listening response
send = await conn.send(json.dumps(params))
rec = await conn.recv()
print(rec)
asyncio.ensure_future(send_audio(conn))
# Keeps receiving transcript until we have the final transcript
while True:
try:
rec = await conn.recv()
parsed = json.loads(rec)
transcript = parsed["results"][0]["alternatives"][0]["transcript"]
print(transcript)
#print(parsed)
if "results" in parsed:
if len(parsed["results"]) > 0:
if "final" in parsed["results"][0]:
if parsed["results"][0]["final"]:
#conn.close()
#return False
pass
except KeyError:
conn.close()
return False
# Starts the application loop
loop = asyncio.get_event_loop()
loop.run_until_complete(speech_to_text())
loop.close()
So all I want now is to get the transcript while I am recording through the microphone.

Related

Decrease audio quality

I'm trying to build an 'old radio' with raspberry pi 4 and python 3. I have a button which records my sound as long as I'm holding the button. When I release it, it replays the recorded sound and also plays an other wav file as background noise. My problem is that my voice is too clear and doesn't sound like an 'old radio' with noises and crackling.
I have tried to change CHUNK and nothing happened. Changing the RATE only makes my sound deeper/higher.
I'm using an USB converter for the microphone and the headset aswell.
Basically, I want to add 'noise' or any effect to my voice that makes it sound like it's from an old radio.
import time
import pyaudio
import wave
import sched
from pygame import mixer
import audiosegment
import numpy as np
from gpiozero import Button
CHUNK = 8192
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 45000
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"
button = Button(2)
p = pyaudio.PyAudio()
frames = []
def callback(in_data, frame_count, time_info, status):
frames.append(in_data)
return in_data, pyaudio.paContinue
started = False
stream = None
playing = None;
def recorder():
global started, p, stream, frames, button, playing
if button.is_pressed and not started and not playing:
# Start the recording
try:
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
stream_callback=callback)
print("Stream active:", stream.is_active())
started = True
print("start Stream")
except:
raise
elif not button.is_pressed and started:
try:
started = False
print("Stop recording")
stream.stop_stream()
stream.close()
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
mixer.init()
mixer.Channel(0).set_volume(0.1)
tada = mixer.Sound('old_radio.wav')
channel = tada.play()
s = audiosegment.from_file(WAVE_OUTPUT_FILENAME)
main = mixer.Sound(WAVE_OUTPUT_FILENAME)
channel1 = main.play()
if 'channel1' in locals() and channel1 is not None:
while channel1.get_busy():
playing = True
channel.unpause()
channel.pause()
channel.stop()
channel1.stop()
playing = False
frames = []
except:
raise
# Reschedule the recorder function in 100 ms.
task.enter(0.1, 1, recorder, ())
print("Press and hold the button to begin recording")
print("Release the button to end recording")
task = sched.scheduler(time.time, time.sleep)
task.enter(0.1, 1, recorder, ())
task.run()

Output of IBM Speech-To-Text

import pyaudio
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
from threading import Thread
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
try:
from Queue import Queue, Full
except ImportError:
from queue import Queue, Full
###############################################
#### Initalize queue to store the recordings ##
###############################################
CHUNK = 1024
# Note: It will discard if the websocket client can't consumme fast enough
# So, increase the max size as per your choice
BUF_MAX_SIZE = CHUNK * 10
# Buffer to store audio
q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
# Create an instance of AudioSource
audio_source = AudioSource(q, True, True)
###############################################
#### Prepare Speech to Text Service ########
###############################################
# initialize speech to text service
authenticator = IAMAuthenticator('apikey')
speech_to_text = SpeechToTextV1(authenticator=authenticator)
#speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/62a2f19f-959f-4c3c-a276-27ab0e458341/v1/recognize')
speech_to_text.set_service_url('https://stream.watsonplatform.net/speech-to-text/api')
# define callback for the speech to text service
class MyRecognizeCallback(RecognizeCallback):
def __init__(self):
RecognizeCallback.__init__(self)
def on_transcription(self, transcript):
print(transcript)
def on_connected(self):
print('Connection was successful')
def on_error(self, error):
print('Error received: {}'.format(error))
def on_inactivity_timeout(self, error):
print('Inactivity timeout: {}'.format(error))
def on_listening(self):
print('Service is listening')
def on_hypothesis(self, hypothesis):
print(hypothesis)
def on_data(self, data):
print(data)
def on_close(self):
print("Connection closed")
# this function will initiate the recognize service and pass in the AudioSource
def recognize_using_weboscket(*args):
mycallback = MyRecognizeCallback()
speech_to_text.recognize_using_websocket(audio=audio_source,
content_type='audio/l16; rate=44100',
recognize_callback=mycallback,
interim_results=True)
###############################################
#### Prepare the for recording using Pyaudio ##
###############################################
# Variables for recording the speech
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
# define callback for pyaudio to store the recording in queue
def pyaudio_callback(in_data, frame_count, time_info, status):
try:
q.put(in_data)
except Full:
pass # discard
return (None, pyaudio.paContinue)
# instantiate pyaudio
audio = pyaudio.PyAudio()
# open stream using callback
stream = audio.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
stream_callback=pyaudio_callback,
start=False
)
#########################################################################
#### Start the recording and start service to recognize the stream ######
#########################################################################
print("Enter CTRL+C to end recording...")
stream.start_stream()
try:
recognize_thread = Thread(target=recognize_using_weboscket, args=())
recognize_thread.start()
while True:
pass
except KeyboardInterrupt:
# stop recording
stream.stop_stream()
stream.close()
audio.terminate()
audio_source.completed_recording()
This is the code for IBM's Speech-To-Text service using a mic as input. May I know what the output of the program is? This is the output im getting:
Enter CTRL+C to end recording...
Connection was successful
Service is listening
File "C:\Users\---\AppData\Local\Programs\Python\Python38-32\lib\site-packages\websocket\_app.py", line 320, in _callback
callback(self, *args)
File "C:\Users\---\AppData\Local\Programs\Python\Python38-32\lib\site-packages\ibm_watson\websocket\recognize_listener.py", line 199, in on_data
hypothesis = json_object['results'][0]['alternatives'][0][
Connection closed

It suddenly works when I tested with my wireless headset mic. Not sure why though as both devices are functioning well. The output is the transcript in the console.

This is happening to me too and I think the cause of your problem is the audio that you sent to the websocket was probably difficult to recognize, so the websocket's response was none / null, and when the hypothesis function tries to get the answer this the error occurs because the result does not exist.
The output on hypotesis function (def hypotesis ) will be a string with the result of transcript audio file and on data function (def data) will be a json like that:
{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'hello ', 'confidence': 0.66}], 'keywords_result': {}}]}

voice recording using pyaudio

i am trying to record voice using python.
i tried to use the pyaudio module it saved a wav file on my computer but recorded a static voice.
any suggestions?
import pyaudio
import wave
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "voice.wav"
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
data = stream.read(CHUNK)
frames.append(data)
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()

First, make sure your microphone is actually connected, on and not muted.
You are not providing a device index when opening the stream. This means that you will get the device that PyAudio considers the default. Which might not be your microphone.
Use the get_device_count and get_device_info_by_index methods of the PyAudio object in an interactive Python session. Print the dictionaries that get_device_info_by_index returns to determine which device index represents your microphone, and provide that index number as the input_device_index parameter when opening the stream.

below code will list indexes of the available recording devices,then user can give a specific index as the input,code will start recording via given recording device index.
import pyaudio
import wave
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 512
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "recordedFile.wav"
device_index = 2
audio = pyaudio.PyAudio()
print("----------------------record device list---------------------")
info = audio.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
for i in range(0, numdevices):
if (audio.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
print("Input Device id ", i, " - ", audio.get_device_info_by_host_api_device_index(0, i).get('name'))
print("-------------------------------------------------------------")
index = int(input())
print("recording via index "+str(index))
stream = audio.open(format=FORMAT, channels=CHANNELS,
rate=RATE, input=True,input_device_index = index,
frames_per_buffer=CHUNK)
print ("recording started")
Recordframes = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
data = stream.read(CHUNK)
Recordframes.append(data)
print ("recording stopped")
stream.stop_stream()
stream.close()
audio.terminate()
waveFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
waveFile.setnchannels(CHANNELS)
waveFile.setsampwidth(audio.get_sample_size(FORMAT))
waveFile.setframerate(RATE)
waveFile.writeframes(b''.join(Recordframes))
waveFile.close()

Your code works in my environment: Win7 and Python3.4 - I get my voice recorded using my Laptops's microphone.
Maybe your microphone's recording level is set too low. Or is it muted or disabled?

Make sure your Microphone is connected to the computer. It can be identified using below code.
import speech_recognition as sr
for index, name in enumerate(sr.Microphone.list_microphone_names()):
print("Microphone with name \"{1}\" found for microphone(device_index{0})".format(index, name))

Tkinter timer while recording

Same as in topic, i want to create timer while the recording function would run. I have tried a lot of ideas but all of them resulted as an error or just broke up program, maybe you have some ideas?
class rec(object):
def __init__(self):
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
CHUNK = 1024
WAVE_OUTPUT_FILENAME = str(e2.get() + '.wav')
try:
RECORD_SECONDS = int(e1.get())
except ValueError:
tkMessageBox.showinfo("Error", "Please Enter number of seconds")
i = 0
while os.path.exists(WAVE_OUTPUT_FILENAME):
WAVE_OUTPUT_FILENAME = str(e2.get() + '%d.wav'%i)
i += 1
audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT, channels=CHANNELS,
rate=RATE, input=True,
frames_per_buffer=CHUNK)
frames = []
print "recording...\n\n"
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
sys.stdout.write('\r%i' % i)
data = stream.read(CHUNK)
frames.append(data)
print "\nfinished recording\n"
# stop Recording
stream.stop_stream()
stream.close()
audio.terminate()
waveFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
waveFile.setnchannels(CHANNELS)
waveFile.setsampwidth(audio.get_sample_size(FORMAT))
waveFile.setframerate(RATE)
waveFile.writeframes(b''.join(frames))
# waveFile.Wave_read.getnframes()
waveFile.close()
tkMessageBox.showinfo("Recorded", "Track:%s"%WAVE_OUTPUT_FILENAME)
I tried to make other function with timer and initiate that function, when i press button, but if i pressed button firstly program ran rec function, and after gone timer function, also i tried to initiate those function by pipe but it does the same, do i have to make that with threads?

Python Voice Communication

Hello I am trying to figure out some code which is suppose to send voice over the network. I am having problems with the audio it sends but its just a series of loud beeps and not the audio I input
After the beeps are finished I get an EOFError
I have spent the last 48 hours trying to figure this out any ideas are greatly appreciated
The relevant code
import pyaudio
import speex
import sys
chunk = 320
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 5
### Server function ###
def server():
### Initialize socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((HOST, PORT))
s.listen(5)
### Start recieve loop
while True:
...
elif cmd == CMD_AUDIO:
d = speex.Decoder()
d.initialize(speex.SPEEX_MODEID_WB)
p = pyaudio.PyAudio()
stream = p.open(format = FORMAT,
channels = CHANNELS,
rate = RATE,
input = True,
output = True,
frames_per_buffer = chunk)
#voice = cPickle.loads(decrypt_my_message(msg))
voice = cPickle.loads(msg)
print voice
for i in range(len(voice)):
decdata = d.decode(voice[i])#DECODE my data. (YaY)#DECODE my data. (YaY)
stream.write(str(voice), chunk) #Write the data back out to the speakers
stream.stop_stream()
stream.close()
p.terminate()
d.destroy()
if not msg: break
conn.close()
### READ DATA FROM THE MIC ###
def sendAudio():
chunklist = []
init_my_audio = speex.Encoder()
init_my_audio.initialize(speex.SPEEX_MODEID_WB)
p = pyaudio.PyAudio()
stream = p.open(format = FORMAT,
channels = CHANNELS,
rate = RATE,
input = True,
output = True,
frames_per_buffer = chunk)
for i in range(0, 44100 / chunk * RECORD_SECONDS):
try:
data = stream.read(chunk)
except IOError:
pass
encdata = init_my_audio.encode(data)
chunklist.append(encdata)
client(chr(CMD_AUDIO), cPickle.dumps((chunklist), 1))
stream.stop_stream()
stream.close()
p.terminate()
init_my_audio.destroy()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Continuous Real Time Speech to Text with Watson for Python - python

Related

Decrease audio quality

Output of IBM Speech-To-Text

voice recording using pyaudio

Tkinter timer while recording

Python Voice Communication

Categories

Resources