import pyaudio
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
from threading import Thread
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
try:
    from Queue import Queue, Full  # Python 2
except ImportError:
    from queue import Queue, Full  # Python 3
###############################################
#### Initialize queue to store the recordings #
###############################################
CHUNK = 1024
# Note: chunks will be discarded if the websocket client can't consume
# them fast enough, so increase the max size as needed
BUF_MAX_SIZE = CHUNK * 10
# Buffer to store audio
q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
# Create an instance of AudioSource
audio_source = AudioSource(q, True, True)
###############################################
#### Prepare Speech to Text Service ########
###############################################
# initialize speech to text service
authenticator = IAMAuthenticator('apikey')
speech_to_text = SpeechToTextV1(authenticator=authenticator)
#speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/62a2f19f-959f-4c3c-a276-27ab0e458341/v1/recognize')
speech_to_text.set_service_url('https://stream.watsonplatform.net/speech-to-text/api')
# define callback for the speech to text service
class MyRecognizeCallback(RecognizeCallback):
    def __init__(self):
        RecognizeCallback.__init__(self)

    def on_transcription(self, transcript):
        print(transcript)

    def on_connected(self):
        print('Connection was successful')

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))

    def on_listening(self):
        print('Service is listening')

    def on_hypothesis(self, hypothesis):
        print(hypothesis)

    def on_data(self, data):
        print(data)

    def on_close(self):
        print("Connection closed")
# this function initiates the recognize service and passes in the AudioSource
def recognize_using_websocket(*args):
    mycallback = MyRecognizeCallback()
    speech_to_text.recognize_using_websocket(audio=audio_source,
                                             content_type='audio/l16; rate=44100',
                                             recognize_callback=mycallback,
                                             interim_results=True)
###############################################
#### Prepare for recording using PyAudio #####
###############################################
# Variables for recording the speech
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
# define callback for pyaudio to store the recording in the queue
def pyaudio_callback(in_data, frame_count, time_info, status):
    try:
        # non-blocking put: a full queue raises Full instead of stalling the callback
        q.put(in_data, block=False)
    except Full:
        pass  # discard
    return (None, pyaudio.paContinue)
# instantiate pyaudio
audio = pyaudio.PyAudio()
# open stream using callback
stream = audio.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
stream_callback=pyaudio_callback,
start=False
)
#########################################################################
#### Start the recording and start service to recognize the stream ######
#########################################################################
print("Enter CTRL+C to end recording...")
stream.start_stream()
try:
recognize_thread = Thread(target=recognize_using_weboscket, args=())
recognize_thread.start()
while True:
pass
except KeyboardInterrupt:
# stop recording
stream.stop_stream()
stream.close()
audio.terminate()
audio_source.completed_recording()
This is the code for IBM's Speech to Text service using a mic as input. May I know what the output of the program should be? This is the output I'm getting:
Enter CTRL+C to end recording...
Connection was successful
Service is listening
File "C:\Users\---\AppData\Local\Programs\Python\Python38-32\lib\site-packages\websocket\_app.py", line 320, in _callback
callback(self, *args)
File "C:\Users\---\AppData\Local\Programs\Python\Python38-32\lib\site-packages\ibm_watson\websocket\recognize_listener.py", line 199, in on_data
hypothesis = json_object['results'][0]['alternatives'][0][
Connection closed
It suddenly worked when I tested with my wireless headset mic. Not sure why, though, as both devices are functioning well. The output is the transcript in the console.
This is happening to me too, and I think the cause is that the audio you sent to the websocket was probably difficult to recognize, so the websocket's response was none/null, and the error occurs when the hypothesis handler tries to read a result that does not exist.
The output of the hypothesis handler (on_hypothesis) will be a string with the transcript of the audio, and the data handler (on_data) will receive a JSON object like this:
{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'hello ', 'confidence': 0.66}], 'keywords_result': {}}]}
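If you index into that JSON in your own callback, a defensive version of on_data (a sketch, assuming only the response shape shown above and the imports from the question) avoids the same IndexError in your own code. Note that the traceback in the question originates inside the SDK's own listener, so upgrading the ibm-watson package may also be needed:

class SafeRecognizeCallback(RecognizeCallback):
    def on_data(self, data):
        # Guard against missing/empty results before indexing into them
        results = data.get('results') or []
        if results and results[0].get('alternatives'):
            print(results[0]['alternatives'][0]['transcript'])

    def on_error(self, error):
        print('Error received: {}'.format(error))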
Related
import sys
import sounddevice as sd
import numpy as np
import speech_recognition as sr
# Define the sample rate and chunk size
sample_rate = 44100
chunk_size = 2048
# Initialize recognizer class (for recognizing the speech)
r = sr.Recognizer()
# Define the callback function for the sounddevice stream
def audio_callback(indata, frames, time, status):
    if status:
        print(status, file=sys.stderr)
    audio_data = np.frombuffer(indata, dtype=np.int16)
    try:
        text = r.recognize_google(sr.AudioData(audio_data, sample_rate, sample_width=2))
        print("Recognized text: ", text)
    except Exception as e:
        print("Error: ", e)
# Start the stream
stream = sd.InputStream(callback=audio_callback, channels=1, blocksize=chunk_size,
                        samplerate=sample_rate)
with stream:
    while True:
        pass
    audio_data = stream.read(chunk_size, Exception=False)
Here the error is an input overflow. What should I do?
I tried to reduce the flow of data but was unsuccessful. I want the code to take input in proper quantity and deliver the output. I am a beginner at Python, so any help would be great.
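A common way out (a sketch, under the assumption that the overflow comes from doing blocking recognition inside the audio callback): copy raw chunks into a queue in the callback and run recognize_google on a separate consumer thread, so the callback always returns quickly. RawInputStream is used here so the callback receives raw bytes directly:

import queue, threading
import sounddevice as sd
import speech_recognition as sr

sample_rate = 44100
chunk_size = 2048
q = queue.Queue()
r = sr.Recognizer()

def audio_callback(indata, frames, time, status):
    # Only copy the buffer here; the callback must return quickly
    q.put(bytes(indata))

def consumer():
    while True:
        # Accumulate roughly 3 seconds of audio before each request
        chunks = [q.get() for _ in range(int(3 * sample_rate / chunk_size))]
        audio = sr.AudioData(b''.join(chunks), sample_rate, 2)
        try:
            print("Recognized text:", r.recognize_google(audio))
        except sr.UnknownValueError:
            pass  # nothing recognizable in this window

threading.Thread(target=consumer, daemon=True).start()
with sd.RawInputStream(callback=audio_callback, channels=1,
                       blocksize=chunk_size, samplerate=sample_rate,
                       dtype='int16'):
    threading.Event().wait()  # block the main thread forever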
I have a function that connects to my microphone and recognizes speech, returning each phrase as a string.
It looks like this:
import json, pyaudio
from Analysis import Voice_tag, nlp
from vosk import Model, KaldiRecognizer
class VoiseRecorder():
    def __init__(self):
        model = Model("model1")
        self.rec = KaldiRecognizer(model, 128000)
        p = pyaudio.PyAudio()
        self.stream = p.open(format=pyaudio.paInt16,
                             channels=1,
                             rate=128000,
                             input=True,
                             frames_per_buffer=64000)
        self.stream.start_stream()
        for text in self.Voise():
            Voice_tag(text)
            print(" ")

    def Voise(self):
        print('F on')
        while True:
            data = self.stream.read(32000, exception_on_overflow=False)
            if self.rec.AcceptWaveform(data) and len(data) > 0:
                out = json.loads(self.rec.Result())
                if out['text']:
                    yield out['text']

VoiseRecord = VoiseRecorder()
Can I somehow transfer the sound from the rtsp stream to the processing using this method ?
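In principle, yes: KaldiRecognizer only needs raw PCM bytes, so you can feed it audio demuxed from an RTSP stream instead of a microphone. A minimal sketch using an ffmpeg subprocess (this assumes ffmpeg is installed and the rtsp:// URL is a placeholder; the recognizer's sample rate must match the -ar value given to ffmpeg):

import json
import subprocess
from vosk import Model, KaldiRecognizer

SAMPLE_RATE = 16000
model = Model("model1")
rec = KaldiRecognizer(model, SAMPLE_RATE)

# Ask ffmpeg to decode the RTSP stream to mono 16-bit PCM on stdout
proc = subprocess.Popen(
    ["ffmpeg", "-loglevel", "quiet", "-i", "rtsp://example/stream",
     "-ar", str(SAMPLE_RATE), "-ac", "1", "-f", "s16le", "-"],
    stdout=subprocess.PIPE)

while True:
    data = proc.stdout.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        out = json.loads(rec.Result())
        if out.get('text'):
            print(out['text'])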
I am new to Google APIs and web services. I have only tried the Google Translate API once, and that one works fine. Now I want to use the Google Media Translation API to translate voice input. I followed the tutorial at https://cloud.google.com/translate/media/docs/streaming.
However, I cannot make it work. There is no error at run time, so I don't know where to look. Could you please help me identify the problem?
# [START media_translation_translate_from_mic]
from __future__ import division
import itertools
from google.cloud import mediatranslation as media
import pyaudio
from six.moves import queue
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/Users/Me/GoogleMT/TranslationAPI/MediaKey.json"
# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10) # 100ms
SpeechEventType = media.StreamingTranslateSpeechResponse.SpeechEventType
class MicrophoneStream:
    """Opens a recording stream as a generator yielding the audio chunks."""
    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type=None, value=None, traceback=None):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """Continuously collect data from the audio stream, into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def exit(self):
        self.__exit__()

    def generator(self):
        while not self.closed:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b''.join(data)


def listen_print_loop(responses):
    """Iterates through server responses and prints them.

    The responses passed is a generator that will block until a response
    is provided by the server.
    """
    translation = ''
    source = ''
    for response in responses:
        # Once the transcription settles, the response contains the
        # END_OF_SINGLE_UTTERANCE event.
        if (response.speech_event_type ==
                SpeechEventType.END_OF_SINGLE_UTTERANCE):
            print(u'\nFinal translation: {0}'.format(translation))
            print(u'Final recognition result: {0}'.format(source))
            return 0

        result = response.result
        translation = result.text_translation_result.translation
        source = result.recognition_result

        print(u'\nPartial translation: {0}'.format(translation))
        print(u'Partial recognition result: {0}'.format(source))


def do_translation_loop():
    print('Begin speaking...')

    client = media.SpeechTranslationServiceClient()

    speech_config = media.TranslateSpeechConfig(
        audio_encoding='linear16',
        source_language_code='en-US',
        target_language_code='ja')

    config = media.StreamingTranslateSpeechConfig(
        audio_config=speech_config, single_utterance=True)

    # The first request contains the configuration.
    # Note that audio_content is explicitly set to None.
    first_request = media.StreamingTranslateSpeechRequest(
        streaming_config=config, audio_content=None)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        mic_requests = (media.StreamingTranslateSpeechRequest(
            audio_content=content,
            streaming_config=config)
            for content in audio_generator)

        requests = itertools.chain(iter([first_request]), mic_requests)

        responses = client.streaming_translate_speech(requests)

        # Print the translation responses as they arrive
        result = listen_print_loop(responses)
        if result == 0:
            stream.exit()


def main():
    while True:
        print()
        option = input('Press any key to translate or \'q\' to quit: ')

        if option.lower() == 'q':
            break

        do_translation_loop()


if __name__ == '__main__':
    main()
# [END media_translation_translate_from_mic]
The result is like this: no translation and no recognition result.
[Result screenshot]
I was not sure if the problem was with my mic, so I tried a similar example from another Google tutorial that translates an audio file. The result was the same: no recognition result and no translation.
Did I miss something?
Thank you very much.
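To rule out the microphone before suspecting the API (a sketch using only the standard library, reusing the MicrophoneStream class above), you can dump a few seconds of the generator's output to a WAV file and listen to it:

import wave

with MicrophoneStream(RATE, CHUNK) as stream, wave.open('mic_test.wav', 'wb') as wav:
    wav.setnchannels(1)
    wav.setsampwidth(2)   # 16-bit samples (paInt16)
    wav.setframerate(RATE)
    for i, chunk in enumerate(stream.generator()):
        wav.writeframes(chunk)
        if i > 30:        # roughly a few seconds of audio
            break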
I'm trying to create a small Python program that would let me get text in real time using my mic from the Watson server, similar to how it works here.
This is the code I came up with, but it gets the text only after I finish recording:
import pyaudio
import json
from watson_developer_cloud import SpeechToTextV1
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 10
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
print("* recording")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)
print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
data_feed = b''.join(frames)
speech_to_text = SpeechToTextV1(
    username='secret',
    password='secret too',
    x_watson_learning_opt_out=False
)
result = speech_to_text.recognize(data_feed,
                                  content_type="audio/l16;rate=44100;channels=2",
                                  word_confidence=True,
                                  max_alternatives=4,
                                  word_alternatives_threshold=0.5,
                                  model="en-US_BroadbandModel",
                                  continuous=True)
j = json.dumps(result, indent=2)
print(j)
I went ahead and created a program from scratch to connect to the Watson server using websockets. It still isn't doing exactly what I expect, but it is very close.
The audio is being sent to the server in real time, but I am getting the transcript only after the recording finishes.
import asyncio
import websockets
import json
import requests
import pyaudio
import time
# Variables to use for recording audio
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 16000
p = pyaudio.PyAudio()
# This is the language model to use to transcribe the audio
model = "en-US_BroadbandModel"
# These are the urls we will be using to communicate with Watson
default_url = "https://stream.watsonplatform.net/speech-to-text/api"
token_url = "https://stream.watsonplatform.net/authorization/api/v1/token?" \
            "url=https://stream.watsonplatform.net/speech-to-text/api"
url = "wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize?model=en-US_BroadbandModel"
# BlueMix app credentials
username = "" # Your Bluemix App username
password = "" # Your Bluemix App password
# Send a request to get an authorization key
r = requests.get(token_url, auth=(username, password))
auth_token = r.text
token_header = {"X-Watson-Authorization-Token": auth_token}
# Params to use for Watson API
params = {
    "word_confidence": True,
    "content_type": "audio/l16;rate=16000;channels=2",
    "action": "start",
    "interim_results": True
}
# Opens the stream to start recording from the default microphone
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                output=True,
                frames_per_buffer=CHUNK)
async def send_audio(ws):
    # Starts recording of microphone
    print("* READY *")
    start = time.time()
    while True:
        try:
            print(".")
            data = stream.read(CHUNK)
            await ws.send(data)
            if time.time() - start > 20:  # Records for n seconds
                await ws.send(json.dumps({'action': 'stop'}))
                return False
        except Exception as e:
            print(e)
            return False

    # Stop the stream and terminate the recording
    stream.stop_stream()
    stream.close()
    p.terminate()
async def speech_to_text():
    async with websockets.connect(url, extra_headers=token_header) as conn:
        # Send request to watson and wait for the listening response
        send = await conn.send(json.dumps(params))
        rec = await conn.recv()
        print(rec)
        asyncio.ensure_future(send_audio(conn))

        # Keep receiving transcripts until we have the final transcript
        while True:
            try:
                rec = await conn.recv()
                parsed = json.loads(rec)
                transcript = parsed["results"][0]["alternatives"][0]["transcript"]
                print(transcript)
                #print(parsed)
                if "results" in parsed:
                    if len(parsed["results"]) > 0:
                        if "final" in parsed["results"][0]:
                            if parsed["results"][0]["final"]:
                                #conn.close()
                                #return False
                                pass
            except KeyError:
                conn.close()
                return False
# Starts the application loop
loop = asyncio.get_event_loop()
loop.run_until_complete(speech_to_text())
loop.close()
So all I want now is to get the transcript while I am recording through the microphone.
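One thing worth checking (an assumption, not something confirmed in this thread): stream.read(CHUNK) is a blocking call inside a coroutine, so send_audio can starve the event loop and keep the receiving coroutine from running until recording ends. A sketch of send_audio that pushes the blocking read onto a worker thread with run_in_executor:

async def send_audio(ws):
    loop = asyncio.get_event_loop()
    start = time.time()
    while time.time() - start < 20:  # records for ~20 seconds
        # The blocking PyAudio read runs in a thread, so the event loop
        # stays free to receive interim transcripts concurrently
        data = await loop.run_in_executor(None, stream.read, CHUNK)
        await ws.send(data)
    await ws.send(json.dumps({'action': 'stop'}))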
I would like to send frames from piCam via websocket in base64 format.
I have the following simple code:
import websocket
import time
import picamera
import io
import base64
import StringIO
class MyClass:
    ws = ''
    picam = ''
    stream = ''

    def __init__(self):
        self.init()

    def on_message(self, ws, message):
        print "ok"
        print message

    def on_error(self, ws, error):
        print error

    def on_close(self, ws):
        print "down"
        exit()

    def on_open(self, ws):
        print "opening connection"
        ws.send("Hello.")
        self.main()

    def main(self):
        print "main"
        output = StringIO.StringIO()
        while True:
            output.seek(0)
            output.truncate()  # drop leftover bytes from a longer previous frame
            self.picam.capture(output, format="jpeg")
            encoded_string = base64.b64encode(output.getvalue())
            self.ws.send("{\"Image\":\"" + encoded_string + "\"}")
            time.sleep(0.2)

    def init(self):
        print "init"
        websocket.enableTrace(True)
        self.picam = picamera.PiCamera()
        self.picam.resolution = (640, 480)
        self.stream = io.BytesIO()
        self.picam.start_preview()
        self.ws = websocket.WebSocketApp("ws://xxxxx.",
                                         on_message=self.on_message,
                                         on_error=self.on_error,
                                         on_close=self.on_close,
                                         on_open=self.on_open)
        self.ws.run_forever()

MyClass()  # start the client
After starting, it sends one image every ~1-2 seconds.
When I store a base64 image string in a class variable once and send that repeatedly, it sends every ~0.2 seconds.
When I just capture images from the PiCam without sending them over the websocket, that is also fine: it captures every ~0.2 seconds.
I do not understand why the combination works so slowly.
Try using the use_video_port option of the capture method:
self.picam.capture(output, format="jpeg", use_video_port=True)
Still-port captures reconfigure the camera for a full-resolution still on every call, which is slow; the video port avoids that mode switch and is intended for rapid, repeated captures (at some cost in image quality).
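A possible follow-up tweak (a sketch, reusing the picam/ws objects from the question): capture_continuous keeps the encoder alive between frames, which pairs well with use_video_port for a steady capture-and-send loop:

def main(self):
    output = StringIO.StringIO()
    # Yields once per captured frame, reusing the same output buffer
    for _ in self.picam.capture_continuous(output, format="jpeg",
                                           use_video_port=True):
        encoded_string = base64.b64encode(output.getvalue())
        self.ws.send("{\"Image\":\"" + encoded_string + "\"}")
        output.seek(0)
        output.truncate()
        time.sleep(0.2)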