Azure Speech-To-Text multiple voice recognition - python

I'm trying to transcribe a conversation audio file into text with Azure's Speech to Text. I got it working using the SDK and made another attempt with the batch API (following these instructions: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/python-client/main.py), but I also want to split the resulting text by the different voices. Is that possible?
I know the conversation transcription service is available in beta, but since my audio is in Spanish, I can't use it. Is there a configuration to split the result by speaker?
This is the call with the SDK:
all_results = []

def speech_recognize_continuous_from_file(file_to_transcribe):
    """performs continuous speech recognition with input from an audio file"""
    # <SpeechContinuousRecognitionWithFile>
    speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                           region=service_region,
                                           speech_recognition_language='es-ES')
    audio_config = speechsdk.audio.AudioConfig(filename=file_to_transcribe)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    def handle_final_result(evt):
        all_results.append(evt.result.text)

    speech_recognizer.recognized.connect(handle_final_result)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    # </SpeechContinuousRecognitionWithFile>
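For reference, the helper above is driven like this once speech_key and service_region are set (the file name is just a placeholder), and the partial results can be joined afterwards:
speech_recognize_continuous_from_file('conversation.wav')
full_transcript = ' '.join(all_results)
print(full_transcript)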
And this is the call with the API:
from __future__ import print_function
from typing import List
import logging
import sys
import requests
import time
import swagger_client as cris_client

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(message)s")

SUBSCRIPTION_KEY = subscription_key
HOST_NAME = "westeurope.cris.ai"
PORT = 443
NAME = "Simple transcription"
DESCRIPTION = "Simple transcription description"
LOCALE = "es-ES"
RECORDINGS_BLOB_URI = bobl_url
# ADAPTED_ACOUSTIC_ID = None  # guid of a custom acoustic model
# ADAPTED_LANGUAGE_ID = None  # guid of a custom language model

def transcribe():
    logging.info("Starting transcription client...")

    # configure API key authorization: subscription_key
    configuration = cris_client.Configuration()
    configuration.api_key['Ocp-Apim-Subscription-Key'] = SUBSCRIPTION_KEY

    # create the client object and authenticate
    client = cris_client.ApiClient(configuration)

    # create an instance of the transcription api class
    transcription_api = cris_client.CustomSpeechTranscriptionsApi(api_client=client)

    # get all transcriptions for the subscription
    transcriptions: List[cris_client.Transcription] = transcription_api.get_transcriptions()

    logging.info("Deleting all existing completed transcriptions.")
    # delete all pre-existing completed transcriptions
    # if transcriptions are still running or not started, they will not be deleted
    for transcription in transcriptions:
        transcription_api.delete_transcription(transcription.id)

    logging.info("Creating transcriptions.")
    # transcription definition using custom models
    # transcription_definition = cris_client.TranscriptionDefinition(
    #     name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI,
    #     models=[cris_client.ModelIdentity(ADAPTED_ACOUSTIC_ID), cris_client.ModelIdentity(ADAPTED_LANGUAGE_ID)]
    # )
    # comment out the previous statement and uncomment the following to use base models for transcription
    transcription_definition = cris_client.TranscriptionDefinition(
        name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI
    )
    data, status, headers = transcription_api.create_transcription_with_http_info(transcription_definition)

    # extract transcription location from the headers
    transcription_location: str = headers["location"]

    # get the transcription Id from the location URI
    created_transcriptions = list()
    created_transcriptions.append(transcription_location.split('/')[-1])

    logging.info("Checking status.")
    completed, running, not_started = 0, 0, 0
    while completed < 1:
        # get all transcriptions for the user
        transcriptions: List[cris_client.Transcription] = transcription_api.get_transcriptions()

        # for each transcription in the list we check the status
        for transcription in transcriptions:
            if transcription.status == "Failed" or transcription.status == "Succeeded":
                # we check to see if it was one of the transcriptions we created from this client
                if transcription.id not in created_transcriptions:
                    continue
                completed += 1
                if transcription.status == "Succeeded":
                    results_uri = transcription.results_urls["channel_0"]
                    results = requests.get(results_uri)
                    logging.info("Transcription succeeded. Results: ")
                    logging.info(results.content.decode("utf-8"))
            elif transcription.status == "Running":
                running += 1
            elif transcription.status == "NotStarted":
                not_started += 1

        logging.info(f"Transcriptions status: {completed} completed, {running} running, {not_started} not started yet")
        # wait for 5 seconds
        time.sleep(5)

    input("Press any key...")

def main():
    transcribe()

if __name__ == "__main__":
    main()

I also want to split the result text by the different voices.
The transcript you receive does not contain any notion of speaker: here you are just calling a transcription endpoint, and there is no speaker recognition feature involved.
Two things:
If your audio has a separate channel for each speaker, then you already have your split (see the results_urls channels in the transcript result; a sketch of reading them follows below).
If not, you may use the Speaker Recognition API (doc here) to do this identification, but:
it needs some training first
you don't get offsets in the reply, so it will be complicated to map the identifications onto your transcript result.
As you mentioned, the Speech SDK's ConversationTranscriber API (doc here) is currently limited to the en-US and zh-CN languages.
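For the separate-channels case, here is a rough sketch of fetching each channel's result with the swagger client above (it assumes a stereo recording with one speaker per channel, so results_urls contains a channel_1 entry next to channel_0; check the keys you actually get back):
import requests

def download_channel_results(transcription):
    # Fetch the result JSON for every audio channel of a finished transcription.
    # With one speaker per channel, each entry is effectively one speaker's text.
    per_channel = {}
    for channel_name, results_uri in transcription.results_urls.items():
        response = requests.get(results_uri)
        per_channel[channel_name] = response.json()
    return per_channel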

Contrary to the previous answer, I did get a result where speakers are recognized, without any further training or other difficulties. I followed this GitHub issue:
https://github.com/Azure-Samples/cognitive-services-speech-sdk/issues/286
which led me to the following change:
transcription_definition = cris_client.TranscriptionDefinition(
    name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI,
    properties={"AddDiarization": "True"}
)
Which gives the desired result.
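If it helps, this is roughly how the downloaded result can be grouped by speaker once AddDiarization is on. The exact JSON layout can differ between service versions, so treat the AudioFileResults / SegmentResults / SpeakerId / NBest / Display keys below as assumptions to verify against your own payload:
import requests
from collections import defaultdict

def group_transcript_by_speaker(results_uri):
    # Assumed shape: AudioFileResults -> SegmentResults, where each segment
    # carries a SpeakerId (added by diarization) and an NBest list whose first
    # entry has the Display text. Verify these keys against the real JSON.
    data = requests.get(results_uri).json()
    by_speaker = defaultdict(list)
    for audio_file in data.get("AudioFileResults", []):
        for segment in audio_file.get("SegmentResults", []):
            speaker = segment.get("SpeakerId", "unknown")
            nbest = segment.get("NBest", [])
            if nbest:
                by_speaker[speaker].append(nbest[0].get("Display", ""))
    return by_speaker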

Related

How to process a response from the Twilio REST API

I am developing a program to help treat depression. I do not have a deep understanding of Twilio. I would like to collect the responses to this message:
Sent from your Twilio trial account - What are the positive results or outcomes you have achieved lately?
What are the strengths and resources you have available to you to get even more results and were likely the reason you got the results in the first question.
What are your current priorities? What do you and your team need to be focused on right now?
What are the benefits to all involved-you
your team and all other stakeholders who will be impacted by achieving your priority focus.
How can we (you and/or your team) move close? What action steps are needed?
What am I going to do today?
What am I doing tomorrow?
What did I do yesterday?
and process them as answers 1-9 (the responses will be enumerated 1-9).
I've contacted Twilio support and I read these docs: https://www.twilio.com/docs/sms/tutorials/how-to-receive-and-reply-python.
Here is what I tried:
# Download the helper library from https://www.twilio.com/docs/python/install
import os
from twilio.rest import Client
import logging
import csv
import psycopg2
from flask import Flask, request, redirect
from twilio.twiml.messaging_response import MessagingResponse

app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Set environment variables
DATABASE = os.environ["DATABASE"]
PASSWORD = os.environ["PASSWORD"]
PORT = os.environ["PORT"]
USER = os.environ["USER"]
HOST = os.environ["HOST"]

# initialization TODO: move into env vars
MY_PHONE_NUMBER = os.environ["MY_PHONE_NUMBER"]
TWILIO_PHONE_NUMBER = os.environ["TWILIO_PHONE_NUMBER"]
TWILIO_ACCOUNT_SID = os.environ["TWILIO_ACCOUNT_SID"]
TWILIO_AUTH_TOKEN = os.environ["TWILIO_AUTH_TOKEN"]

# Configure Twilio
# Set environment variables for your credentials
# Read more at http://twil.io/secure
client = Client(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
logging.debug(f"Connected to Twilio using MY_PHONE_NUMBER:{MY_PHONE_NUMBER},TWILIO_PHONE_NUMBER{TWILIO_PHONE_NUMBER}")

# Establish db connection
# use psycopg to connect to the db and create a table
conn = psycopg2.connect(
    database=DATABASE, user=USER, password=PASSWORD, host=HOST, port=PORT)
conn.autocommit = True
cursor = conn.cursor()

# Step 1: Set up frequency, i.e. times to send messages

# Step 2: Load questions
questionsFile = open('questions.csv')
questions = csv.reader(questionsFile)
logging.debug(f"message:{questions}")
message = "\n".join([question for row in questions for question in row])
logging.debug(f"message: {message}")

# Step 3: Send questions
# message = client.messages.create(
#     body=message,
#     from_=TWILIO_PHONE_NUMBER,
#     to=MY_PHONE_NUMBER
# )

# Step 4: Collect response
@app.route("/sms", methods=['GET', 'POST'])
def incoming_sms():
    """Send a dynamic reply to an incoming text message"""
    # Get the message the user sent our Twilio number
    body = request.values.get('Body', None)
    # Start our TwiML response
    resp = MessagingResponse()
    # Determine the right reply for this message
    if body == 'hello':
        resp.message("Hi!")
    elif body == 'bye':
        resp.message("Goodbye")
    return str(resp)

if __name__ == "__main__":
    app.run(debug=True)

# Step 5: Create a database table as the sheet name and save responses in db
logging.debug(f'Step 2 creating table response')
# TODO: create 10 columns for saving responses (each response contains 10 answers)
sql = f'CREATE TABLE IF NOT EXISTS public.responses'
logging.debug(f'CREATE TABLE IF NOT EXISTS public.responses')
# cursor.execute(sql)
# conn.commit()

# Next steps:
# 1. Process positive and negative sentiment from responses
# 2. Calculate total positive sentiment
# 3. Calculate total negative sentiment
# 4. Plot positive sentiment vs. negative sentiment
The documentation doesn't provide a clear path for completing step 4.
[shows response from text message.]
[messages url]
Expected:
Processed responses from the questions.
Actual:
Sent from your Twilio trial account - What are the positive results or outcomes you have achieved lately?
What are the strengths and resources you have available to you to get even more results and were likely the reason you got the results in the first question.
What are your current priorities? What do you and your team need to be focused on right now?
What are the benefits to all involved-you
your team and all other stakeholders who will be impacted by achieving your priority focus.
How can we (you and/or your team) move close? What action steps are needed?
What am I going to do today?
What am I doing tomorrow ?
What did I do yesterday?
I added a Twilio flow to test the connection to Twilio. This could go in two directions: just Python, or using a Twilio flow. The flow does not work even after adding the correct number.
messages screenshot
This comes from the demo video, which does not match the current UI: https://www.youtube.com/watch?v=VRxirse1UfQ.
There's a second code sample on the page you linked. It explains how you can parse the body of an incoming message to read the payload with request.values.get('Body', None):
@app.route("/sms", methods=['GET', 'POST'])
def incoming_sms():
    """Send a dynamic reply to an incoming text message"""
    # Get the message the user sent our Twilio number
    body = request.values.get('Body', None)
    # Start our TwiML response
    resp = MessagingResponse()
    # Determine the right reply for this message
    if body == 'hello':
        resp.message("Hi!")
    elif body == 'bye':
        resp.message("Goodbye")
    return str(resp)
However, it makes sense to ask the questions one after another, which means you would need to handle many different endpoints, and that can get complicated really fast.
But there's also a visual editor that allows you to define this within a few minutes in your browser: Build a Chatbot with Twilio Studio
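If you would rather stay in plain Python than use Studio, one common pattern is to keep a small per-sender state in the webhook and reply with the next question each time an answer comes in. A minimal sketch (the QUESTIONS list and the storage step are placeholders, and the in-memory dict resets whenever the process restarts):
from flask import Flask, request
from twilio.twiml.messaging_response import MessagingResponse

app = Flask(__name__)

QUESTIONS = ["Question 1 ...", "Question 2 ...", "Question 3 ..."]  # placeholder list
progress = {}  # maps the sender's phone number to the index of the next question

@app.route("/sms", methods=["POST"])
def incoming_sms():
    sender = request.values.get("From", "")
    body = request.values.get("Body", "")
    index = progress.get(sender, 0)
    # TODO: store (sender, index, body) in your database here.
    resp = MessagingResponse()
    if index < len(QUESTIONS):
        resp.message(QUESTIONS[index])
        progress[sender] = index + 1
    else:
        resp.message("Thanks, that was the last question.")
    return str(resp)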

grpcio shows an error while executing pushtotalk

I am connecting a Raspberry Pi to the Google AI Assistant and it gives me the following error.
The command I run:
googlesamples-assistant-pushtotalk
File "/home/pi/env/bin/googlesamples-assistant-pushtotalk", line 6, in <module>
from googlesamples.assistant.grpc.pushtotalk import main
File "/home/pi/env/lib/python3.9/site-packages/googlesamples/assistant/grpc/pushtotalk.py", line 30, in <module>
from tenacity import retry, stop_after_attempt, retry_if_exception
File "/home/pi/env/lib/python3.9/site-packages/tenacity/__init__.py", line 292
from tenacity.async import AsyncRetrying
^
SyntaxError: invalid syntax```
The file:``` # Copyright (C) 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sample that implements gRPC client for Google Assistant API."""
#pip install grpc
import grpc
import json
import logging
import os.path
import click
#import grpc
import google.auth.transport.grpc
import google.auth.transport.requests
import google.oauth2.credentials
from google.assistant.embedded.v1alpha1 import embedded_assistant_pb2
from google.rpc import code_pb2
from tenacity import retry, stop_after_attempt, retry_if_exception
try:
from . import (
assistant_helpers,
audio_helpers
)
except SystemError:
import assistant_helpers
import audio_helpers
ASSISTANT_API_ENDPOINT = 'embeddedassistant.googleapis.com'
END_OF_UTTERANCE = embedded_assistant_pb2.ConverseResponse.END_OF_UTTERANCE
DIALOG_FOLLOW_ON = embedded_assistant_pb2.ConverseResult.DIALOG_FOLLOW_ON
CLOSE_MICROPHONE = embedded_assistant_pb2.ConverseResult.CLOSE_MICROPHONE
DEFAULT_GRPC_DEADLINE = 60 * 3 + 5
class SampleAssistant(object):
"""Sample Assistant that supports follow-on conversations.
Args:
conversation_stream(ConversationStream): audio stream
for recording query and playing back assistant answer.
channel: authorized gRPC channel for connection to the
Google Assistant API.
deadline_sec: gRPC deadline in seconds for Google Assistant API call.
"""
def __init__(self, conversation_stream, channel, deadline_sec):
self.conversation_stream = conversation_stream
# Opaque blob provided in ConverseResponse that,
# when provided in a follow-up ConverseRequest,
# gives the Assistant a context marker within the current state
# of the multi-Converse()-RPC "conversation".
# This value, along with MicrophoneMode, supports a more natural
# "conversation" with the Assistant.
self.conversation_state = None
# Create Google Assistant API gRPC client.
self.assistant = embedded_assistant_pb2.EmbeddedAssistantStub(channel)
self.deadline = deadline_sec
def __enter__(self):
return self
def __exit__(self, etype, e, traceback):
if e:
return False
self.conversation_stream.close()
def is_grpc_error_unavailable(e):
is_grpc_error = isinstance(e, grpc.RpcError)
if is_grpc_error and (e.code() == grpc.StatusCode.UNAVAILABLE):
logging.error('grpc unavailable error: %s', e)
return True
return False
@retry(reraise=True, stop=stop_after_attempt(3),
       retry=retry_if_exception(is_grpc_error_unavailable))
def converse(self):
"""Send a voice request to the Assistant and playback the response.
Returns: True if conversation should continue.
"""
continue_conversation = False
self.conversation_stream.start_recording()
logging.info('Recording audio request.')
def iter_converse_requests():
for c in self.gen_converse_requests():
assistant_helpers.log_converse_request_without_audio(c)
yield c
self.conversation_stream.start_playback()
# This generator yields ConverseResponse proto messages
# received from the gRPC Google Assistant API.
for resp in self.assistant.Converse(iter_converse_requests(),
self.deadline):
assistant_helpers.log_converse_response_without_audio(resp)
if resp.error.code != code_pb2.OK:
logging.error('server error: %s', resp.error.message)
break
if resp.event_type == END_OF_UTTERANCE:
logging.info('End of audio request detected')
self.conversation_stream.stop_recording()
if resp.result.spoken_request_text:
logging.info('Transcript of user request: "%s".',
resp.result.spoken_request_text)
logging.info('Playing assistant response.')
if len(resp.audio_out.audio_data) > 0:
self.conversation_stream.write(resp.audio_out.audio_data)
if resp.result.spoken_response_text:
logging.info(
'Transcript of TTS response '
'(only populated from IFTTT): "%s".',
resp.result.spoken_response_text)
if resp.result.conversation_state:
self.conversation_state = resp.result.conversation_state
if resp.result.volume_percentage != 0:
self.conversation_stream.volume_percentage = (
resp.result.volume_percentage
)
if resp.result.microphone_mode == DIALOG_FOLLOW_ON:
continue_conversation = True
logging.info('Expecting follow-on query from user.')
elif resp.result.microphone_mode == CLOSE_MICROPHONE:
continue_conversation = False
logging.info('Finished playing assistant response.')
self.conversation_stream.stop_playback()
return continue_conversation
def gen_converse_requests(self):
"""Yields: ConverseRequest messages to send to the API."""
converse_state = None
if self.conversation_state:
logging.debug('Sending converse_state: %s',
self.conversation_state)
converse_state = embedded_assistant_pb2.ConverseState(
conversation_state=self.conversation_state,
)
config = embedded_assistant_pb2.ConverseConfig(
audio_in_config=embedded_assistant_pb2.AudioInConfig(
encoding='LINEAR16',
sample_rate_hertz=self.conversation_stream.sample_rate,
),
audio_out_config=embedded_assistant_pb2.AudioOutConfig(
encoding='LINEAR16',
sample_rate_hertz=self.conversation_stream.sample_rate,
volume_percentage=self.conversation_stream.volume_percentage,
),
converse_state=converse_state
)
# The first ConverseRequest must contain the ConverseConfig
# and no audio data.
yield embedded_assistant_pb2.ConverseRequest(config=config)
for data in self.conversation_stream:
# Subsequent requests need audio data, but not config.
yield embedded_assistant_pb2.ConverseRequest(audio_in=data)
@click.command()
@click.option('--api-endpoint', default=ASSISTANT_API_ENDPOINT,
              metavar='<api endpoint>', show_default=True,
              help='Address of Google Assistant API service.')
@click.option('--credentials',
              metavar='<credentials>', show_default=True,
              default=os.path.join(click.get_app_dir('google-oauthlib-tool'),
                                   'credentials.json'),
              help='Path to read OAuth2 credentials.')
@click.option('--verbose', '-v', is_flag=True, default=False,
              help='Verbose logging.')
@click.option('--input-audio-file', '-i',
              metavar='<input file>',
              help='Path to input audio file. '
                   'If missing, uses audio capture')
@click.option('--output-audio-file', '-o',
              metavar='<output file>',
              help='Path to output audio file. '
                   'If missing, uses audio playback')
@click.option('--audio-sample-rate',
              default=audio_helpers.DEFAULT_AUDIO_SAMPLE_RATE,
              metavar='<audio sample rate>', show_default=True,
              help='Audio sample rate in hertz.')
@click.option('--audio-sample-width',
              default=audio_helpers.DEFAULT_AUDIO_SAMPLE_WIDTH,
              metavar='<audio sample width>', show_default=True,
              help='Audio sample width in bytes.')
@click.option('--audio-iter-size',
              default=audio_helpers.DEFAULT_AUDIO_ITER_SIZE,
              metavar='<audio iter size>', show_default=True,
              help='Size of each read during audio stream iteration in bytes.')
@click.option('--audio-block-size',
              default=audio_helpers.DEFAULT_AUDIO_DEVICE_BLOCK_SIZE,
              metavar='<audio block size>', show_default=True,
              help=('Block size in bytes for each audio device '
                    'read and write operation..'))
@click.option('--audio-flush-size',
              default=audio_helpers.DEFAULT_AUDIO_DEVICE_FLUSH_SIZE,
              metavar='<audio flush size>', show_default=True,
              help=('Size of silence data in bytes written '
                    'during flush operation'))
@click.option('--grpc-deadline', default=DEFAULT_GRPC_DEADLINE,
              metavar='<grpc deadline>', show_default=True,
              help='gRPC deadline in seconds')
@click.option('--once', default=False, is_flag=True,
              help='Force termination after a single conversation.')
def main(api_endpoint, credentials, verbose,
input_audio_file, output_audio_file,
audio_sample_rate, audio_sample_width,
audio_iter_size, audio_block_size, audio_flush_size,
grpc_deadline, once, *args, **kwargs):
"""Samples for the Google Assistant API.
Examples:
Run the sample with microphone input and speaker output:
$ python -m googlesamples.assistant
Run the sample with file input and speaker output:
$ python -m googlesamples.assistant -i <input file>
Run the sample with file input and output:
$ python -m googlesamples.assistant -i <input file> -o <output file>
"""
# Setup logging.
logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
# Load OAuth 2.0 credentials.
try:
with open(credentials, 'r') as f:
credentials = google.oauth2.credentials.Credentials(token=None,
**json.load(f))
http_request = google.auth.transport.requests.Request()
credentials.refresh(http_request)
except Exception as e:
logging.error('Error loading credentials: %s', e)
logging.error('Run google-oauthlib-tool to initialize '
'new OAuth 2.0 credentials.')
return
# Create an authorized gRPC channel.
grpc_channel = google.auth.transport.grpc.secure_authorized_channel(
credentials, http_request, api_endpoint)
logging.info('Connecting to %s', api_endpoint)
# Configure audio source and sink.
audio_device = None
if input_audio_file:
audio_source = audio_helpers.WaveSource(
open(input_audio_file, 'rb'),
sample_rate=audio_sample_rate,
sample_width=audio_sample_width
)
else:
audio_source = audio_device = (
audio_device or audio_helpers.SoundDeviceStream(
sample_rate=audio_sample_rate,
sample_width=audio_sample_width,
block_size=audio_block_size,
flush_size=audio_flush_size
)
)
if output_audio_file:
audio_sink = audio_helpers.WaveSink(
open(output_audio_file, 'wb'),
sample_rate=audio_sample_rate,
sample_width=audio_sample_width
)
else:
audio_sink = audio_device = (
audio_device or audio_helpers.SoundDeviceStream(
sample_rate=audio_sample_rate,
sample_width=audio_sample_width,
block_size=audio_block_size,
flush_size=audio_flush_size
)
)
# Create conversation stream with the given audio source and sink.
conversation_stream = audio_helpers.ConversationStream(
source=audio_source,
sink=audio_sink,
iter_size=audio_iter_size,
sample_width=audio_sample_width,
)
with SampleAssistant(conversation_stream,
grpc_channel, grpc_deadline) as assistant:
# If file arguments are supplied:
# exit after the first turn of the conversation.
if input_audio_file or output_audio_file:
assistant.converse()
return
# If no file arguments supplied:
# keep recording voice requests using the microphone
# and playing back assistant response using the speaker.
# When the once flag is set, don't wait for a trigger. Otherwise, wait.
wait_for_user_trigger = not once
while True:
if wait_for_user_trigger:
click.pause(info='Press Enter to send a new request...')
continue_conversation = assistant.converse()
# wait for user trigger if there is no follow-up turn in
# the conversation.
wait_for_user_trigger = not continue_conversation
# If we only want one conversation, break.
if once and (not continue_conversation):
break
if __name__ == '__main__':
    main()

Thanks for your kind support.

Issue in execution of event synthesis code for continuous integration for speech translation

from_language = 'fr-FR'
to_language = 'en-US'

def synthesis_callback(evt):
    size = len(evt.result.audio)
    print(f'Audio synthesized: {size} byte(s) {"(COMPLETED)" if size == 0 else ""}')
    if size > 0:
        file = open('translation.wav', 'wb+')
        file.write(evt.result.audio)
        file.close()

def translate_speech_to_text():
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region)
    translation_config.speech_recognition_language = from_language
    translation_config.add_target_language(to_language)
    translation_config.voice_name = "en-US-JennyNeural"

    audio_input = speechsdk.AudioConfig(filename=filename)
    recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config, audio_config=audio_input)

    done = False

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        recognizer.stop_continuous_recognition()
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the speech recognizer
    recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

translate_speech_to_text()
Issue: I'm trying to execute Python code for event synthesis speech translation from the Azure Cognitive Speech Services documentation. I have added code from the GitHub samples for callbacks to the events for continuous recognition. Although translation starts, it gets cancelled after the first utterance. Can someone help resolve this issue?
Log after commenting out the line below:
recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
SESSION STARTED: SessionEventArgs(session_id=6e378e05c98249c3be3aa702fd686270)
RECOGNIZING: TranslationRecognitionEventArgs(session_id=6e378e05c98249c3be3aa702fd686270, result=TranslationRecognitionResult(result_id=e50145c39bfc463cb6284f96332a822b, translations={'en': 'Paul'}, reason=ResultReason.TranslatingSpeech))
RECOGNIZING: TranslationRecognitionEventArgs(session_id=6e378e05c98249c3be3aa702fd686270, result=TranslationRecognitionResult(result_id=ba62d3981efa418e909d6923de01a5bb, translations={'en': 'Paul Verlaine'}, reason=ResultReason.TranslatingSpeech))
RECOGNIZING: TranslationRecognitionEventArgs(session_id=6e378e05c98249c3be3aa702fd686270, result=TranslationRecognitionResult(result_id=82b2b61a950a42a096a76fd8d50ab2b3, translations={'en': 'Paul Verlaine, autumn song'}, reason=ResultReason.TranslatingSpeech))
RECOGNIZED: TranslationRecognitionEventArgs(session_id=6e378e05c98249c3be3aa702fd686270, result=TranslationRecognitionResult(result_id=65db6b16dbe443e9a6144c112d634b30, translations={'en': 'Paul Verlaine, autumn song.'}, reason=ResultReason.TranslatedSpeech))
CLOSING on TranslationRecognitionCanceledEventArgs(session_id=6e378e05c98249c3be3aa702fd686270, result=TranslationRecognitionResult(result_id=29fb7547f33a4a4d968ebeae997c84a8, translations={}, reason=ResultReason.Canceled))
First, you can add a line in the canceled event callback to print the cancellation details:
recognizer.canceled.connect(lambda evt: print('CANCELED {}, error_detail={}'.format(evt, evt.result.cancellation_details.error_details)))
In your case, the error detail is "Synthesis service failed with code: - Could not identify the voice 'en-US-JennyNeural' for the text to speech service." For speech-to-speech translation, neural voices are not supported for now. You can refer to the Manual synthesis section of this doc to synthesize the translated text instead.
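To illustrate the manual-synthesis route that doc describes: drop translation_config.voice_name, and when a recognized event delivers the translated text, feed it to a regular SpeechSynthesizer, where neural voices are allowed. A rough sketch reusing the variables from the question (the voice and output file are just examples, and error handling is omitted):
def synthesize_translation(evt):
    # Take the English translation from the recognized event and synthesize it
    # separately; neural voices such as en-US-JennyNeural work here.
    text = evt.result.translations.get('en', '')
    if not text:
        return
    synth_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    synth_config.speech_synthesis_voice_name = "en-US-JennyNeural"
    audio_out = speechsdk.audio.AudioOutputConfig(filename="translation.wav")
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=synth_config, audio_config=audio_out)
    synthesizer.speak_text_async(text).get()

recognizer.recognized.connect(synthesize_translation)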

How to set single_utterance for Google Cloud Speech-to-Text API using Speech Recognition with Python

I'm writing a simple calculator using the Google API and need to store the recognized text from streaming audio input to process it further. The API has a single_utterance config option that seems extremely helpful, yet I can't find a way to set it.
In the Cloud Speech-to-Text documentation there's a line that says:
set the single_utterance field to true in the StreamingRecognitionConfig object.
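For reference, that field belongs to Google's own google-cloud-speech client rather than to the SpeechRecognition wrapper shown below, so one option is to use that client directly. A minimal sketch of how the streaming config would be built there (assuming google-cloud-speech is installed and application credentials are configured; the audio request generator is omitted):
from google.cloud import speech

recognition_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
)
streaming_config = speech.StreamingRecognitionConfig(
    config=recognition_config,
    single_utterance=True,   # stop listening after the first detected utterance
    interim_results=False,
)
# client = speech.SpeechClient()
# responses = client.streaming_recognize(streaming_config, requests)  # `requests`: generator of StreamingRecognizeRequest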
Example code from the SpeechRecognition GitHub:
#!/usr/bin/env python3

# NOTE: this example requires PyAudio because it uses the Microphone class

from threading import Thread
try:
    from queue import Queue  # Python 3 import
except ImportError:
    from Queue import Queue  # Python 2 import

import speech_recognition as sr

r = sr.Recognizer()
audio_queue = Queue()

def recognize_worker():
    # this runs in a background thread
    while True:
        audio = audio_queue.get()  # retrieve the next audio processing job from the main thread
        if audio is None: break  # stop processing if the main thread is done

        # received audio data, now we'll recognize it using Google Speech Recognition
        try:
            # for testing purposes, we're just using the default API key
            # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
            # instead of `r.recognize_google(audio)`
            print("Google Speech Recognition thinks you said " + r.recognize_google(audio))
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))

        audio_queue.task_done()  # mark the audio processing job as completed in the queue

# start a new thread to recognize audio, while this thread focuses on listening
recognize_thread = Thread(target=recognize_worker)
recognize_thread.daemon = True
recognize_thread.start()
with sr.Microphone() as source:
    try:
        while True:  # repeatedly listen for phrases and put the resulting audio on the audio processing job queue
            audio_queue.put(r.listen(source))
    except KeyboardInterrupt:  # allow Ctrl + C to shut down the program
        pass

audio_queue.join()  # block until all current audio processing jobs are done
audio_queue.put(None)  # tell the recognize_thread to stop
recognize_thread.join()  # wait for the recognize_thread to actually stop
The Recognizer class from the same GitHub:
def recognize_google(self, audio_data, key=None, language="en-US", pfilter=0, show_all=False):
"""
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.
The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.
To obtain your own API key, simply following the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.
The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
assert key is None or isinstance(key, str), "``key`` must be ``None`` or a string"
assert isinstance(language, str), "``language`` must be a string"
flac_data = audio_data.get_flac_data(
convert_rate=None if audio_data.sample_rate >= 8000 else 8000, # audio samples must be at least 8 kHz
convert_width=2 # audio samples must be 16-bit
)
if key is None: key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
url = "http://www.google.com/speech-api/v2/recognize?{}".format(urlencode({
"client": "chromium",
"lang": language,
"key": key,
"pFilter": pfilter
}))
request = Request(url, data=flac_data, headers={"Content-Type": "audio/x-flac; rate={}".format(audio_data.sample_rate)})
# obtain audio transcription results
try:
response = urlopen(request, timeout=self.operation_timeout)
except HTTPError as e:
raise RequestError("recognition request failed: {}".format(e.reason))
except URLError as e:
raise RequestError("recognition connection failed: {}".format(e.reason))
response_text = response.read().decode("utf-8")
# ignore any blank blocks
actual_result = []
for line in response_text.split("\n"):
if not line: continue
result = json.loads(line)["result"]
if len(result) != 0:
actual_result = result[0]
break
# return results
if show_all: return actual_result
if not isinstance(actual_result, dict) or len(actual_result.get("alternative", [])) == 0: raise UnknownValueError()
if "confidence" in actual_result["alternative"]:
# return alternative with highest confidence score
best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"])
else:
# when there is no confidence available, we arbitrarily choose the first hypothesis.
best_hypothesis = actual_result["alternative"][0]
if "transcript" not in best_hypothesis: raise UnknownValueError()
return best_hypothesis["transcript"]
The recognize_google() function in the Recognizer class seems to have no argument that could set the single_utterance field to true.
Apart from the GitHub repository, the Google docs don't really cover the SpeechRecognition library.
I've tried to change the key in order to connect using my own API key and take a look from the console's perspective, but I've run into problems there as well.

[python] I want to merge two pieces of code, but a web.py error occurs: type 'exceptions.KeyError'

I want to merge two pieces of code.
A.py is web.py code.
B.py is the Google Cloud Speech (STT) example code.
But when I merge them, web.py raises this error:
type 'exceptions.KeyError'
I inserted the A.py code into B.py at the first line of main().
How do I merge this code?
This is A.py:
import web

urls = ("/.*", "hello")
app = web.application(urls, globals())

class hello:
    def GET(self):
        return 'Hello, world!'

if __name__ == "__main__":
    app.run()
This is B.py (the Google Cloud Speech (STT) example code):
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sample that streams audio to the Google Cloud Speech API via GRPC."""
from __future__ import division
import contextlib
import functools
import re
import signal
import sys
import web
import google.auth
import google.auth.transport.grpc
import google.auth.transport.requests
from google.cloud.proto.speech.v1beta1 import cloud_speech_pb2
from google.rpc import code_pb2
import grpc
import pyaudio
from six.moves import queue
# Audio recording parameters
RATE = 48000
CHUNK = int(RATE / 10) # 100ms
# The Speech API has a streaming limit of 60 seconds of audio*, so keep the
# connection alive for that long, plus some more to give the API time to figure
# out the transcription.
# * https://g.co/cloud/speech/limits#content
DEADLINE_SECS = 60 * 3 + 5
SPEECH_SCOPE = 'https://www.googleapis.com/auth/cloud-platform'
def make_channel(host, port):
"""Creates a secure channel with auth credentials from the environment."""
# Grab application default credentials from the environment
credentials, _ = google.auth.default(scopes=[SPEECH_SCOPE])
# Create a secure channel using the credentials.
http_request = google.auth.transport.requests.Request()
target = '{}:{}'.format(host, port)
return google.auth.transport.grpc.secure_authorized_channel(
credentials, http_request, target)
def _audio_data_generator(buff):
"""A generator that yields all available data in the given buffer.
Args:
buff - a Queue object, where each element is a chunk of data.
Yields:
A chunk of data that is the aggregate of all chunks of data in `buff`.
The function will block until at least one data chunk is available.
"""
stop = False
while not stop:
# Use a blocking get() to ensure there's at least one chunk of data.
data = [buff.get()]
# Now consume whatever other data's still buffered.
while True:
try:
data.append(buff.get(block=False))
except queue.Empty:
break
# `None` in the buffer signals that the audio stream is closed. Yield
# the final bit of the buffer and exit the loop.
if None in data:
stop = True
data.remove(None)
yield b''.join(data)
def _fill_buffer(buff, in_data, frame_count, time_info, status_flags):
"""Continuously collect data from the audio stream, into the buffer."""
buff.put(in_data)
return None, pyaudio.paContinue
# [START audio_stream]
@contextlib.contextmanager
def record_audio(rate, chunk):
"""Opens a recording stream in a context manager."""
# Create a thread-safe buffer of audio data
buff = queue.Queue()
audio_interface = pyaudio.PyAudio()
audio_stream = audio_interface.open(
format=pyaudio.paInt16,
# The API currently only supports 1-channel (mono) audio
channels=1, rate=rate,
input=True, frames_per_buffer=chunk,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't overflow
# while the calling thread makes network requests, etc.
stream_callback=functools.partial(_fill_buffer, buff),
)
yield _audio_data_generator(buff)
audio_stream.stop_stream()
audio_stream.close()
# Signal the _audio_data_generator to finish
buff.put(None)
audio_interface.terminate()
# [END audio_stream]
def request_stream(data_stream, rate, interim_results=True):
"""Yields `StreamingRecognizeRequest`s constructed from a recording audio
stream.
Args:
data_stream: A generator that yields raw audio data to send.
rate: The sampling rate in hertz.
interim_results: Whether to return intermediate results, before the
transcription is finalized.
"""
# The initial request must contain metadata about the stream, so the
# server knows how to interpret it.
recognition_config = cloud_speech_pb2.RecognitionConfig(
# There are a bunch of config options you can specify. See
encoding='LINEAR16', # raw 16-bit signed LE samples
sample_rate=rate, # the rate in hertz
# See http://g.co/cloud/speech/docs/languages
# for a list of supported languages.
language_code='ko-KR', # a BCP-47 language tag
)
streaming_config = cloud_speech_pb2.StreamingRecognitionConfig(
interim_results=interim_results,
config=recognition_config,
)
yield cloud_speech_pb2.StreamingRecognizeRequest(
streaming_config=streaming_config)
for data in data_stream:
# Subsequent requests can all just have the content
yield cloud_speech_pb2.StreamingRecognizeRequest(audio_content=data)
def listen_print_loop(recognize_stream):
"""Iterates through server responses and prints them.
The recognize_stream passed is a generator that will block until a response
is provided by the server. When the transcription response comes, print it.
In this case, responses are provided for interim results as well. If the
response is an interim one, print a line feed at the end of it, to allow
the next result to overwrite it, until the response is a final one. For the
final one, print a newline to preserve the finalized transcription.
"""
num_chars_printed = 0
for resp in recognize_stream:
if resp.error.code != code_pb2.OK:
raise RuntimeError('Server error: ' + resp.error.message)
if not resp.results:
continue
# Display the top transcription
result = resp.results[0]
transcript = result.alternatives[0].transcript
# Display interim results, but with a carriage return at the end of the
# line, so subsequent lines will overwrite them.
#
# If the previous result was longer than this one, we need to print
# some extra spaces to overwrite the previous result
overwrite_chars = ' ' * max(0, num_chars_printed - len(transcript))
if not result.is_final:
sys.stdout.write(transcript + overwrite_chars + '\r')
sys.stdout.flush()
num_chars_printed = len(transcript)
else:
print(transcript + overwrite_chars)
# Exit recognition if any of the transcribed phrases could be
# one of our keywords.
if re.search(r'\b(exit|quit)\b', transcript, re.I):
print('Exiting..')
break
num_chars_printed = 0
def main():
urls = ("/.*", "hello")
app = web.application(urls, globals())
class hello:
def GET(self):
return 'Hello, world!'
app.run()
service = cloud_speech_pb2.SpeechStub(
make_channel('speech.googleapis.com', 443))
# For streaming audio from the microphone, there are three threads.
# First, a thread that collects audio data as it comes in
with record_audio(RATE, CHUNK) as buffered_audio_data:
# Second, a thread that sends requests with that data
requests = request_stream(buffered_audio_data, RATE)
# Third, a thread that listens for transcription responses
recognize_stream = service.StreamingRecognize(
requests, DEADLINE_SECS)
# Exit things cleanly on interrupt
signal.signal(signal.SIGINT, lambda *_: recognize_stream.cancel())
# Now, put the transcription responses to use.
try:
listen_print_loop(recognize_stream)
recognize_stream.cancel()
except grpc.RpcError as e:
code = e.code()
# CANCELLED is caused by the interrupt handler, which is expected.
if code is not code.CANCELLED:
raise
if __name__ == '__main__':
main()
The error refers to the fact that web.py will be looking for a class hello accessible from the global scope. You've defined your class hello within main(), so web.py will never find it.
That being said, there are other issues. Your call to app.run() within main() starts the web.py web server and never returns, so nothing after it will ever get executed.
Combining two code examples requires understanding both snippets. Read the docs, and keep trying. One possible structure is sketched below.
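For what it's worth, one way to combine them given those constraints is to keep hello at module level and run the speech streaming in a background thread started before app.run(). A rough structural sketch only, assuming B.py's helpers (make_channel, record_audio, request_stream, listen_print_loop and the constants) live in the same module:
import threading
import web

urls = ("/.*", "hello")

class hello:
    def GET(self):
        return 'Hello, world!'

def run_speech():
    # Body of B.py's original main(): open the microphone stream,
    # send requests and print transcriptions.
    # (The signal-based cancellation from the original sample is omitted here.)
    service = cloud_speech_pb2.SpeechStub(
        make_channel('speech.googleapis.com', 443))
    with record_audio(RATE, CHUNK) as buffered_audio_data:
        requests = request_stream(buffered_audio_data, RATE)
        recognize_stream = service.StreamingRecognize(requests, DEADLINE_SECS)
        listen_print_loop(recognize_stream)

if __name__ == '__main__':
    threading.Thread(target=run_speech, daemon=True).start()  # speech runs alongside the web server
    app = web.application(urls, globals())
    app.run()  # blocks; must come last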
