Custom audio input bytes to Azure cognitive speech translation service in Python

I need to be able to take custom audio bytes, which I can get from any source, and translate the voice into the language I need (currently Hindi). I have been trying to pass custom audio bytes using the following code in Python:
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.audio import AudioStreamFormat, PullAudioInputStream, PullAudioInputStreamCallback, AudioConfig, PushAudioInputStream

speech_key, service_region = "key", "region"

channels = 1
bitsPerSample = 16
samplesPerSecond = 16000
audioFormat = AudioStreamFormat(samplesPerSecond, bitsPerSample, channels)

class CustomPullAudioInputStreamCallback(PullAudioInputStreamCallback):
    def __init__(self):
        return super(CustomPullAudioInputStreamCallback, self).__init__()

    def read(self, file_bytes):
        print(len(file_bytes))
        return len(file_bytes)

    def close(self):
        return super(CustomPullAudioInputStreamCallback, self).close()

class CustomPushAudioInputStream(PushAudioInputStream):
    def write(self, file_bytes):
        print(type(file_bytes))
        return super(CustomPushAudioInputStream, self).write(file_bytes)

    def close(self):
        return super(CustomPushAudioInputStream, self).close()

translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=speech_key, region=service_region)
fromLanguage = 'en-US'
toLanguage = 'hi'
translation_config.speech_recognition_language = fromLanguage
translation_config.add_target_language(toLanguage)
translation_config.voice_name = "hi-IN-Kalpana-Apollo"

pull_audio_input_stream_callback = CustomPullAudioInputStreamCallback()
# pull_audio_input_stream = PullAudioInputStream(pull_audio_input_stream_callback, audioFormat)
# custom_pull_audio_input_stream = CustomPushAudioInputStream(audioFormat)

audio_config = AudioConfig(use_default_microphone=False, stream=pull_audio_input_stream_callback)
recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config,
                                                         audio_config=audio_config)

def synthesis_callback(evt):
    size = len(evt.result.audio)
    print('AUDIO SYNTHESIZED: {} byte(s) {}'.format(size, '(COMPLETED)' if size == 0 else ''))
    if size > 0:
        t_sound_file = open("translated_output.wav", "wb+")
        t_sound_file.write(evt.result.audio)
        t_sound_file.close()
    recognizer.stop_continuous_recognition_async()

def recognized_complete(evt):
    if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
        print("RECOGNIZED '{}': {}".format(fromLanguage, evt.result.text))
        print("TRANSLATED into {}: {}".format(toLanguage, evt.result.translations['hi']))
    elif evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("RECOGNIZED: {} (text could not be translated)".format(evt.result.text))
    elif evt.result.reason == speechsdk.ResultReason.NoMatch:
        print("NOMATCH: Speech could not be recognized: {}".format(evt.result.no_match_details))
    elif evt.result.reason == speechsdk.ResultReason.Canceled:
        print("CANCELED: Reason={}".format(evt.result.cancellation_details.reason))
        if evt.result.cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("CANCELED: ErrorDetails={}".format(evt.result.cancellation_details.error_details))

def receiving_bytes(audio_bytes):
    # audio_bytes contains the bytes of audio to be translated
    recognizer.synthesizing.connect(synthesis_callback)
    recognizer.recognized.connect(recognized_complete)
    pull_audio_input_stream_callback.read(audio_bytes)
    recognizer.start_continuous_recognition_async()

receiving_bytes(audio_bytes)
Output:
Error: AttributeError: 'PullAudioInputStreamCallback' object has no attribute '_impl'
Packages and their versions:
Python 3.6.3
azure-cognitiveservices-speech 1.11.0
File translation can be performed successfully, but I do not want to save a file for each chunk of bytes I receive.
Is there a way to pass custom audio bytes to the Azure Speech Translation Service and get the result in Python? If yes, then how?

I found the solution to the problem myself. I think it works with PullAudioInputStream too, but it worked for me using PushAudioInputStream. You don't need to create custom classes; it works like the following:
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.audio import AudioStreamFormat, AudioConfig, PushAudioInputStream
from threading import Event

speech_key, service_region = "key", "region"

channels = 1
bitsPerSample = 16
samplesPerSecond = 16000
audioFormat = AudioStreamFormat(samplesPerSecond, bitsPerSample, channels)

translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=speech_key, region=service_region)
fromLanguage = 'en-US'
toLanguage = 'hi'
translation_config.speech_recognition_language = fromLanguage
translation_config.add_target_language(toLanguage)
translation_config.voice_name = "hi-IN-Kalpana-Apollo"

# Custom classes are not needed.
custom_push_stream = speechsdk.audio.PushAudioInputStream(stream_format=audioFormat)
audio_config = AudioConfig(stream=custom_push_stream)
recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config, audio_config=audio_config)

# Create an event
synthesis_done = Event()

def synthesis_callback(evt):
    size = len(evt.result.audio)
    print('AUDIO SYNTHESIZED: {} byte(s) {}'.format(size, '(COMPLETED)' if size == 0 else ''))
    if size > 0:
        t_sound_file = open("translated_output.wav", "wb+")
        t_sound_file.write(evt.result.audio)
        t_sound_file.close()
        # Set the event
        synthesis_done.set()

def recognized_complete(evt):
    if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
        print("RECOGNIZED '{}': {}".format(fromLanguage, evt.result.text))
        print("TRANSLATED into {}: {}".format(toLanguage, evt.result.translations['hi']))
    elif evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("RECOGNIZED: {} (text could not be translated)".format(evt.result.text))
    elif evt.result.reason == speechsdk.ResultReason.NoMatch:
        print("NOMATCH: Speech could not be recognized: {}".format(evt.result.no_match_details))
    elif evt.result.reason == speechsdk.ResultReason.Canceled:
        print("CANCELED: Reason={}".format(evt.result.cancellation_details.reason))
        if evt.result.cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("CANCELED: ErrorDetails={}".format(evt.result.cancellation_details.error_details))

recognizer.synthesizing.connect(synthesis_callback)
recognizer.recognized.connect(recognized_complete)

# Read and get data from an audio file
open_audio_file = open("speech_wav_audio.wav", 'rb')
file_bytes = open_audio_file.read()

# Write the bytes to the stream
custom_push_stream.write(file_bytes)
custom_push_stream.close()

# Start the recognition
recognizer.start_continuous_recognition()

# Wait for the synthesis event to be set
synthesis_done.wait()

# Once the event is set you can stop recognition
recognizer.stop_continuous_recognition()
I have used an Event from threading since start_continuous_recognition runs on a different thread, and you won't get data from the callback events without some synchronization. synthesis_done.wait() solves this by blocking until the event is set, and only then calling stop_continuous_recognition. Once you obtain the audio bytes, you can do whatever you wish with them in synthesis_callback. I have simplified the example and took the bytes from a wav file.
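If your bytes arrive in chunks from a live source rather than from a file, you can keep writing to the same push stream while recognition runs. A minimal sketch (the chunk source here is hypothetical, not part of the code above):

def feed_chunks(chunk_source, push_stream):
    """Write incoming audio chunks to the push stream as they arrive.

    `chunk_source` is a hypothetical iterable of raw PCM byte chunks
    (16 kHz, 16-bit, mono, to match the AudioStreamFormat above)."""
    for chunk in chunk_source:
        push_stream.write(chunk)
    # Closing the stream signals end-of-audio to the recognizer.
    push_stream.close()

Start continuous recognition first, then feed the chunks (for example from another thread), and the recognizer will consume them as they come in.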

The example code provided uses a callback as the stream parameter to AudioConfig, which doesn’t seem to be allowed.
This code should work without throwing an error:
pull_audio_input_stream_callback = CustomPullAudioInputStreamCallback()
pull_audio_input_stream = PullAudioInputStream(pull_stream_callback=pull_audio_input_stream_callback, stream_format=audioFormat)
audio_config = AudioConfig(use_default_microphone=False, stream=pull_audio_input_stream)
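Note also that a pull callback's read() is expected to fill the buffer the SDK hands it and return the number of bytes written, not just report a length; the original read() above never copies any audio. A minimal sketch of a callback serving an in-memory bytes object (the buffering scheme is just an illustration, following the same pattern as the file-reader callback in a later answer on this page):

class BytesPullCallback(PullAudioInputStreamCallback):
    """Serves a bytes object to the recognizer chunk by chunk."""

    def __init__(self, audio_bytes: bytes):
        super().__init__()
        self._data = audio_bytes
        self._pos = 0

    def read(self, buffer: memoryview) -> int:
        # Copy up to buffer.nbytes of the remaining audio into the SDK buffer.
        chunk = self._data[self._pos:self._pos + buffer.nbytes]
        buffer[:len(chunk)] = chunk
        self._pos += len(chunk)
        return len(chunk)  # returning 0 signals end of stream

    def close(self) -> None:
        pass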

Related

Get data as a dataframe from a Look through the Looker SDK package in Python

I have the following code to get a Look from Looker through the API. I stored the API credentials in a .ini file. All of that works, but now I want to get the data from this Look as a DataFrame in Python, so that I can use the data for further analysis. How can I do that? I used this code, but it only saves the Look to a png. I can't find a way to create a DataFrame from this, as I want the data itself and not just the rendered image.
import sys
import textwrap
import time

import looker_sdk
from looker_sdk import models

sdk = looker_sdk.init40("/Name.ini")

def get_look(title: str) -> models.Look:
    title = title.lower()
    look = next(iter(sdk.search_looks(title=title)), None)
    if not look:
        raise Exception(f"look '{title}' was not found")
    return look

def download_look(look: models.Look, result_format: str, width: int, height: int):
    """Download specified look as png/jpg"""
    id = int(look.id)
    task = sdk.create_look_render_task(id, result_format, width, height)
    if not (task and task.id):
        raise sdk.RenderTaskError(
            f"Could not create a render task for '{look.title}'"
        )
    # poll the render task until it completes
    elapsed = 0.0
    delay = 0.5  # wait .5 seconds
    while True:
        poll = sdk.render_task(task.id)
        if poll.status == "failure":
            print(poll)
            raise Exception(f"Render failed for '{look.title}'")
        elif poll.status == "success":
            break
        time.sleep(delay)
        elapsed += delay
    print(f"Render task completed in {elapsed} seconds")
    result = sdk.render_task_results(task.id)
    filename = f"{look.title}.{result_format}"
    with open(filename, "wb") as f:
        f.write(result)
    print(f"Look saved to '{filename}'")

look_title = sys.argv[1] if len(sys.argv) > 1 else "Name"
image_width = int(sys.argv[2]) if len(sys.argv) > 2 else 545
image_height = int(sys.argv[3]) if len(sys.argv) > 3 else 842
image_format = sys.argv[4] if len(sys.argv) > 4 else "png"
if not look_title:
    raise Exception(
        textwrap.dedent(
            """
            Please provide: <lookTitle> [<img_width>] [<img_height>] [<img_format>]
            img_width defaults to 545
            img_height defaults to 842
            img_format defaults to 'png'"""
        )
    )
look = get_look(look_title)
# Dataframe storage
download_look(look, image_format, image_width, image_height)
The SDK function you are using (create_look_render_task), which is described here, only allows you to download as pdf, png, or jpg.
If you want to get the data from a Look into a dataframe, you may want to look into using the run_look function instead, described here. When you use run_look, you can change the result_format to CSV and then write your own code to convert the result to a dataframe, as in the sketch below.
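A minimal sketch of that conversion (untested against a real instance; it assumes the Look found by get_look above and that pandas is installed):

import io
import pandas as pd

# run_look returns the Look's data serialized in the requested format
csv_data = sdk.run_look(look_id=look.id, result_format="csv")
df = pd.read_csv(io.StringIO(csv_data))
print(df.head())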

Python for/while loop

Today I am working on a project where incoming phone calls are transcribed and saved into text files, but I am also fairly new to Python and Python loops.
I want to loop over a SQL Server column and run each row (all of the phone call OIDs) through the Azure Speech to Text service I use. I have been stuck on this problem for a couple of days now, so I thought I might find some help here.
import azure.cognitiveservices.speech as speechsdk
import time
from os import path
from pydub import AudioSegment
import requests
import hashlib
import sys
import os.path
import pyodbc

databaseName = '*'
username = '*'
password = '*'
server = '*'
driver = '*'

try:
    CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+password
    conn = pyodbc.connect(CONNECTION_STRING)
    cursor = conn.cursor()
    storedproc = "* = *'"
    cursor.execute(storedproc)
    row = cursor.fetchone()
    while row:
        array = [(int(row[1]))]
        row = cursor.fetchone()

    i = 0
    while i < len(array):
        OID = (array[i])
        i = i + 1
        print(OID)

    string = f"{OID}*"
    encoded = string.encode()
    result = hashlib.sha256(encoded)
    resultHash = (result.hexdigest())

    Telefoongesprek = requests.get(f"*{OID}", headers={"api-key": f"{resultHash}"})
    with open("Telefoongesprek.mp3", "wb") as f:
        f.write(Telefoongesprek.content)

    src = "Telefoongesprek.mp3"
    dst = "Telefoongesprek.wav"
    sound = AudioSegment.from_file(src)
    sound.export(dst, format="wav")

    def speech_recognize_continuous_from_file():
        speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
        speech_config.speech_recognition_language = "nl-NL"
        audio_config = speechsdk.audio.AudioConfig(filename="Telefoongesprek.wav")
        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

        done = False

        def stop_cb(evt):
            print('CLOSING on {}'.format(evt))
            nonlocal done
            done = True

        all_results = []

        def handle_final_result(evt):
            all_results.append(evt.result.text)

        speech_recognizer.recognized.connect(handle_final_result)
        speech_recognizer.session_started.connect(handle_final_result)
        speech_recognizer.session_stopped.connect(handle_final_result)
        speech_recognizer.canceled.connect(handle_final_result)
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        speech_recognizer.start_continuous_recognition()
        while not done:
            time.sleep(.5)
        speech_recognizer.stop_continuous_recognition()

        print(all_results)
        telefoongesprek = str(all_results)
        filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
        file = open(filename, "w")
        file.write(telefoongesprek)
        file.close()

    speech_recognize_continuous_from_file()
    cursor.close()
    del cursor
    conn.close()
except Exception as e:
    print("Error: %s" % e)
Everything works separately, but I just don't know where to place the loop and which one I should use (a for or while loop). Right here I'm trying to loop over an array, but I don't think this is correct.
Error message: Decoding failed. ffmpeg returned error code: 1
[mp3 # 000001cb8c57e0o0] Failed to read frame size: could not seek to 1073.
which I am pretty sure means that my Azure function can't find an mp3 file, meaning the mp3-to-wav conversion doesn't work.
Thanks in advance!
If I understand your question, you have a database with lots of phone call details. One of the field values in each row is used to create the associated mp3 file. You want to do speech to text with Azure on each of the mp3 files referenced by your database.
So you can do it in two ways:
Iterate through all rows in the database and save all the associated files into a folder on the local disk, with the OID as the filename.
Then write another loop to iterate through this folder and send the files for transcription to the Azure Speech to Text service (see the sketch below).
The other technique is to do everything in a single loop, the way you have shown, which will require some corrections.
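A rough sketch of the first, two-pass approach (the folder name and the make_hash helper are placeholders, the redacted URL is kept from your code, and it reuses the transcription function shown further below):

import os

download_dir = "downloaded_mp3s"
os.makedirs(download_dir, exist_ok=True)

# First pass: iterate the cursor and save each call as <OID>.mp3
row = cursor.fetchone()
while row:
    OID = int(row[1])
    response = requests.get(f"*{OID}", headers={"api-key": make_hash(OID)})
    with open(os.path.join(download_dir, f"{OID}.mp3"), "wb") as f:
        f.write(response.content)
    row = cursor.fetchone()

# Second pass: transcribe every saved file
for name in os.listdir(download_dir):
    if name.endswith(".mp3"):
        OID = os.path.splitext(name)[0]
        speech_recognize_continuous_from_file(os.path.join(download_dir, name), OID)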
OK, so now that part is clear, we can go into the speech to text part. Azure allows you to send the compressed format for transcription, which means you actually don't need to convert it into a wav file.
Please have a look at the modified code below with the changes:
# code snippet borrowed from azure samples
# (OID is passed in so the output filename can be built inside the function)
def speech_recognize_continuous_from_file(filename, OID):
    class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
        def __init__(self, filename: str):
            super().__init__()
            self._file_h = open(filename, "rb")

        def read(self, buffer: memoryview) -> int:
            try:
                size = buffer.nbytes
                frames = self._file_h.read(size)
                buffer[:len(frames)] = frames
                return len(frames)
            except Exception as ex:
                print('Exception in `read`: {}'.format(ex))
                raise

        def close(self) -> None:
            print('closing file')
            try:
                self._file_h.close()
            except Exception as ex:
                print('Exception in `close`: {}'.format(ex))
                raise

    # Creates an audio stream format. For this example we are using an MP3 compressed file
    compressed_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=speechsdk.AudioStreamContainerFormat.MP3)
    callback = BinaryFileReaderCallback(filename=filename)
    stream = speechsdk.audio.PullAudioInputStream(stream_format=compressed_format, pull_stream_callback=callback)

    speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
    speech_config.speech_recognition_language = "nl-NL"
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # Creates a speech recognizer using a stream as audio input, also specify the speech language
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config, audio_config)

    done = False

    def stop_cb(evt):
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    all_results = []

    def handle_final_result(evt):
        all_results.append(evt.result.text)

    speech_recognizer.recognized.connect(handle_final_result)
    speech_recognizer.session_started.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(handle_final_result)
    speech_recognizer.canceled.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    speech_recognizer.stop_continuous_recognition()

    print(all_results)
    telefoongesprek = str(all_results)
    filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
    file = open(filename, "w")
    file.write(telefoongesprek)
    file.close()

try:
    CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+password
    conn = pyodbc.connect(CONNECTION_STRING)
    cursor = conn.cursor()
    storedproc = "* = *'"
    cursor.execute(storedproc)
    row = cursor.fetchone()
    # loop through the rows
    while row:
        array = [(int(row[1]))]
        i = 0
        while i < len(array):
            OID = (array[i])
            i = i + 1
            print(OID)
            string = f"{OID}*"
            encoded = string.encode()
            result = hashlib.sha256(encoded)
            resultHash = (result.hexdigest())
            telefoongesprek_response = requests.get(f"*{OID}", headers={"api-key": f"{resultHash}"})
            # save the file to local disk as mp3
            with open("Telefoongesprek.mp3", "wb") as f:
                f.write(telefoongesprek_response.content)
            # do the speech to text on the mp3 file
            speech_recognize_continuous_from_file(f.name, OID)
        # fetch the next row
        row = cursor.fetchone()
    cursor.close()
    del cursor
    conn.close()
except Exception as e:
    print("Error: %s" % e)
I haven't tested this full code as I don't have the db connections with me. Please feel free to modify it for your use case, and let me know if you have any issues.

Streaming speech recognition with Google Speech-to-Text is leading to improperly timestamped transcripts

My Problem:
The web app I'm building relies on real-time transcription of a user's voice along with timestamps for when each word begins and ends.
Google's Speech-to-Text API has a limit of 4 minutes for streaming requests, but I want users to be able to run their mics for as long as 30 minutes if they so choose.
Thankfully, Google provides its own code examples for how to make successive requests to their Speech-to-Text API in a way that mimics endless streaming speech recognition.
I've adapted their Python infinite streaming example for my purposes (see below for my code). The timestamps provided by Google are pretty accurate but the issue is that when I exceed the streaming limit (4 minutes) and a new request is made, the timestamped transcript returned by Google's API from the new request is off by as much as 5 seconds or more.
Below is an example of the output when I adjust the streaming limit to 10 seconds (so a new request to Google's Speech-to-Text API begins every 10 seconds).
The timestamp you see printed next to each transcribed response (the 'corrected_time' in the code) is the timestamp for the end of the transcribed line, not the beginning. These timestamps are accurate for the first request but are off by ~4 seconds in the second request and ~9 seconds in the third request.
In a nutshell, I want to make sure that when the streaming limit is exceeded and a new request is made, the timestamps returned by Google for that new request are adjusted accurately.
My Code:
To help you understand what's going on, I would recommend running it on your machine (only takes a couple of minutes to get working if you have a Google Cloud service account).
I've included more detail on my current diagnosis below the code.
#!/usr/bin/env python

"""Google Cloud Speech API sample application using the streaming API.

NOTE: This module requires the dependency `pyaudio`.
To install using pip:

    pip install pyaudio

Example usage:
    python THIS_FILENAME.py
"""

# [START speech_transcribe_infinite_streaming]

import os
import re
import sys
import time

from google.cloud import speech
import pyaudio
from six.moves import queue

# Audio recording parameters
STREAMING_LIMIT = 20000  # 20 seconds (originally 4 mins but shortened for testing purposes)
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE / 10)  # 100ms

# Environment Variable set for Google Credentials. Put the json service account
# key in the root directory
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'YOUR_SERVICE_ACCOUNT_KEY.json'


def get_current_time():
    """Return Current Time in MS."""
    return int(round(time.time() * 1000))


class ResumableMicrophoneStream:
    """Opens a recording stream as a generator yielding the audio chunks."""

    def __init__(self, rate, chunk_size):
        self._rate = rate
        self.chunk_size = chunk_size
        self._num_channels = 1
        self._buff = queue.Queue()
        self.closed = True
        self.start_time = get_current_time()
        self.restart_counter = 0
        self.audio_input = []
        self.last_audio_input = []
        self.result_end_time = 0
        self.is_final_end_time = 0
        self.final_request_end_time = 0
        self.bridging_offset = 0
        self.last_transcript_was_final = False
        self.new_stream = True
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=self._num_channels,
            rate=self._rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
        )

    def __enter__(self):
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, *args, **kwargs):
        """Continuously collect data from the audio stream, into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        """Stream Audio from microphone to API and to local buffer"""
        while not self.closed:
            data = []

            """
            THE BELOW 'IF' STATEMENT IS WHERE THE ERROR IS LIKELY OCCURRING.
            This statement runs when the streaming limit is hit and a new request is made.
            """
            if self.new_stream and self.last_audio_input:
                chunk_time = STREAMING_LIMIT / len(self.last_audio_input)

                if chunk_time != 0:
                    if self.bridging_offset < 0:
                        self.bridging_offset = 0

                    if self.bridging_offset > self.final_request_end_time:
                        self.bridging_offset = self.final_request_end_time

                    chunks_from_ms = round(
                        (self.final_request_end_time - self.bridging_offset)
                        / chunk_time
                    )

                    self.bridging_offset = round(
                        (len(self.last_audio_input) - chunks_from_ms) * chunk_time
                    )

                    for i in range(chunks_from_ms, len(self.last_audio_input)):
                        data.append(self.last_audio_input[i])

                self.new_stream = False

            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            self.audio_input.append(chunk)

            if chunk is None:
                return
            data.append(chunk)

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                    self.audio_input.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)


def listen_print_loop(responses, stream):
    """Iterates through server responses and prints them.

    The responses passed is a generator that will block until a response
    is provided by the server.

    Each response may contain multiple results, and each result may contain
    multiple alternatives; here we print only the transcription for the top
    alternative of the top result.

    In this case, responses are provided for interim results as well. If the
    response is an interim one, print a line feed at the end of it, to allow
    the next result to overwrite it, until the response is a final one. For the
    final one, print a newline to preserve the finalized transcription.
    """
    for response in responses:
        if get_current_time() - stream.start_time > STREAMING_LIMIT:
            stream.start_time = get_current_time()
            break

        if not response.results:
            continue

        result = response.results[0]

        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript

        result_seconds = 0
        result_micros = 0

        if result.result_end_time.seconds:
            result_seconds = result.result_end_time.seconds

        if result.result_end_time.microseconds:
            result_micros = result.result_end_time.microseconds

        stream.result_end_time = int((result_seconds * 1000) + (result_micros / 1000))

        corrected_time = (
            stream.result_end_time
            - stream.bridging_offset
            + (STREAMING_LIMIT * stream.restart_counter)
        )

        # Display interim results, but with a carriage return at the end of the
        # line, so subsequent lines will overwrite them.
        if result.is_final:
            sys.stdout.write("FINAL RESULT # ")
            sys.stdout.write(str(corrected_time / 1000) + ": " + transcript + "\n")
            stream.is_final_end_time = stream.result_end_time
            stream.last_transcript_was_final = True

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                sys.stdout.write("Exiting...\n")
                stream.closed = True
                break
        else:
            sys.stdout.write("INTERIM RESULT # ")
            sys.stdout.write(str(corrected_time / 1000) + ": " + transcript + "\r")
            stream.last_transcript_was_final = False


def main():
    """start bidirectional streaming from microphone input to speech API"""
    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code="en-US",
        max_alternatives=1,
    )
    streaming_config = speech.StreamingRecognitionConfig(
        config=config, interim_results=True
    )

    mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
    print(mic_manager.chunk_size)
    sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n')
    sys.stdout.write("End (ms)  Transcript Results/Status\n")
    sys.stdout.write("=====================================================\n")

    with mic_manager as stream:
        while not stream.closed:
            sys.stdout.write(
                "\n" + str(STREAMING_LIMIT * stream.restart_counter) + ": NEW REQUEST\n"
            )

            stream.audio_input = []
            audio_generator = stream.generator()

            requests = (
                speech.StreamingRecognizeRequest(audio_content=content)
                for content in audio_generator
            )

            responses = client.streaming_recognize(streaming_config, requests)

            # Now, put the transcription responses to use.
            listen_print_loop(responses, stream)

            if stream.result_end_time > 0:
                stream.final_request_end_time = stream.is_final_end_time
            stream.result_end_time = 0
            stream.last_audio_input = []
            stream.last_audio_input = stream.audio_input
            stream.audio_input = []
            stream.restart_counter = stream.restart_counter + 1

            if not stream.last_transcript_was_final:
                sys.stdout.write("\n")
            stream.new_stream = True


if __name__ == "__main__":
    main()

# [END speech_transcribe_infinite_streaming]
My Current Diagnosis
The 'corrected_time' is not being set correctly when new requests are made. This is because the 'bridging_offset' is not being set correctly. So what we need to look at is the 'generator()' method in the 'ResumableMicrophoneStream' class.
In the 'generator()' method, there is an 'if' statement which is run when the streaming limit is hit and a new request is made
if self.new_stream and self.last_audio_input:
Its purpose appears to be to take any lingering audio data that wasn't finished being transcribed before the streaming limit was hit and add it to the buffer ahead of any new audio chunks, so that it gets transcribed in the new request.
It is also the responsibility of this 'if' statement to set the 'bridging_offset', but I'm not entirely sure what this offset represents. All I know is that however it is being set, it is not being set accurately.
Time offset values show the beginning and the end of each spoken word
that is recognized in the supplied audio. A time offset value
represents the amount of time that has elapsed from the beginning of
the audio, in increments of 100ms.
This tells us that the time offsets you receive are always measured from the beginning of the audio supplied in the current request, from start to finish; after each restart they reset to zero rather than continuing from the start of the session. That would be my guess as to why it's causing your application problems.
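To make that concrete, the sample's correction can be read as an estimate of this relationship (a restatement of the code above for illustration, not new logic):

# All values are in milliseconds.
# session_time_of_result =
#       time_streamed_in_all_previous_requests
#     + result_end_time_within_the_current_request
#     - duration_of_old_audio_replayed_at_the_start_of_this_request
#
# The sample approximates the first term as STREAMING_LIMIT * restart_counter
# and the last term as bridging_offset. If either estimate is off (a request
# that ran short of STREAMING_LIMIT, or a bridging_offset rounded from chunk
# counts), every timestamp in that request inherits the error, which is why
# the drift grows with each restart.
corrected_time = (
    stream.result_end_time
    - stream.bridging_offset
    + (STREAMING_LIMIT * stream.restart_counter)
)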

How to add the transfer syntax UID to the file meta of a dataset

I use the pydicom library to generate .dcm files from the datasets coming from CT and MRI machines; however, in those datasets the tag (0002,0010) is missing. As I don't have that tag, I am not able to detect whether the transfer syntax is Implicit VR Little Endian, Explicit VR Little Endian, JPEG Lossless, etc. I need the transfer syntax for saving the dataset with flags like the ones below:
ds.is_little_endian = True
ds.is_implicit_VR = False
ds.file_meta.TransferSyntaxUID = JPEGLossless
ds.is_explicit_VR = True etc
If I don't use the above flags, then the generated dcm file will not be valid, as there is no transfer syntax.
Since I don't know the transfer syntax, I pass it as a command line argument when I run the program, set the above flags accordingly, and save the dataset. I know this is the wrong method, but I used it as a temporary solution. Is there a better way to detect the transfer syntax, given that the tag (0002,0010) is missing?
Below is the code I use for saving the dcm file with the dataset coming from the CT machine. For now I am sending the transfer syntax as a command line argument.
from pynetdicom3 import AE, VerificationPresentationContexts, StoragePresentationContexts, QueryRetrievePresentationContexts
from pydicom.uid import ImplicitVRLittleEndian, ExplicitVRLittleEndian, JPEGLossless
from pynetdicom3.sop_class import CTImageStorage, MRImageStorage
from pynetdicom3 import pynetdicom_uid_prefix
from pydicom.dataset import Dataset, FileDataset
import argparse
import uuid
import os
import django
import logging

logger = logging.getLogger(__name__)

ttt = []
ttt.extend(VerificationPresentationContexts)
ttt.extend(StoragePresentationContexts)
ttt.extend(QueryRetrievePresentationContexts)

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'lh_dcm_viewer.settings')
django.setup()

from dcm_app.models import DCMFile, DCMFileException


def _setup_argparser():
    parser = argparse.ArgumentParser(
        description="The getscp application implements a Service Class "
                    "Provider (SCP) for the Query/Retrieve (QR) Service Class "
                    "and the Basic Worklist Management (BWM) Service Class. "
                    "getscp only supports query functionality using the C-GET "
                    "message. It receives query keys from an SCU and sends a "
                    "response. The application can be used to test SCUs of the "
                    "QR and BWM Service Classes.",
        usage="getscp [options] port")

    # Parameters
    # Transfer Syntaxes
    ts_opts = parser.add_argument_group('Preferred Transfer Syntaxes')
    ts_opts.add_argument("--type",
                         help="prefer explicit VR local byte order (default)")
    ts_opts.add_argument("--detect_transfer_syntax",
                         help="Detect transfer syntax")
    ts_opts.add_argument("--port",
                         help="port at which the SCP listens")
    return parser.parse_args()


args = _setup_argparser()

ae = AE(ae_title=b'MY_ECHO_SCP', port=int(args.port))
if args.type == "jpeg_lossless":
    ae.add_supported_context('1.2.840.10008.1.2.4.57')
elif args.type == "implicit":
    ae.add_supported_context('1.2.840.10008.1.2')
    print("ImplicitVRLittleEndian")
elif args.type == "explicit":
    ae.add_supported_context('1.2.840.10008.1.2.1')

ae.requested_contexts = ttt
ae.supported_contexts = ttt

DICOM_IP = '192.168.1.11'
DICOM_IP = '127.0.0.1'
DICOM_PORT = 5678


def save_file(dataset, context, info):
    try:
        random_str = uuid.uuid4().hex
        meta = Dataset()
        meta.MediaStorageSOPClassUID = dataset.SOPClassUID
        meta.MediaStorageSOPInstanceUID = dataset.SOPInstanceUID
        meta.ImplementationClassUID = pynetdicom_uid_prefix
        meta.FileMetaInformationGroupLength = 202

        received_file_path = "../received_dcms/%s.dcm" % random_str

        dataset_vr = None
        try:
            dataset_vr = dataset[("6000", "3000")].VR
        except KeyError:
            pass
        print(dataset_vr)

        if args.type == "implicit" or dataset_vr == "OB or OW":
            ds = FileDataset(received_file_path, {}, file_meta=meta, preamble=b"\0" * 128)
            ds.update(dataset)
            ds.is_little_endian = True
            ds.is_implicit_VR = True
            if dataset_vr == "OB or OW":
                print("forced ImplicitVRLittleEndian")
            ds.file_meta.TransferSyntaxUID = ImplicitVRLittleEndian
        elif args.type == "jpeg_lossless":
            ds = FileDataset(received_file_path, {}, file_meta=meta, preamble=b"\0" * 128)
            ds.update(dataset)
            ds.is_little_endian = True
            ds.is_implicit_VR = False
            ds.file_meta.TransferSyntaxUID = JPEGLossless
            ds.is_explicit_VR = True
        elif args.type == "explicit":
            meta.TransferSyntaxUID = "1.2.840.10008.1.2.1"
            ds = FileDataset(received_file_path, {}, file_meta=meta, preamble=b"\0" * 128)
            ds.update(dataset)
            ds.is_little_endian = True
            ds.is_implicit_VR = False
            ds.file_meta.TransferSyntaxUID = ExplicitVRLittleEndian
            ds.is_explicit_VR = True

        ds.save_as(received_file_path)
        f = open(received_file_path, "rb")
        f.close()
        return 0xC200
    except Exception as e:
        logger.error(e)


ae.on_c_store = save_file
ae.start()
I hate to send you away, but this is a very specialized tool and it has its own support community. I looked up your issue and found it might be a bug. There is a user having a similar problem, and he said the transfer syntax is not missing if he uses an older version of the library.
Similar issue:
https://groups.google.com/forum/#!topic/pydicom/Oxd7cbCwseU
Support for this library is found here:
https://groups.google.com/forum/#!forum/pydicom
The code itself is maintained on GitHub, where you can open and view bug reports: https://github.com/pydicom/pydicom/issues

Python - _ctypes.COMError -2147287038 (comtypes module)

EDIT: I came here by chance and realized I didn't say who the code is from (???). Sorry about that... I had known programming for only 2 or 3 months when I published this question and didn't even think of it. The library is called TTS and is from DeepHorizons (thank you so much for the library!): https://github.com/DeepHorizons/tts - and it seems it has been updated since I asked this.
I've been trying to record all the words from the English dictionary with a Python module I found on GitHub (modified by me). I'm trying to record every word with IVONA 2 Brian so that, in case the internet fails, I can have the same voice on my Raspberry Pi instead of the espeak or pico TTS voices. But my problem is that I can record only about 81,000 words the first time, then 7,000, and then 2,000, and I need 479,000. The file with the list of words is found on GitHub too (each word is on a new line). My script's code is:
#!/usr/bin/env python3
"""
This is a helper to use the Microsoft SAPI
Needs to run on a windows system
Requires comtypes
"""
import os
import comtypes.client  # Importing comtypes.client will make the gen subpackage

try:
    from comtypes.gen import SpeechLib  # comtypes
except ImportError:
    # Generate the SpeechLib lib and any associated files
    engine = comtypes.client.CreateObject("SAPI.SpVoice")
    stream = comtypes.client.CreateObject("SAPI.SpFileStream")
    from comtypes.gen import SpeechLib


class Sapi(object):
    """A speech API using Microsofts SAPI through COM"""

    def __init__(self):
        super().__init__()
        self.voice = comtypes.client.CreateObject('Sapi.SpVoice')

    def get_voices(self, name=''):
        """Get a list of voices, search by name optional"""
        voice_list = []
        voices = self.voice.GetVoices()
        if name != '':
            for voice in voices:
                if name in voice.GetDescription():
                    voice_list.append(voice)
                    break
            else:
                print('Voice not found')
        else:
            for voice in voices:
                voice_list.append(voice)
        return voice_list

    def get_voice_names(self):
        """Get the names of all the voices"""
        return [voice.GetDescription() for voice in self.get_voices()]

    def set_voice(self, voice):
        """Set the voice to the given voice"""
        if type(voice) is str:
            self.voice.Voice = self.get_voices(voice)[0]
        else:
            self.voice.Voice = voice
        return

    def get_audio_outputs(self, name=''):
        """Get the audio outputs, search for the one with the name if given"""
        output_list = []
        outputs = self.voice.GetAudioOutputs()
        if name != '':
            for output in outputs:
                if name in output.GetDescription():
                    output_list.append(output)
                    break
            else:
                print('Audio output not found')
        else:
            for output in outputs:
                output_list.append(output)
        return output_list

    def get_audio_output_names(self):
        """Get the names of all the audio outputs"""
        return [output.GetDescription() for output in self.get_audio_outputs()]

    def set_audio_output(self, output):
        if type(output) is str:
            self.voice.AudioOutput = self.get_audio_outputs(output)[0]
        else:
            self.voice.AudioOutput = output
        return

    def say(self, message):
        self.voice.Speak(message)
        return

    def set_rate(self, rate):
        """Set the speed of the speaker
        -10 is slowest, 10 is fastest"""
        self.voice.Rate = rate

    def _create_stream(self, filename):
        """Create a file stream handler"""
        stream = comtypes.client.CreateObject('Sapi.SpFileStream')
        stream.Open(filename, SpeechLib.SSFMCreateForWrite)
        return stream

    def create_recording(self, filename, message):
        """Make a recording of the given message to the file
        The file should be a .wav as the output is
        PCM 22050 Hz 16 bit, little endianness, signed"""
        stream = self._create_stream(filename)
        temp_stream = self.voice.AudioOutputStream
        self.voice.AudioOutputStream = stream
        self.say(message)
        self.voice.AudioOutputStream = temp_stream


if __name__ == '__main__':
    v = Sapi()
    # From here on is my code; above is the code from the module I found.
    with open("words.txt", "r") as words:
        lines = words.read().splitlines()
    num_total = len(lines)
    os.mkdir("list")
    num = 0
    for e in lines:
        word = ""
        for i in e:
            if i == "/" or i == ":" or i == "*" or i == "?" or i == "<" or i == ">" or i == "|":
                word += " "
            else:
                word += i
        v.set_voice("Brian")
        v.set_rate(+1)
        v.create_recording("list/" + word + ".wav", e)
I'm using Sublime Text 3, and what shows up in the console is the _ctypes.COMError -2147287038 traceback from the title (screenshot omitted).
I have no idea why this is stopping. It stops, I delete the already-recorded words from the file and run the script again, and it runs, but the number of words it records keeps going down.
Can someone please explain to me why this is happening?
EDIT: The same happens with Microsoft Anna, if that helps anyone to understand this...
Thanks in advance.
After some time, I thought I should divide the words.txt file into smaller ones (10,000 words each) with a Python script and print each word before it was recorded to see what the problem was. It stopped for some seconds on 'aux' and then on 'aux.' (with no errors, and I don't know why), and it didn't record those. It gave the error with 'con' and 'con.'. Knowing that, I tried to record those 4 words with the eSpeak TTS app. It recorded them, but I couldn't save them anywhere, because those file names are reserved by Windows for its own use. So, now the script looks like this:
(The shebang, the imports, and the Sapi class in this listing are identical to the first listing above, so they are omitted here; only the code under the __main__ check changed.)
if __name__ == '__main__':
    v = Sapi()
    # From here on is my code; above is the code from the module I found.
    num = 9
    print("Started!")
    print()
    for i in range(0, 47):
        print(num)
        print()
        words = open("list_words/words" + str(num) + ".txt", "r")
        linhas = words.read().splitlines()
        num_total = len(linhas)
        os.mkdir("list_words/list" + str(num))
        for e in linhas:
            word = ""
            for i in e:
                if i == "/" or i == ":" or i == "*" or i == "?" or i == "<" or i == ">" or i == "|":
                    word += " "
                else:
                    word += i
            v.set_voice("Brian")
            v.set_rate(+1)
            # I added the try/except, so every time it can't save the recorded
            # file under its original name, it saves it with 'ERROR - ' in front
            # and appends the word to a backup file
            try:
                v.create_recording("list_words/list" + str(num) + "/" + word + ".wav", e)
            except comtypes.COMError:
                v.create_recording("list_words/list" + str(num) + "/ERROR - " + word + ".wav", e)
                erros = open("errors_recorder.txt", "a")
                erros.write("\n" + word)
                erros.close()
        num += 1
    print("Finished!")
It can now record all the words without interrupting the process. I still have the words.txt file divided, because I didn't finish the recording and I want to make sure that if any error is raised again, I don't have to search through everything trying to find it (only through 10,000 words, printing every word).
Maybe this can help anyone else having trouble with this (because I didn't know Windows doesn't allow saving files under certain reserved names).
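For anyone wanting to avoid the problem up front, here is a small sketch that checks words against Windows' documented reserved device names before using them as file names:

# Windows reserves these device names (with or without an extension),
# so e.g. "aux.wav" or "con.wav" cannot be created as regular files.
RESERVED_NAMES = {"CON", "PRN", "AUX", "NUL",
                  *(f"COM{n}" for n in range(1, 10)),
                  *(f"LPT{n}" for n in range(1, 10))}

def safe_filename(word):
    """Prefix reserved device names so Windows accepts the file."""
    if word.upper() in RESERVED_NAMES:
        return "RESERVED - " + word
    return word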
Thanks anyways.
