Google Media Translation API does not show result - python

I am new to Google APIs and web services. I have only tried the Google Translate API once before, and that one works fine. Now I want to use the Google Media Translation API to translate voice input. I followed the tutorial from https://cloud.google.com/translate/media/docs/streaming.
However, I cannot make it work. There is no error at run time, so I don't know where to look. Could you please help me identify the problem?
# [START media_translation_translate_from_mic]
from __future__ import division

import itertools

from google.cloud import mediatranslation as media
import pyaudio
from six.moves import queue
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/Me/GoogleMT/TranslationAPI/MediaKey.json"

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms
SpeechEventType = media.StreamingTranslateSpeechResponse.SpeechEventType


class MicrophoneStream:
    """Opens a recording stream as a generator yielding the audio chunks."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type=None, value=None, traceback=None):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """Continuously collect data from the audio stream, into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def exit(self):
        self.__exit__()

    def generator(self):
        while not self.closed:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b''.join(data)


def listen_print_loop(responses):
    """Iterates through server responses and prints them.

    The responses passed is a generator that will block until a response
    is provided by the server.
    """
    translation = ''
    source = ''
    for response in responses:
        # Once the transcription settles, the response contains the
        # END_OF_SINGLE_UTTERANCE event.
        if (response.speech_event_type ==
                SpeechEventType.END_OF_SINGLE_UTTERANCE):
            print(u'\nFinal translation: {0}'.format(translation))
            print(u'Final recognition result: {0}'.format(source))
            return 0

        result = response.result
        translation = result.text_translation_result.translation
        source = result.recognition_result

        print(u'\nPartial translation: {0}'.format(translation))
        print(u'Partial recognition result: {0}'.format(source))


def do_translation_loop():
    print('Begin speaking...')

    client = media.SpeechTranslationServiceClient()

    speech_config = media.TranslateSpeechConfig(
        audio_encoding='linear16',
        source_language_code='en-US',
        target_language_code='ja')

    config = media.StreamingTranslateSpeechConfig(
        audio_config=speech_config, single_utterance=True)

    # The first request contains the configuration.
    # Note that audio_content is explicitly set to None.
    first_request = media.StreamingTranslateSpeechRequest(
        streaming_config=config, audio_content=None)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        mic_requests = (media.StreamingTranslateSpeechRequest(
            audio_content=content,
            streaming_config=config)
            for content in audio_generator)

        requests = itertools.chain(iter([first_request]), mic_requests)

        responses = client.streaming_translate_speech(requests)

        # Print the translation responses as they arrive
        result = listen_print_loop(responses)

        if result == 0:
            stream.exit()


def main():
    while True:
        print()
        option = input('Press any key to translate or \'q\' to quit: ')

        if option.lower() == 'q':
            break

        do_translation_loop()


if __name__ == '__main__':
    main()
# [END media_translation_translate_from_mic]
The result is like this: no translation and no recognition result.
[Result screenshot]
I was not sure if the problem was with my mic, so I also tried similar example code from another Google tutorial that translates an audio file instead of the mic stream. The result is the same: no recognition result and no translation.
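The file-based variant I tried was essentially the following (a sketch modeled on the docs sample, reusing the same config as above; the file path is made up, and the audio has to be 16 kHz LINEAR16 for this config):

def translate_from_file(path='audio/my_recording.wav'):
    client = media.SpeechTranslationServiceClient()
    speech_config = media.TranslateSpeechConfig(
        audio_encoding='linear16',
        source_language_code='en-US',
        target_language_code='ja')
    config = media.StreamingTranslateSpeechConfig(
        audio_config=speech_config, single_utterance=True)

    def requests():
        # the first request carries only the configuration
        yield media.StreamingTranslateSpeechRequest(streaming_config=config)
        # the rest carry the audio, chunk by chunk
        with open(path, 'rb') as audio_file:
            while True:
                chunk = audio_file.read(4096)
                if not chunk:
                    return
                yield media.StreamingTranslateSpeechRequest(audio_content=chunk)

    for response in client.streaming_translate_speech(requests()):
        result = response.result
        print(result.text_translation_result.translation)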
Did I miss something?
Thank you very much.

Related

How to Stream Mp3 music from web using Python PyQt5?

I want to stream an mp3 file from the web using Python PyQt5. I have researched a lot and only found code for streaming wav files.
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtMultimedia import *
import urllib.request
import threading
import time

class Streamer:
    def __init__(self, url):
        self.url = url
        self.fancy = urllib.request.URLopener()
        self.web = self.fancy.open(self.url)
        self.content_len = int(self.web.headers["Content-Length"])
        self.data = self.web.read(1024*1024)
        self.buffer = QBuffer()
        self.buffer.writeData(self.data[250:])
        self.buffer.open(QBuffer.ReadWrite)
        threading.Thread(target=self.stream).start()
        self.format = QAudioFormat()
        self.format.setSampleRate(48000)
        self.format.setChannelCount(2)
        self.format.setSampleSize(16)
        self.format.setCodec("audio/pcm")
        self.format.setByteOrder(QAudioFormat.LittleEndian)
        self.format.setSampleType(QAudioFormat.SignedInt)
        self.audio = QAudioOutput(self.format)
        self.audio.start(self.buffer)

    def stream(self):
        # while True:
        #     self.sound_data = self.web.read(1024*1024)
        #     if not self.sound_data:
        #         break
        #     self.buffer.buffer().append(self.sound_data)
        #     time.sleep(2)
        while len(self.data) < self.content_len:
            self.sound_data = self.web.read(1024*1024)
            self.buffer.buffer().append(self.sound_data)
            self.data += self.sound_data
            time.sleep(2)
        self.buffer.buffer().clear()
        del self.data

if __name__ == "__main__":
    app = QApplication([])
    streamer = Streamer("https://raw.githubusercontent.com/PremKumarMishra/Stream-Songs/main/Audio.wav")
    app.exec_()
I checked but can't add an MPEG-3 (mp3) codec in QAudioFormat, so this current code does not work for mp3.
The basic behavior of QMediaPlayer should be enough to manage buffering of simple audio streams, as the backend considers a 100% buffer fill enough to guarantee playback.
In case you want more control over the buffer state, you need to implement a custom QIODevice subclass to act as a middle layer between QMediaPlayer and the download process.
In the following example I'm using QNetworkAccessManager to download the stream; the readyRead signal of the QNetworkReply is then connected to a function that reads the raw bytes and emits a buffer status based on the currently available data and the minimum size set for the buffer.
The first time the received data reaches the minimum size, it begins to emit the readyRead signal, and if the player has not been started yet (no media set), it sets the media using the Buffer instance and is then ready to play.
from PyQt5 import QtCore, QtWidgets, QtNetwork, QtMultimedia

url = 'https://url.to/stream'

Errors = {}
for k, v in QtMultimedia.QMediaPlayer.__dict__.items():
    if isinstance(v, QtMultimedia.QMediaPlayer.Error):
        Errors[v] = k

class Buffer(QtCore.QIODevice):
    buffering = QtCore.pyqtSignal(object, object)
    fullBufferEmitted = False

    def __init__(self, reply, minBufferSize=250000):
        super().__init__()
        self.minBufferSize = max(200000, minBufferSize)
        self.reply = reply
        self.data = bytes()
        # the network reply is on another thread, use a mutex to ensure that
        # no simultaneous access is done in the meantime
        self.mutex = QtCore.QMutex()
        # this is important!
        self.setOpenMode(self.ReadOnly|self.Unbuffered)
        self.reply.readyRead.connect(self.dataReceived)

    def dataReceived(self):
        self.mutex.lock()
        self.data += self.reply.readAll().data()
        dataLen = len(self.data)
        self.mutex.unlock()
        self.buffering.emit(dataLen, self.minBufferSize)
        if not self.fullBufferEmitted:
            if dataLen < self.minBufferSize:
                return
            self.fullBufferEmitted = True
            self.readyRead.emit()

    def isSequential(self):
        return True

    def readData(self, size):
        self.mutex.lock()
        data = self.data[:size]
        self.data = self.data[size:]
        self.mutex.unlock()
        return data

    def bytesAvailable(self):
        return len(self.data) + super().bytesAvailable()

class Player(QtWidgets.QWidget):
    def __init__(self):
        super().__init__()
        layout = QtWidgets.QVBoxLayout(self)

        self.playButton = QtWidgets.QPushButton('Play', enabled=False)
        layout.addWidget(self.playButton)

        self.volumeSlider = QtWidgets.QSlider(QtCore.Qt.Horizontal)
        layout.addWidget(self.volumeSlider)

        self.statusLabel = QtWidgets.QLabel('Waiting')
        self.statusLabel.setFrameShape(
            self.statusLabel.StyledPanel|self.statusLabel.Sunken)
        layout.addWidget(self.statusLabel)

        self.player = QtMultimedia.QMediaPlayer(volume=16)
        self.volumeSlider.setValue(self.player.volume())

        self.networkManager = QtNetwork.QNetworkAccessManager()
        self.url = QtCore.QUrl(url)
        self.media = QtMultimedia.QMediaContent(self.url)
        reply = self.networkManager.get(QtNetwork.QNetworkRequest(self.url))
        self.buffer = Buffer(reply)

        self.playButton.clicked.connect(self.play)
        self.volumeSlider.valueChanged.connect(self.player.setVolume)
        self.player.error.connect(self.error)
        self.buffer.buffering.connect(self.buffering)

    def error(self, error):
        errorStr = 'Error: {} ({})'.format(
            Errors.get(error, 'Unknown error'), int(error))
        self.statusLabel.setText(errorStr)
        print(errorStr)

    def buffering(self, loaded, minBufferSize):
        self.statusLabel.setText('Buffer: {}%'.format(int(loaded / minBufferSize * 100)))
        if self.player.media().isNull() and loaded >= minBufferSize:
            self.player.setMedia(self.media, self.buffer)
            self.playButton.setEnabled(True)
            self.playButton.setFocus()
            self.statusLabel.setText('Ready to play')

    def play(self):
        if self.player.state() == self.player.PlayingState:
            self.player.pause()
            self.playButton.setText('Play')
        else:
            self.player.play()
            self.playButton.setText('Pause')

app = QtWidgets.QApplication([])
w = Player()
w.show()
app.exec_()
Note that:
- as soon as QMediaPlayer begins to read the stream, the buffer length will obviously become smaller, as there's no way to know or control how the backend accesses the stream: when the player is reading (which doesn't mean it's playing), it will read the stream anyway;
- due to the reason above, the shown buffer size is only "guessed" once the media is set, based on the data read and the data received from the network reply;
- you might want to control the media player status in case the buffer goes too low (but you must consider what is explained above) and eventually pause it, as in the sketch below.
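As a rough sketch of that last point (the thresholds are arbitrary and the caveats above still apply), the buffering slot could also pause and resume playback based on the bytes still available in the Buffer:

LOW_WATER = 50000    # pause when fewer bytes than this are buffered
HIGH_WATER = 200000  # resume once the buffer has refilled to this

# an extended version of Player.buffering() from the code above
def buffering(self, loaded, minBufferSize):
    self.statusLabel.setText('Buffer: {}%'.format(int(loaded / minBufferSize * 100)))
    if self.player.media().isNull() and loaded >= minBufferSize:
        self.player.setMedia(self.media, self.buffer)
        self.playButton.setEnabled(True)
        return
    available = self.buffer.bytesAvailable()
    if self.player.state() == self.player.PlayingState and available < LOW_WATER:
        # starving: pause until the download catches up
        self.player.pause()
    elif self.player.state() == self.player.PausedState and available >= HIGH_WATER:
        self.player.play()

Keep in mind this would also resume a playback the user paused manually, so a real implementation should track why the player was paused.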

Output of IBM Speech-To-Text

import pyaudio
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
from threading import Thread
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

try:
    from Queue import Queue, Full
except ImportError:
    from queue import Queue, Full

###############################################
#### Initialize queue to store the recordings #
###############################################
CHUNK = 1024
# Note: It will discard if the websocket client can't consume fast enough
# So, increase the max size as per your choice
BUF_MAX_SIZE = CHUNK * 10
# Buffer to store audio
q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))

# Create an instance of AudioSource
audio_source = AudioSource(q, True, True)

###############################################
#### Prepare Speech to Text Service ###########
###############################################

# initialize speech to text service
authenticator = IAMAuthenticator('apikey')
speech_to_text = SpeechToTextV1(authenticator=authenticator)
#speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/62a2f19f-959f-4c3c-a276-27ab0e458341/v1/recognize')
speech_to_text.set_service_url('https://stream.watsonplatform.net/speech-to-text/api')

# define callback for the speech to text service
class MyRecognizeCallback(RecognizeCallback):
    def __init__(self):
        RecognizeCallback.__init__(self)

    def on_transcription(self, transcript):
        print(transcript)

    def on_connected(self):
        print('Connection was successful')

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))

    def on_listening(self):
        print('Service is listening')

    def on_hypothesis(self, hypothesis):
        print(hypothesis)

    def on_data(self, data):
        print(data)

    def on_close(self):
        print("Connection closed")

# this function will initiate the recognize service and pass in the AudioSource
def recognize_using_websocket(*args):
    mycallback = MyRecognizeCallback()
    speech_to_text.recognize_using_websocket(audio=audio_source,
                                             content_type='audio/l16; rate=44100',
                                             recognize_callback=mycallback,
                                             interim_results=True)

###############################################
#### Prepare for recording using PyAudio ######
###############################################

# Variables for recording the speech
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100

# define callback for pyaudio to store the recording in queue
def pyaudio_callback(in_data, frame_count, time_info, status):
    try:
        q.put(in_data)
    except Full:
        pass  # discard
    return (None, pyaudio.paContinue)

# instantiate pyaudio
audio = pyaudio.PyAudio()

# open stream using callback
stream = audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
    stream_callback=pyaudio_callback,
    start=False
)

#########################################################################
#### Start the recording and start service to recognize the stream ######
#########################################################################
print("Enter CTRL+C to end recording...")
stream.start_stream()

try:
    recognize_thread = Thread(target=recognize_using_websocket, args=())
    recognize_thread.start()
    while True:
        pass
except KeyboardInterrupt:
    # stop recording
    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_source.completed_recording()
This is the code for IBM's Speech-To-Text service using a mic as input. May I know what the output of the program should be? This is the output I'm getting:
Enter CTRL+C to end recording...
Connection was successful
Service is listening
File "C:\Users\---\AppData\Local\Programs\Python\Python38-32\lib\site-packages\websocket\_app.py", line 320, in _callback
callback(self, *args)
File "C:\Users\---\AppData\Local\Programs\Python\Python38-32\lib\site-packages\ibm_watson\websocket\recognize_listener.py", line 199, in on_data
hypothesis = json_object['results'][0]['alternatives'][0][
Connection closed
It suddenly worked when I tested with my wireless headset mic. Not sure why, though, as both devices are functioning well. The output is the transcript in the console.
This is happening to me too, and I think the cause of your problem is that the audio you sent to the websocket was probably difficult to recognize, so the websocket's response was none/null, and the error occurs when the hypothesis handler tries to read a result that does not exist.
The output of the hypothesis callback (on_hypothesis) will be a string with the transcript of the audio, and the data callback (on_data) will receive JSON like this:
{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': 'hello ', 'confidence': 0.66}], 'keywords_result': {}}]}
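Since the service can return an empty result, your own callbacks should guard before indexing into that JSON (a sketch; note the traceback above comes from inside the SDK's listener, so this only protects your own handler, not the SDK's):

def on_data(self, data):
    results = data.get('results') or []
    if results and results[0].get('alternatives'):
        transcript = results[0]['alternatives'][0].get('transcript', '')
        print(transcript)
    else:
        # no usable hypothesis in this response
        print('no recognizable speech in this chunk')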

Detecting Activity with mouse, keyboard and voice on windows

We have a module in Python (through win32) to detect user mouse and keyboard activity via GetLastInputInfo and GetTickCount. How can we register voice activity in GetLastInputInfo?
Or maybe we can add a synthesized input to update GetLastInputInfo every time the mic detects voice input? But can we do that without interrupting the user?
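For the synthesized-input idea, something along these lines might work (an untested sketch; it assumes that an injected zero-delta mouse move via SendInput counts as input for GetLastInputInfo, which is how keep-awake scripts usually do it):

import ctypes
from ctypes import wintypes

INPUT_MOUSE = 0
MOUSEEVENTF_MOVE = 0x0001

PUL = ctypes.POINTER(ctypes.c_ulong)

class MOUSEINPUT(ctypes.Structure):
    _fields_ = [("dx", wintypes.LONG),
                ("dy", wintypes.LONG),
                ("mouseData", wintypes.DWORD),
                ("dwFlags", wintypes.DWORD),
                ("time", wintypes.DWORD),
                ("dwExtraInfo", PUL)]

class _INPUTUNION(ctypes.Union):
    _fields_ = [("mi", MOUSEINPUT)]

class INPUT(ctypes.Structure):
    _fields_ = [("type", wintypes.DWORD),
                ("union", _INPUTUNION)]

def nudge_idle_timer():
    # a relative move of (0, 0): nothing visible happens on screen,
    # but the system's last-input timestamp should be refreshed
    extra = ctypes.c_ulong(0)
    mi = MOUSEINPUT(0, 0, 0, MOUSEEVENTF_MOVE, 0, ctypes.pointer(extra))
    inp = INPUT(INPUT_MOUSE, _INPUTUNION(mi=mi))
    ctypes.windll.user32.SendInput(1, ctypes.byref(inp), ctypes.sizeof(inp))

nudge_idle_timer() could then be called from the volume loop below whenever vol crosses the threshold.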
Sample code on Pyaudio to detect user voice by volume:
import pyaudio
from array import array

audio = pyaudio.PyAudio()
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
CHUNK = 1024

# recording prerequisites
stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

while True:
    data = stream.read(CHUNK)
    data_chunk = array('h', data)
    vol = max(data_chunk)
    if vol >= 500:
        # voice detected from mic
        print("talking - {}".format(vol))
    else:
        print("-")
Sample code for detecting user input:
# code to get inactivity
import ctypes
import time
from ctypes import Structure, WINFUNCTYPE, POINTER
from ctypes.wintypes import BOOL, UINT, DWORD

class LastInputInfo(Structure):
    _fields_ = [
        ("cbSize", UINT),
        ("dwTime", DWORD)
    ]

def _getLastInputTick() -> int:
    """
    retrieves the tick of the last input action
    :return: int
    """
    prototype = WINFUNCTYPE(BOOL, POINTER(LastInputInfo))
    paramflags = ((1, "lastinputinfo"), )
    c_GetLastInputInfo = prototype(("GetLastInputInfo", ctypes.windll.user32), paramflags)  # type: ignore
    l = LastInputInfo()
    l.cbSize = ctypes.sizeof(LastInputInfo)
    assert 0 != c_GetLastInputInfo(l)
    return l.dwTime

def _getTickCount() -> int:
    """
    :return: int
        current tick count
    """
    prototype = WINFUNCTYPE(DWORD)
    paramflags = ()
    c_GetTickCount = prototype(("GetTickCount", ctypes.windll.kernel32), paramflags)  # type: ignore
    return c_GetTickCount()

def seconds_since_last_input() -> float:
    """
    :return: float
        seconds since the last user input
    """
    return (_getTickCount() - _getLastInputTick()) / 1000

# inactivity in N seconds
inactive_seconds = 10
afk = False
while True:
    seconds_since_input = seconds_since_last_input()
    # becomes active
    if afk and seconds_since_input < inactive_seconds:
        afk = False
    # becomes afk
    elif not afk and seconds_since_input >= inactive_seconds:
        afk = True
    print("afk status: {}, seconds since last input: {}".format(afk, seconds_since_input))
    time.sleep(1)
If you want to do something without interrupting the user, you can use multithreading with threading. If you want to save something in a variable that every thread can use, you can use queue. This will run whatever you need to run in a different thread and save the result to a shared variable:

1. Import modules:
import threading
import queue
2. Create a shared variable:
shared_var = queue.Queue()
3. Create a function that checks what you want (in this case audio) and edits the shared variable with shared_var.put(item). (In this case, whenever audio is detected you could say audio_detected.put(True) and/or current_tick_count.put(tick_count), or something like that.)
4. Create a thread and pass in the function you made:
thread = threading.Thread(target=function, args=arguments)
where target is the function you want to call in this new thread, and args are the arguments you need to pass into your function.
5. Start the new thread:
thread.start()
6. On the main thread or a new thread, do what you want with that variable: shared_var.get() will wait until something is added to shared_var and then return what was added.
Example code:
import threading
import queue
import time

text = queue.Queue()

def change(text):
    time.sleep(3)
    text.put("hello world")

thread = threading.Thread(target=change, args=(text,))
#                                             ^ IMPORTANT! (,)
thread.start()

def display(text):
    text = text.get()  # This will wait till text has something inside and then return it
    print(text)

thread2 = threading.Thread(target=display, args=(text,))
#                                              ^ IMPORTANT! (,)
thread2.start()

input()  # To show it won't interrupt the user until the text has something
I am sorry if this answer isn't so clear. I'm not familiar with pyaudio and win32, but I do know threading and queue, so you can just work with this and add your code. If you want, you could edit the answer with your code in it.
I hope this helps!

Python: How to decode a mp3 chunk into PCM samples?

I'm trying to catch chunks of an mp3 webstream and decode them into PCM samples for signal processing. I tried to catch the audio via requests and io.BytesIO and to save the data as a .wav file.
I have to convert the mp3 data to wav data, but I don't know how. (My goal is not to record a .wav file; I am just doing this to test the algorithm.)
I found the pymedia lib, but it is very old (last commit in 2006), uses Python 2.7, and I could not install it.
Maybe it is possible with ffmpeg-python, but I have only seen examples using files as input and output.
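For the ffmpeg-python route, pipes can be used instead of files; a minimal sketch (assuming mp3_bytes holds a complete piece of the mp3 stream, decoded here to interleaved 16-bit stereo PCM at 44.1 kHz):

import ffmpeg
import numpy as np

def mp3_chunk_to_pcm(mp3_bytes):
    # feed mp3 bytes on stdin, read raw 16-bit PCM from stdout
    out, _ = (
        ffmpeg
        .input('pipe:', format='mp3')
        .output('pipe:', format='s16le', acodec='pcm_s16le', ac=2, ar=44100)
        .run(input=mp3_bytes, capture_stdout=True, capture_stderr=True)
    )
    return np.frombuffer(out, dtype=np.int16).reshape(-1, 2)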
Here's my code:
import requests
import io
import soundfile as sf
import struct
import wave
import numpy as np

def main():
    stream_url = r'http://dg-wdr-http-dus-dtag-cdn.cast.addradio.de/wdr/1live/diggi/mp3/128/stream.mp3'
    r = requests.get(stream_url, stream=True)
    sample_array = []
    try:
        for block in r.iter_content(1024):
            data, samplerate = sf.read(io.BytesIO(block), format="RAW",
                                       channels=2, samplerate=44100,
                                       subtype='FLOAT', dtype='float32')
            sample_array = np.append(sample_array, data)
    except KeyboardInterrupt:
        print("...saving")
        obj = wave.open('sounds/stream1.wav', 'w')
        obj.setnchannels(1)  # mono
        obj.setsampwidth(2)  # bytes
        obj.setframerate(44100)
        data_max = np.nanmax(abs(sample_array))
        # fill WAV with samples from sample_array
        for sample in sample_array:
            if (np.isnan(sample) or np.isnan(32760 * sample / data_max)) is True:
                continue
            try:
                value = int(32760 * sample / data_max)  # normalization INT16
            except ValueError:
                value = 1
            finally:
                data = struct.pack('<h', value)
                obj.writeframesraw(data)
        obj.close()
        print("end")

if __name__ == '__main__':
    main()
Do you have an idea how to handle this problem?
You are missing the decoding of the mp3 stream; you are just saving raw mp3 data into a wav container. You first need to decode the mp3 audio, which will give you PCM samples plus the audio info.
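As a minimal sketch of that (assuming the miniaudio package used in the other answers here, with a complete mp3 file already read into memory; decoding arbitrary mid-stream chunks may fail at frame boundaries):

import numpy as np
import miniaudio

with open('radio.mp3', 'rb') as f:  # hypothetical file
    mp3_bytes = f.read()

decoded = miniaudio.decode(mp3_bytes, nchannels=2, sample_rate=44100)
# decoded.samples is an array('h') of interleaved signed 16-bit PCM
pcm = np.asarray(decoded.samples, dtype=np.int16).reshape(-1, 2)
print(decoded.sample_rate, pcm.shape)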
With help from Irmen and his "miniaudio" and "synthesizer" libraries, I could solve the problem.
The problem was that most radio webstreams use the ICECAST protocol, which includes interleaved metadata information, so you can't decode the stream directly.
With the example script https://github.com/irmen/synthesizer/blob/master/examples/internetradio.py as a template, I could write a script which records a webstream until KeyboardInterrupt and saves it as a .wav file.
Here's the main part I edited:
...
def _audio_playback(self, pcm_stream):
    sample_array = None
    with Output(mixing="sequential", frames_per_chunk=44100 // 4) as output:
        print("begin recording")
        while self.decode_flag:
            try:
                audio = pcm_stream.read(44100 * 2 * 2 // 20)
                if not audio:
                    break
            except (IOError, ValueError):
                break
            else:
                sample = Sample.from_raw_frames(audio, 2, 44100, 2)
                if sample_array is None:
                    sample_array = sample.get_frames_numpy_float()
                else:
                    sample_array = np.append(sample_array, sample.get_frames_numpy_float(), axis=0)
    print("...saving")
    wavf.write(self.file_location, 44100, sample_array)
    print("saved")
...
Based on Bendzko's answer, here is my code:
pip install pyaudio miniaudio
import threading
import urllib.request
import time
try:
    import miniaudio
except ImportError:
    miniaudio = None
import pyaudio
import ctypes
import sys

CHUNK = 4096

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=2, rate=44100, output=True)

class RadioThread(threading.Thread):
    def run(self):
        self.url = "https://impradio.bytemasters.gr/8002/stream"
        # run in threading
        client = miniaudio.IceCastClient(self.url)
        pcm_stream = MiniaudioDecoderPcmStream(client.audio_format, client)
        self.audio_playback(pcm_stream)

    def audio_playback(self, pcm_stream):
        global stop_peradio_thread
        while not stop_peradio_thread:
            try:
                audio = pcm_stream.read(CHUNK)
                stream.write(audio.tobytes())
            except Exception:
                pass

class MiniaudioDecoderPcmStream(miniaudio.StreamableSource):
    def __init__(self, fmt, stream):
        self.pcm_stream = miniaudio.stream_any(stream, fmt, dither=miniaudio.DitherMode.TRIANGLE)

    def read(self, size):
        try:
            return self.pcm_stream.send(size)
        except StopIteration:
            return b""

def main():
    global stop_peradio_thread
    stop_peradio_thread = False
    t1 = RadioThread()
    t1.start()
    while True:
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            stop_peradio_thread = True
            t1.join()
            sys.exit()

main()

pyaudio - "Listen" until voice is detected and then record to a .wav file

I'm having some problems and I cannot seem to get my head around the concept.
What I am trying to do is this:
Have the microphone "listen" for voiced audio (above a particular threshold) and then start recording to a .wav file until the person has stopped speaking / the signal is no longer there. For example:
begin:
    listen() -> nothing is being said
    listen() -> nothing is being said
    listen() -> VOICED - _BEGIN RECORDING_
    listen() -> VOICED - _BEGIN RECORDING_
    listen() -> UNVOICED - _END RECORDING_
end
I also want to do this using threading, so a thread would be created that "listens" to the stream constantly, and a second thread would begin when there is voiced data. But I cannot for the life of me figure out how I should go about it. Here is my code so far:
import wave
import sys
import threading
from array import array
from sys import byteorder

try:
    import pyaudio
    CHECK_PYLIB = True
except ImportError:
    CHECK_PYLIB = False

class Audio:
    _chunk = 0.0
    _format = 0.0
    _channels = 0.0
    _rate = 0.0
    record_for = 0.0
    stream = None
    p = None
    sample_width = None
    THRESHOLD = 500

    # initial constructor to accept params
    def __init__(self, chunk, format, channels, rate):
        #### set data-types
        self._chunk = chunk
        self.format = pyaudio.paInt16
        self.channels = channels
        self.rate = rate
        self.p = pyaudio.PyAudio()

    def open(self):
        # print "opened"
        self.stream = self.p.open(format=pyaudio.paInt16,
                                  channels=2,
                                  rate=44100,
                                  input=True,
                                  frames_per_buffer=1024)
        return True

    def record(self):
        # create a new instance/thread to record the sound
        threading.Thread(target=self.listen).start()

    def is_silence(self, snd_data):
        return max(snd_data) < self.THRESHOLD

    def listen(self):
        r = array('h')
        while True:
            snd_data = array('h', self.stream.read(self._chunk))
            if byteorder == 'big':
                snd_data.byteswap()
            r.extend(snd_data)
        return self.sample_width, r
I'm guessing that I could record 5-second blocks and, if a block is deemed "voiced", the recording thread should run until all the voice data has been captured. However, because it is currently a while True: loop, I don't want to capture all of the audio up until there are voiced commands; e.g. given "no voice", "no voice", "voice", "voice", "no voice", "no voice", I just want the "voice" parts inside the wav file. Does anyone have any suggestions?
Thank you
EDIT:
import wave
import sys
import time
import threading
from array import array
from sys import byteorder
from Queue import Queue, Full

import pyaudio

CHUNK_SIZE = 1024
MIN_VOLUME = 500
BUF_MAX_SIZE = 1024 * 10
process_g = 0

def main():
    stopped = threading.Event()
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK_SIZE)))

    listen_t = threading.Thread(target=listen, args=(stopped, q))
    listen_t.start()
    process_g = threading.Thread(target=process, args=(stopped, q))
    process_g.start()

    try:
        while True:
            listen_t.join(0.1)
            process_g.join(0.1)
    except KeyboardInterrupt:
        stopped.set()

    listen_t.join()
    process_g.join()

def process(stopped, q):
    while True:
        if stopped.wait(timeout=0):
            break
        print "I'm processing.."
        time.sleep(300)

def listen(stopped, q):
    stream = pyaudio.PyAudio().open(
        format=pyaudio.paInt16,
        channels=2,
        rate=44100,
        input=True,
        frames_per_buffer=1024
    )
    while True:
        if stopped and stopped.wait(timeout=0):
            break
        try:
            print process_g
            for i in range(0, int(44100 / 1024 * 5)):
                data_chunk = array('h', stream.read(CHUNK_SIZE))
                vol = max(data_chunk)
                if vol >= MIN_VOLUME:
                    print "WORDS.."
                else:
                    print "Nothing.."
        except Full:
            pass

if __name__ == '__main__':
    main()
Now, after every 5 seconds, I need the "process" function to execute, which should process the data (time.sleep(10) while it does this) and then start the recording back up.
Having spent some time on it, I've come up with the following code that seems to be doing what you need, except for writing to file:
import threading
from array import array
from Queue import Queue, Full

import pyaudio

CHUNK_SIZE = 1024
MIN_VOLUME = 500
# if the recording thread can't consume fast enough, the listener will start discarding
BUF_MAX_SIZE = CHUNK_SIZE * 10

def main():
    stopped = threading.Event()
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK_SIZE)))

    listen_t = threading.Thread(target=listen, args=(stopped, q))
    listen_t.start()
    record_t = threading.Thread(target=record, args=(stopped, q))
    record_t.start()

    try:
        while True:
            listen_t.join(0.1)
            record_t.join(0.1)
    except KeyboardInterrupt:
        stopped.set()

    listen_t.join()
    record_t.join()

def record(stopped, q):
    while True:
        if stopped.wait(timeout=0):
            break
        chunk = q.get()
        vol = max(chunk)
        if vol >= MIN_VOLUME:
            # TODO: write to file
            print "O",
        else:
            print "-",

def listen(stopped, q):
    stream = pyaudio.PyAudio().open(
        format=pyaudio.paInt16,
        channels=2,
        rate=44100,
        input=True,
        frames_per_buffer=1024,
    )

    while True:
        if stopped.wait(timeout=0):
            break
        try:
            q.put(array('h', stream.read(CHUNK_SIZE)))
        except Full:
            pass  # discard

if __name__ == '__main__':
    main()
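For the TODO above, a minimal sketch of the file-writing part with the standard wave module (the file name is made up; chunk is the array('h') taken from the queue in record()):

import wave

wav = wave.open('voiced.wav', 'wb')
wav.setnchannels(2)   # the listener opens the stream with 2 channels
wav.setsampwidth(2)   # paInt16 -> 2 bytes per sample
wav.setframerate(44100)

def write_chunk(wav_file, chunk):
    # array('h') -> raw bytes (tostring() on Python 2, tobytes() on Python 3)
    wav_file.writeframes(chunk.tostring())

Remember to call wav.close() once recording stops, otherwise the wav header will not be finalized.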
Look here:
https://github.com/jeysonmc/python-google-speech-scripts/blob/master/stt_google.py
It even converts wav to flac and sends it to the Google Speech API; just delete the stt_google_wav function if you don't need it ;)
