pyttsx3 makes pauses every 82 words - python

I am learning to use the pyttsx3 module.
I want to produce continuous speech without any pauses, so I deleted every '.' and ',', but pauses still appear in what I thought were random places. I dug deeper and figured out that it makes a pause every 82 (+/- 1) words.
Any idea how to fix it?
Here is my code:
import pyttsx3

# Use a raw string so the backslashes in the Windows path aren't treated as escapes
with open(r'D:\D-Chilldom\QuickMovieRecap\Movie.txt', 'r') as f:
    text = f.read()

text = text.replace('.', '')
text = text.replace(',', '')

# Initialize the TTS engine
engine = pyttsx3.init()

# Set the rate of speech (words per minute)
rate = 250
engine.setProperty('rate', rate)

# Set the volume of the voice
volume = 1
engine.setProperty('volume', volume)

# Set the voice to use
voice_id = "com.apple.speech.synthesis.voice.samantha"
engine.setProperty('voice', voice_id)

engine.save_to_file(text, 'D:/D-Chilldom/QuickMovieRecap/rec/zzzz.mp3')
engine.runAndWait()

Related

Change language of text to speech

I want to change the voice of Azure from Python, with these characteristics:
languageCode = 'es-MX'
ssmlGender = 'FEMALE'
voiceName = 'es-MX-DaliaNeural'
but I'm new to Azure so I don't know how. This is my code:
import PyPDF2
import azure.cognitiveservices.speech as sdk

key = "fake key"
region = "fake region"
config = sdk.SpeechConfig(subscription=key, region=region)
synthesizer = sdk.SpeechSynthesizer(speech_config=config)

book = open("prueba.pdf", "rb")
reader = PyPDF2.PdfFileReader(book)
for num in range(0, reader.numPages):
    text = reader.getPage(num).extractText()
    result = synthesizer.speak_text_async(text).get()
According to the documentation https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-speech-synthesis?tabs=browserjs%2Cterminal&pivots=programming-language-python#select-synthesis-language-and-voice you should be able to do:
config.speech_synthesis_language = "es-MX"
config.speech_synthesis_voice_name = "es-MX-DaliaNeural"
The list of voices is here https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=tts

How to display text on the screen as it is said over the audio

As a personal project, I decided to create one of those Reddit text-to-speech bots.
I pulled all the data from Reddit with praw:
import praw, random

def scrapeData(subredditName):
    # Instantiate praw
    reddit = praw.Reddit()
    # Get subreddit
    subreddit = reddit.subreddit(subredditName)
    # Get a bunch of posts and convert them into a list
    posts = list(subreddit.new(limit=100))
    # Pick a random index, bounded by the list length (which may be under 100)
    randomNumber = random.randint(0, len(posts) - 1)
    # Store the post's title and description in variables
    postTitle = posts[randomNumber].title
    postDesc = posts[randomNumber].selftext
    return postTitle + " " + postDesc
Then, I converted it to speech stored in an .mp3 file with Google Cloud Text-to-Speech:
from google.cloud import texttospeech

def convertTextToSpeech(textString):
    # Instantiate the TTS client (from_service_account_json is a classmethod,
    # so it is called on the class rather than on an instance)
    client = texttospeech.TextToSpeechClient.from_service_account_json("path/to/json")
    # Set the text input to be synthesized
    synthesisInput = texttospeech.SynthesisInput(text=textString)
    # Build the voice request
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.MALE)
    # Select the type of audio file
    audioConfig = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3)
    # Perform the TTS request on the text input
    response = client.synthesize_speech(
        input=synthesisInput, voice=voice, audio_config=audioConfig)
    # Write the binary audio content to an .mp3 file
    with open("output.mp3", "wb") as out:
        out.write(response.audio_content)
I've created an .mp4 with moviepy that has generic footage in the background with the audio synced over it:
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip

# Get video and audio source files
clip = VideoFileClip("background.mp4").subclip(20, 30)
audio = AudioFileClip("output.mp3").subclip(0, 10)
# Set audio and create the final video
videoClip = clip.set_audio(audio)
videoClip.write_videofile("output.mp4")
but my issue is that I can't find a way to display only the current word or sentence on screen as a subtitle, rather than the entire post.

How to integrate Azure text to speech with Streamlit?

I am trying to integrate Azure text to speech with Streamlit.
import azure.cognitiveservices.speech as speechsdk
import streamlit as st

st.title("Let's learn Math!")

def recognize_from_microphone():
    speech_config = speechsdk.SpeechConfig(subscription="YourSubscriptionKey", region="australiaeast")
    speech_config.speech_recognition_language = "en-US"
    # To recognize speech from an audio file, use `filename` instead of `use_default_microphone`:
    # audio_config = speechsdk.audio.AudioConfig(filename="YourAudioFile.wav")
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    st.text("Speak into your microphone.")
    speech_recognition_result = speech_recognizer.recognize_once_async().get()
    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        st.text("Recognized: {}".format(speech_recognition_result.text))
    elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
        st.text("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
    elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_recognition_result.cancellation_details
        st.text("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            st.text("Error details: {}".format(cancellation_details.error_details))
            st.text("Did you set the speech resource key and region values?")

text = st.text_input("Enter text", value="Hi", max_chars=5)

def audio_output(text):
    speech_config = speechsdk.SpeechConfig(subscription="YourSubscriptionKey", region="australiaeast")
    audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
    # The language of the voice that speaks.
    speech_config.speech_synthesis_voice_name = 'en-US-JennyNeural'
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    st.write("Enter some text that you want to speak >")
    speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
    if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        st.write("Speech synthesized for text [{}]".format(text))
    elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_synthesis_result.cancellation_details
        st.write("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                st.write("Error details: {}".format(cancellation_details.error_details))
            st.write("Did you set the speech resource key and region values?")

recognize_from_microphone()
audio_output(text)
This is my code, but Streamlit is not loading the functions at all. Is there a fix? I am new to Streamlit and Azure.
You declared the functions but didn't call them; add calls such as recognize_from_microphone() and audio_output(text) at the end of the script.

How to change the volume of stem files while playing using Python

I'm attempting to write a Python project that plays multiple parts of a song at the same time.
For background information, a song is split into "stems", and each stem is played simultaneously to recreate the full song. What I am trying to achieve is using potentiometers to control the volume of each stem, so that the user can mix songs differently. For a commercial comparison, Kanye West's Stem Player is what I am trying to replicate.
I can change the volume of the overlaid song at the end, but what I want is to change the volume of each stem with a potentiometer while the song is playing. Is this even possible using pydub? Below is the code I have right now.
from pydub import AudioSegment
from pydub.playback import play

vocals = AudioSegment.from_file("walkin_vocals.mp3")
drums = AudioSegment.from_file("walkin_drums.mp3")
bass = AudioSegment.from_file("walkin_bass.mp3")

# Layer the stems on top of each other to rebuild the full song
vocalsDrums = vocals.overlay(drums)
bassVocalsDrums = vocalsDrums.overlay(bass)

# Reduce the gain of the whole mix by 20 dB
songQuiet = bassVocalsDrums - 20
play(songQuiet)
Solved: I ended up using pyaudio instead of pydub.
With pyaudio, I was able to define a custom stream_callback function. Within this callback, I multiply each stem by a volume modifier, then sum the stems into one output buffer.
def callback(in_data, frame_count, time_info, status):
    global drumsMod, vocalsMod, bassMod, otherMod
    # Read the next block of frames from each stem
    drums = drumsWF.readframes(frame_count)
    vocals = vocalsWF.readframes(frame_count)
    bass = bassWF.readframes(frame_count)
    other = otherWF.readframes(frame_count)
    # Decode the raw bytes into 16-bit integer samples
    decodedDrums = numpy.frombuffer(drums, numpy.int16)
    decodedVocals = numpy.frombuffer(vocals, numpy.int16)
    decodedBass = numpy.frombuffer(bass, numpy.int16)
    decodedOther = numpy.frombuffer(other, numpy.int16)
    # Scale each stem by its volume modifier and mix into a single buffer
    newdata = (decodedDrums*drumsMod + decodedVocals*vocalsMod + decodedBass*bassMod + decodedOther*otherMod).astype(numpy.int16)
    return (newdata.tobytes(), pyaudio.paContinue)

DialoGPT output using pyttsx3?

I want to use DialoGPT to have a conversation with me using a microphone and speaker.
However, before I get there, I want to begin by connecting this chatbot to a speaker using pyttsx3. I am able to use pyttsx3 to produce sound from my computer speakers with pre-inserted text (i.e. "hello world").
I am also able to use DialoGPT to have a conversation in the terminal.
Therein lies the first issue: connecting the two. I have pasted my current code below and was wondering if someone could offer me some assistance.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pyttsx3

engine = pyttsx3.init()
# Set properties before adding things to say
# Set speed percent (can be more than 100)
engine.setProperty('rate', 190)
# Set volume 0-1
engine.setProperty('volume', 0.7)
# Set ID of voice (a female voice)
voice_id = "com.apple.speech.synthesis.voice.karen"
engine.setProperty('voice', voice_id)

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in PyTorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
    # generate a response while limiting the total chat history to 1000 tokens
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    # print the last output tokens from the bot
    engine.say("DialoGPT: {}").format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))
Solved~
import pyttsx3

engine = pyttsx3.init()
# Set properties before adding things to say
# Set speed percent (can be more than 100)
engine.setProperty('rate', 190)
# Set volume 0-1
engine.setProperty('volume', 0.7)
# Set ID of voice (a female voice)
voice_id = "com.apple.speech.synthesis.voice.karen"
engine.setProperty('voice', voice_id)

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

# Let's chat for 3 lines
for step in range(3):
    # encode the new user input, add the eos_token and return a tensor in PyTorch
    new_user_input_ids = tokenizer.encode(input("User:") + tokenizer.eos_token, return_tensors='pt')
    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
    # generate a response while limiting the total chat history to 1000 tokens
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    # decode the last output tokens once, then print and speak them
    # (the variable is named reply rather than str, which would shadow the builtin)
    reply = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print("DialoGPT: {}".format(reply))
    engine.say(reply)
    engine.runAndWait()
