DialoGPT output using pyttsx3? - python

I want to use DialoGPT to have a conversation with me using a microphone and speaker.
However, before I get there, I want to begin by somehow connecting this chat bot to a speaker using pyttsx3. I am able to properly use pyttsx3 to produce sound out of my computer speakers using pre inserted text (i.e. "hello world").
I am also able to use DialoGPT to have a conversation using the terminal.
Therein lies the first issue of connecting the two. I have pasted my current code and was wondering if someone could offer me some assistance.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pyttsx3
engine = pyttsx3.init()
# Set properties before adding
# Things to say
# Sets speed percent
# Can be more than 100
engine.setProperty('rate',190)
# Set volume 0-1
engine.setProperty('volume', 0.7)
# Set ID of voice
voice_id = "com.apple.speech.synthesis.voice.karen"
# Use female voice
engine.setProperty('voice', voice_id)
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")
# Let's chat for 5 lines
for step in range(5):
# encode the new user input, add the eos_token and return a tensor in Pytorch
new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
# append the new user input tokens to the chat history
bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
# generated a response while limiting the total chat history to 1000 tokens,
chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
# pretty print last ouput tokens from bot
engine.say("DialoGPT: {}").format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))

Solved~
import pyttsx3
engine = pyttsx3.init()
# Set properties before adding
# Things to say
# Sets speed percent
# Can be more than 100
engine.setProperty('rate',190)
# Set volume 0-1
engine.setProperty('volume', 0.7)
# Set ID of voice
voice_id = "com.apple.speech.synthesis.voice.karen"
# Use female voice
engine.setProperty('voice', voice_id)
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")
# Let's chat for 3 lines
for step in range(3):
# encode the new user input, add the eos_token and return a tensor in Pytorch
new_user_input_ids = tokenizer.encode(input("User:") + tokenizer.eos_token, return_tensors='pt')
# append the new user input tokens to the chat history
bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
# generated a response while limiting the total chat history to 1000 tokens,
chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
# pretty print last ouput tokens from bot
str = ("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))
engine.say(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))
print(str)
engine.runAndWait()

Related

Change languag of text to speech

I want to change the voice of azure from python, with these characteristics
languageCode = 'es‑MX'
ssmlGender = 'FEMALE'
voicName = 'es‑MX‑DaliaNeural'
but i'm new to azure so i don't know how, this is my code:
import PyPDF2
import azure.cognitiveservices.speech as sdk
key = "fake key"
region = "fake region"
config = sdk.SpeechConfig(subscription=key, region=region)
synthesizer = sdk.SpeechSynthesizer(speech_config=config)
book = open("prueba.pdf", "rb")
reader = PyPDF2.PdfFileReader(book)
for num in range(0,reader.numPages):
text = reader.getPage(num).extractText()
result = synthesizer.speak_text_async(text).get()
Acording to the documentation https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-speech-synthesis?tabs=browserjs%2Cterminal&pivots=programming-language-python#select-synthesis-language-and-voice you should be able to do:
config.speech_synthesis_language = "es‑MX"
config.speech_synthesis_voice_name ="es-MX-DaliaNeural"
The list of voices is here https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=tts

pyttsx3 makes pauses evere 82 words

I am learning to use pyttsx3 module.
I want to make a continuous speech without any pauses so I deleted all '.' and ',', but pauses are created in what I thought were random places. So dug deeper and figured that it makes a pause every 82 (+/-1 words).
Any idea how to fix it?
Here is my code:
import pyttsx3
with open('D:\D-Chilldom\QuickMovieRecap\Movie.txt', 'r') as f:
text = f.read()
text = text.replace('.', '')
text = text.replace(',', '')
# Initialize the TTS engine
engine = pyttsx3.init()
# Set the rate of speech (words per minute)
rate = 250
engine.setProperty('rate', rate)
# Set the volume of the voice
volume = 1
engine.setProperty('volume', volume)
# Set the voice to use
voice_id = "com.apple.speech.synthesis.voice.samantha"
engine.setProperty('voice', voice_id)
engine.save_to_file(text, 'D:/D-Chilldom/QuickMovieRecap/rec/zzzz.mp3')
engine.runAndWait()

How to display text on the screen it is said over the audio

As a personal project, I decided to create one of the reddit text-to-speech bot.
I pulled all the data from reddit with praw
import praw, random
def scrapeData(subredditName):
# Instantiate praw
reddit = praw.Reddit()
# Get subreddit
subreddit = reddit.subreddit(subredditName)
# Get a bunch of posts and convert them into a list
posts = list(subreddit.new(limit=100))
# Get random number
randomNumber = random.randint(0, 100)
# Store post's title and description in variables
postTitle = posts[randomNumber].title
postDesc = posts[randomNumber].selftext
return postTitle + " " + postDesc
Then, I converted it to speech stored in a .mp3 file with gTTS.
from google.cloud import texttospeech
def convertTextToSpeech(textString):
# Instantiate TTS
client = texttospeech.TextToSpeechClient().from_service_account_json("path/to/json")
# Set text input to be synthesized
synthesisInput = texttospeech.SynthesisInput(text=textString)
# Build the voice request
voice = texttospeech.VoiceSelectionParams(language_code = "en-us",
ssml_gender = texttospeech.SsmlVoiceGender.MALE)
# Select the type of audio file
audioConfig = texttospeech.AudioConfig(audio_encoding =
texttospeech.AudioEncoding.MP3)
# Perform the TTS request on the text input
response = client.synthesize_speech(input = synthesisInput, voice =
voice, audio_config= audioConfig)
# Convert from binary to mp3
with open("output.mp3", "wb") as out:
out.write(response.audio_content)
I've created an .mp4 with moviepy that has generic footage in the background with the audio synced over it,
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip
# get vide and audio source files
clip = VideoFileClip("background.mp4").subclip(20,30)
audio = AudioFileClip("output.mp3").subclip(0, 10)
# Set audio and create final video
videoClip = clip.set_audio(audio)
videoClip.write_videofile("output.mp4")
but my issue is I can't find a way to have only the current word or sentence displayed on screen as a subtitle, rather than the entire post.

How to change volume of stem files while playing using python

I'm attempting to write a python project that plays multiple parts of a song at the same time.
For background information, a song is split into "stems", and then each stem is played simultaneously to recreate the full song. What I am trying to achieve is using potentiometers to control the volume of each stem, so that the user can mix songs differently. For a product relation, the StemPlayer from Kanye West is what I am trying to achieve.
I can change the volume of the overlayed song at the end, but what I want to do is change the volume of each stem using a potentiometer while the song is playing. Is this even possible using pyDub? Below is the code I have right now.
from pydub import AudioSegment
from pydub.playback import play
vocals = AudioSegment.from_file("walkin_vocals.mp3")
drums = AudioSegment.from_file("walkin_drums.mp3")
bass = AudioSegment.from_file("walkin_bass.mp3")
vocalsDrums = vocals.overlay(drums)
bassVocalsDrums = vocalsDrums.overlay(bass)
songQuiet = bassVocalsDrums - 20
play(songQuiet)
Solved this question, I ended up using pyaudio instead of pydub.
With pyaudio, I was able to define a custom stream_callback function. Within this callback function, I multiply each stem by a modifier, then add each stem to one audio output.
def callback(in_data, frame_count, time_info, status):
global drumsMod, vocalsMod, bassMod, otherMod
drums = drumsWF.readframes(frame_count)
vocals = vocalsWF.readframes(frame_count)
bass = bassWF.readframes(frame_count)
other = otherWF.readframes(frame_count)
decodedDrums = numpy.frombuffer(drums, numpy.int16)
decodedVocals = numpy.frombuffer(vocals, numpy.int16)
decodedBass = numpy.frombuffer(bass, numpy.int16)
decodedOther = numpy.frombuffer(other, numpy.int16)
newdata = (decodedDrums*drumsMod + decodedVocals*vocalsMod + decodedBass*bassMod + decodedOther*otherMod).astype(numpy.int16)
return (newdata.tobytes(), pyaudio.paContinue)

Is there a preferred method of saving and loading h2o word2vec models in python?

I have trained a word2vec model in the python h2o package.
Is there a simple way for me to save that word2vec model and load it back later for use?
I have tried the h2o.save_model() and h2o.load_model() functions with no luck.
I get an error using that approach like
ERROR: Unexpected HTTP Status code: 412 Precondition Failed (url = http://localhost:54321/99/Models.bin/)
water.exceptions.H2OIllegalArgumentException
[1] "water.exceptions.H2OIllegalArgumentException: Illegal argument: dir of function: importModel:
I am using the same version of h2o to train and load the model back in so the issue outlined in this question is not applicable Can't import binay h2o model with h2o.loadModel() function: 412 Precondition Failed
Any one with any insights on how to save and load an h2o word2vec model?
My sample code with a few of the important snippets
import h2o
from h2o.estimators import H2OWord2vecEstimator
df['text'] = df['text'].ascharacter()
# Break text into sequence of words
words = tokenize(df["text"])
# Initializing h2o
print('Initializing h2o.')
h2o.init(ip=h2o_ip, port=h2o_port, min_mem_size=h2o_min_memory)
# Build word2vec model:
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame=words)
# Calculate a vector for each row
word_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")
#Save model to path
wv_path = '/models/wordvec/'
model_path = h2o.save_model(model = w2v_model, path= wv_path ,force=True)
# Load model in later script
w2v_model = h2o.load_model(model_path)
It sounds like you might have an access issue with the directory you trying to read from. I just tested on H2O 3.30.0.1 following the w2v example from docs and it ran fine:
job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
col_names = ["category", "jobtitle"],
col_types = ["string", "string"],
header = 1)
STOP_WORDS = ["ax","i","you","edu","s","t","m","subject","can",
"lines","re","what","there","all","we","one","the",
"a","an","of","or","in","for","by","on","but","is",
"in","a","not","with","as","was","if","they","are",
"this","and","it","have","from","at","my","be","by",
"not","that","to","from","com","org","like","likes",
"so"]
# Make the 'tokenize' function:
def tokenize(sentences, stop_word = STOP_WORDS):
tokenized = sentences.tokenize("\\W+")
tokenized_lower = tokenized.tolower()
tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
return tokenized_words
# Break job titles into a sequence of words:
words = tokenize(job_titles["jobtitle"])
# Build word2vec model:
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame=words)
#Save model
wv_path = 'models/'
model_path = h2o.save_model(model = w2v_model, path= wv_path ,force=True)
#Load Model
w2v_model2 = h2o.load_model(model_path)

Categories

Resources