I am attempting to use the speech recognition toolkit VOSK and the speech diarization package Resemblyzer to transcibe audio and then identify the speakers in the audio.
Tools:
https://github.com/alphacep/vosk-api
https://github.com/resemble-ai/Resemblyzer
I can do both things individually but run into issues when trying to do them when running the one python script.
I used the following guide when setting up the diarization system:
https://medium.com/saarthi-ai/who-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279
Computer specs are as follows:
Intel(R) Core(TM) i3-7100 CPU # 3.90GHz, 3912 Mhz, 2 Core(s), 4 Logical Processor(s)
32GB RAM
The following is my code, I am not to sure if using threading is appropriate or if I even implemented it correctly, how can I best optimize this code as to achieve the results I am looking for and not crash.
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import json
import sys
import os
import subprocess
import datetime
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
from resemblyzer.hparams import sampling_rate
from spectralcluster import SpectralClusterer
import threading
import queue
import gc
def recognition(queue, audio, FRAME_RATE):
model = Model("Vosk_Models/vosk-model-small-en-us-0.15")
rec = KaldiRecognizer(model, FRAME_RATE)
rec.SetWords(True)
rec.AcceptWaveform(audio.raw_data)
result = rec.Result()
transcript = json.loads(result)#["text"]
#return transcript
queue.put(transcript)
def diarization(queue, audio):
wav = preprocess_wav(audio)
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
print(cont_embeds.shape)
clusterer = SpectralClusterer(
min_clusters=2,
max_clusters=100,
p_percentile=0.90,
gaussian_blur_sigma=1)
labels = clusterer.predict(cont_embeds)
def create_labelling(labels, wav_splits):
times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
labelling = []
start_time = 0
for i, time in enumerate(times):
if i > 0 and labels[i] != labels[i - 1]:
temp = [str(labels[i - 1]), start_time, time]
labelling.append(tuple(temp))
start_time = time
if i == len(times) - 1:
temp = [str(labels[i]), start_time, time]
labelling.append(tuple(temp))
return labelling
#return
labelling = create_labelling(labels, wav_splits)
queue.put(labelling)
def identify_speaker(queue1, queue2):
transcript = queue1.get()
labelling = queue2.get()
for speaker in labelling:
speakerID = speaker[0]
speakerStart = speaker[1]
speakerEnd = speaker[2]
result = transcript['result']
words = [r['word'] for r in result if speakerStart < r['start'] < speakerEnd]
#return
print("Speaker",speakerID,":",' '.join(words), "\n")
def main():
queue1 = queue.Queue()
queue2 = queue.Queue()
FRAME_RATE = 16000
CHANNELS = 1
podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
podcast = podcast.set_channels(CHANNELS)
podcast = podcast.set_frame_rate(FRAME_RATE)
first_thread = threading.Thread(target=recognition, args=(queue1, podcast, FRAME_RATE))
second_thread = threading.Thread(target=diarization, args=(queue2, podcast))
third_thread = threading.Thread(target=identify_speaker, args=(queue1, queue2))
first_thread.start()
first_thread.join()
gc.collect()
second_thread.start()
second_thread.join()
gc.collect()
third_thread.start()
third_thread.join()
gc.collect()
# transcript = recognition(podcast,FRAME_RATE)
#
# labelling = diarization(podcast)
#
# print(identify_speaker(transcript, labelling))
if __name__ == '__main__':
main()
When I say crash I mean everything freezes, I have to hold down the power button on the desktop and turn it back on again. No blue/blank screen, just frozen in my IDE looking at my code. Any help in resolving this issue would be greatly appreciated.
Pydubs AudioSegment was not returning a suitable type for the Resembylzer function preprocess_wav.
podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
preprocess_wav instead requires a Numpy Array / Path.
audio_file_path = 'Podcast_Audio/WAV-Film-Release-Clip.wav'
wav_fpath = Path(audio_file_path)
wav = preprocess_wav(wav_fpath)
Additionally preprocess_wav functionality can be achieved using Librosa if desired.
import librosa
def preprocess_wav(waveform, sr):
waveform = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
waveform = waveform.astype(np.float32) / np.max(np.abs(waveform))
return waveform
waveform, sr = librosa.load('Podcast_Audio/WAV-Film-Release-Clip.wav')
wav = preprocess_wav(waveform, sr)
Related
Detect beep sound from Audio file using python
this code I found from somewhere but it is not giving the actual beep result
means in the audio where beep is not there then also this code is showing beep in it
from moviepy.editor import *
import matplotlib.pyplot as plt
import cv2
#from time import sleep
import sounddevice as sd
from scipy.io import wavfile
import numpy as np
filename = 'C:/Users/YahyaSirguroh/Downloads/output.mp4'
video = VideoFileClip(filename)
audio = video.audio
duration = video.duration
audio.write_audiofile("audio.wav")
#sleep(0.3)
samplerate, data = wavfile.read('audio.wav')
step = 30
audio_signal = []
cnt = 0
flag = 0
text = ''
for t in range(int(duration*step)):
t = t/step
if cnt:
flag+=1
if t > audio.duration or t > video.duration: break
audio_frame = audio.get_frame(t) #numpy array representing mono/stereo values
audio_signal.extend(list(audio_frame))
if (audio_frame>0.6).sum()==2:
cnt+=1
if cnt>=2:
print('beep detected at %5.2f' %(t))
text = 'beep detected at %d' %(np.round(t))
if flag>=4:
cnt=0
flag=0
I am using streamlit
I want to prepare audio dataset, so for that I have a .csv file which contains n numbers of lines which should be spoken by a user and voice of that user should be saved in a folder
Structure of CSV FILE
Sr#
Word
1
Hello
2
World
this file should be loaded and one-by-one all the words should be shown and below that word user should record sound and then press save button to save that audio and then next word appear. I have written the code but it is not working properly
import re
from turtle import onclick
from zlib import DEFLATED
from requests import session
from streamlit_option_menu import option_menu
import streamlit as st
import pandas as pd
import os
import sounddevice as sd
import wavio
def record(duration=5, fs=48000):
sd.default.samplerate = fs
sd.default.channels = 1
myrecording = sd.rec(int(duration * fs))
sd.wait(duration)
return myrecording
def save_record(path_myrecording, myrecording, fs):
wavio.write(path_myrecording, myrecording, fs, sampwidth=2)
return None
def read_audio(file):
with open(file, "rb") as audio_file:
audio_bytes = audio_file.read()
return audio_bytes
st.header("Data Pipeline")
with st.sidebar:
choose = option_menu("Modules", ["1. Recording","2. Preprocess Data","3. Create
Dataset"],
icons=['headset', 'mic', 'clipboard'],
default_index=0)
my_list=['a','b']
if choose=="1. Recording":
st.header("Voice Recording")
st.text("Follow the following steps")
s_name=''
s_name= st.text_input("1. Enter Your Name")
if os.path.isdir("./Recordings/"+s_name):
pass
else:
os.mkdir("./Recordings/"+s_name)
file =st.file_uploader("2. Upload an csv file", type=
['csv'],accept_multiple_files=False,key='file')
# if file:
# df= pd.read_csv(file)
# for i in df['Urdu']:
# my_list.append(i)
show_next = st.button("next", disabled = False)
if "current_index" not in st.session_state:
st.session_state.current_index = 0
st.session_state.load_state=True
# Whenever someone clicks on the button
if show_next and st.session_state.load_state :
# Show next element in list
st.write(my_list[st.session_state.current_index])
Record=st.button('Record', key='rec')
if st.session_state.get('rec')!=True:
st.session_state['rec']=True
if st.session_state['rec']==True:
print('i am here')
record_state = st.text("Recording...")
duration = 1 # seconds
fs = 48000
myrecording = record(duration, fs)
path_myrecording = f"./temp/AMAD.wav"
save_record(path_myrecording, myrecording, fs)
st.audio(read_audio(path_myrecording))
path="./Recordings/"+s_name+"/1.wav"
os.remove(path_myrecording)
save_record(path, myrecording, fs)
st.success("Recording Saved")
# Update and store the index
st.session_state.current_index += 1
if len(my_list) == st.session_state.current_index :
st.session_state.load_state = False
st.session_state
I am using this library midi2img to generate midi from images
From this library, this is the file am using:
from PIL import Image
import numpy as np
from music21 import instrument, note, chord, stream
lowerBoundNote = 21
def column2notes(column):
notes = []
for i in range(len(column)):
if column[i] > 255/2:
notes.append(i+lowerBoundNote)
return notes
resolution = 0.25
def updateNotes(newNotes,prevNotes):
res = {}
for note in newNotes:
if note in prevNotes:
res[note] = prevNotes[note] + resolution
else:
res[note] = resolution
return res
def image2midi(image_path):
with Image.open(image_path) as image:
im_arr = np.fromstring(image.tobytes(), dtype=np.uint8)
try:
im_arr = im_arr.reshape((image.size[1], image.size[0]))
except:
im_arr = im_arr.reshape((image.size[1], image.size[0],3))
im_arr = np.dot(im_arr, [0.33, 0.33, 0.33])
""" convert the output from the prediction to notes and create a midi file
from the notes """
offset = 0
output_notes = []
# create note and chord objects based on the values generated by the model
prev_notes = updateNotes(im_arr.T[0,:],{})
for column in im_arr.T[1:,:]:
notes = column2notes(column)
# pattern is a chord
notes_in_chord = notes
old_notes = prev_notes.keys()
for old_note in old_notes:
if not old_note in notes_in_chord:
new_note = note.Note(old_note,quarterLength=prev_notes[old_note])
new_note.storedInstrument = instrument.Piano()
if offset - prev_notes[old_note] >= 0:
new_note.offset = offset - prev_notes[old_note]
output_notes.append(new_note)
elif offset == 0:
new_note.offset = offset
output_notes.append(new_note)
else:
print(offset,prev_notes[old_note],old_note)
prev_notes = updateNotes(notes_in_chord,prev_notes)
# increase offset each iteration so that notes do not stack
offset += resolution
for old_note in prev_notes.keys():
new_note = note.Note(old_note,quarterLength=prev_notes[old_note])
new_note.storedInstrument = instrument.Piano()
new_note.offset = offset - prev_notes[old_note]
output_notes.append(new_note)
prev_notes = updateNotes(notes_in_chord,prev_notes)
midi_stream = stream.Stream(output_notes)
midi_stream.write('midi', fp=image_path.split("/")[-1].replace(".jpeg",".mid"))
import sys
image_path = sys.argv[1]
image2midi(image_path)
and this is the code I execute in terminal to geneate midi from image:
python img2midi.py samples/image.png
I need to make the above code to loop over all input images I put inside samples folder and generate midi for each, not just one file at a time.
Any help would be much appreciated.
You can do this by getting list of images from directory and iterate over them.
import sys
import os
sample_folder_path = sys.argv[1]
images = os.listdir(sample_folder_path) # getting all images stored in sample folder
images_path = [os.path.abspath(f"{sample_folder_path}/{image}") for image in images] # gets absolute path for all images
for image_path in images_path:
image2midi(image_path)
Usage:
python img2midi.py folder_path
Here folder_path is the path of folder that contains images. It can
I have 1,440 audio files to feed into a neural network. The problem is that they are not all the same length. I used the answer posted on:
Adding silent frame to wav file using python
but it doesn't seem to work. I wanted to add a few seconds of silence to the end of my files, and then trim them to all be 5 seconds long. Can someone please help me with this?
(I also tried using pysox, but that gives me the This install of SoX cannot process .wav files. error.)
I am using Google Colab for this. The code is:
import wave, os, glob
from pydub import AudioSegment
from pydub.playback import play
path = 'drive/MyDrive/Ravdess/Sad' #This is the folder from my Google Drive which has the audio files
count = 0
for filename in glob.glob(os.path.join(path, '*.wav')):
w = wave.open(filename, 'r')
d = w.readframes(w.getnframes())
frames = w.getnframes()
rate = w.getframerate()
duration = frames/float(rate)
count+=1
print(filename, "count =", count, "duration = ", duration)
audio_in_file = filename
audio_out_file = "out.wav"
new_duration = duration
#Only append silence until time = 5 seconds.
one_sec = AudioSegment.silent(duration=2000) #duration in milliseconds
song = AudioSegment.from_wav(audio_in_file)
final_song = one_sec + song
new_frames = w.getnframes()
new_rate = w.getframerate()
new_duration = new_frames/float(rate)
final_song.export(audio_out_file, format="wav")
print(final_song, "count =", count, "new duration = ", new_duration)
w.close()
This gives the output:
drive/MyDrive/Ravdess/Sad/03-01-04-01-02-01-01.wav count = 1 duration = 3.5035
<pydub.audio_segment.AudioSegment object at 0x7fd5b7ca06a0> count = 1 new duration = 3.5035
drive/MyDrive/Ravdess/Sad/03-01-04-01-02-02-01.wav count = 2 duration = 3.370041666666667
<pydub.audio_segment.AudioSegment object at 0x7fd5b7cbc860> count = 2 new duration = 3.370041666666667
... (and so on for all the files)
Since you are already using pydub, I'd do something like this:
from pydub import AudioSegment
from pydub.playback import play
input_wav_file = "/path/to/input.wav"
output_wav_file = "/path/to/output.wav"
target_wav_time = 5 * 1000 # 5 seconds (or 5000 milliseconds)
original_segment = AudioSegment.from_wav(input_wav_file)
silence_duration = target_wav_time - len(original_segment)
silenced_segment = AudioSegment.silent(duration=silence_duration)
combined_segment = original_segment + silenced_segment
combined_segment.export(output_wav_file, format="wav")
This is my first post. Is it possible to change the speed of a playback during playback? I want to simulate a car engine sound and for this the first step is to change the speed of a looped sample according to the RPM of the engine. I know how to increase the speed of a complete sample using pyaudio by changing the rate of the wave file, but I want to have a contineous change of the rate. Is this possible without using the scikits.samplerate package, which allows resampling (and is quite old) or pysonic, which is superold?
This is what I have at the moment:
import pygame, sys
import numpy as np
import pyaudio
import wave
from pygame.locals import *
import random as rd
import os
import time
pygame.init()
class AudioFile:
chunk = 1024
def __init__(self, file, speed):
""" Init audio stream """
self.wf = wave.open(file, 'rb')
self.speed = speed
self.p = pyaudio.PyAudio()
self.stream = self.p.open(
format = self.p.get_format_from_width(self.wf.getsampwidth()),
channels = 1,
rate = speed,
output = True)
def play(self):
""" Play entire file """
data = self.wf.readframes(self.chunk)
while data != '':
self.stream.write(data)
def close(self):
""" Graceful shutdown """
self.stream.close()
self.p.terminate()
a = AudioFile("wave.wav")
a.play()
You should be able to do something with numpy. I'm not really familiar with wave etc. and I would expect your play() method to include a readframes() inside the loop in some way (as I attemp to do here) but you can probably get the idea from this
def play(self):
""" Play entire file """
x0 = np.linspace(0.0, self.chunk - 1.0, self.chunk)
x1 = np.linspace(0.0, self.chunk - 1.0, self.chunk * self.factor) # i.e. 0.5 will play twice as fast
data = ''
while data != '':
f_data = np.fromstring(self.wf.readframes(self.chunk),
dtype=np.int).astype(np.float) # need to use floats for interpolation
if len(f_data) < self.chunk:
x1 = x1[:int(len(f_data) * self.factor)]
data = np.interp(x1, x0, f_data).astype(np.int)
self.stream.write(data)
Obviously this uses the same speed up or slow down factor for the whole play. If you wanted to change it mid play you would have to modify x1 inside the while loop.