Detect a beep sound in an audio file using Python
I found this code somewhere, but it is not detecting beeps correctly: it reports a beep even in audio that contains no beep at all.
from moviepy.editor import *
import matplotlib.pyplot as plt
import cv2
#from time import sleep
import sounddevice as sd
from scipy.io import wavfile
import numpy as np

filename = 'C:/Users/YahyaSirguroh/Downloads/output.mp4'
video = VideoFileClip(filename)
audio = video.audio
duration = video.duration
audio.write_audiofile("audio.wav")
#sleep(0.3)
samplerate, data = wavfile.read('audio.wav')
step = 30
audio_signal = []
cnt = 0
flag = 0
text = ''

for t in range(int(duration * step)):
    t = t / step
    if cnt:
        flag += 1
    if t > audio.duration or t > video.duration:
        break
    audio_frame = audio.get_frame(t)  # numpy array representing mono/stereo values
    audio_signal.extend(list(audio_frame))
    if (audio_frame > 0.6).sum() == 2:
        cnt += 1
        if cnt >= 2:
            print('beep detected at %5.2f' % t)
            text = 'beep detected at %d' % np.round(t)
    if flag >= 4:
        cnt = 0
        flag = 0
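The false positives are unsurprising: an absolute amplitude threshold (audio_frame > 0.6) fires on any loud sound, not just a beep. A more selective approach is to look for a dominant tone in the spectrum of each short window. Below is a minimal sketch of that idea, not the original author's method; it assumes the beep is a roughly steady tone near a known frequency (the 1000 Hz default is a placeholder to adjust for your recording), and it reads the audio.wav written above. detect_beeps is a hypothetical helper name.

import numpy as np
from scipy.io import wavfile

def detect_beeps(wav_path, beep_freq=1000.0, win_s=0.05, ratio_thresh=0.4):
    samplerate, data = wavfile.read(wav_path)
    if data.ndim > 1:                       # mix stereo down to mono
        data = data.mean(axis=1)
    data = data.astype(np.float32)
    win = int(samplerate * win_s)           # samples per analysis window
    times = []
    for start in range(0, len(data) - win, win):
        chunk = data[start:start + win]
        spectrum = np.abs(np.fft.rfft(chunk))
        freqs = np.fft.rfftfreq(win, d=1.0 / samplerate)
        total = spectrum.sum() + 1e-9
        # energy within +/-50 Hz of the expected beep frequency
        band = spectrum[(freqs > beep_freq - 50) & (freqs < beep_freq + 50)].sum()
        if band / total > ratio_thresh:     # dominant tone => likely a beep
            times.append(start / samplerate)
    return times

print(detect_beeps("audio.wav"))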
I am attempting to use the speech recognition toolkit VOSK and the speaker diarization package Resemblyzer to transcribe audio and then identify the speakers in the audio.
Tools:
https://github.com/alphacep/vosk-api
https://github.com/resemble-ai/Resemblyzer
I can do both individually, but I run into issues when trying to do both in a single Python script.
I used the following guide when setting up the diarization system:
https://medium.com/saarthi-ai/who-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279
Computer specs are as follows:
Intel(R) Core(TM) i3-7100 CPU @ 3.90GHz, 3912 MHz, 2 Core(s), 4 Logical Processor(s)
32GB RAM
The following is my code. I am not too sure whether using threading is appropriate here, or whether I even implemented it correctly. How can I best optimize this code to achieve the results I am looking for without crashing?
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import json
import sys
import os
import subprocess
import datetime
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
from resemblyzer.hparams import sampling_rate
from spectralcluster import SpectralClusterer
import threading
import queue
import gc

def recognition(queue, audio, FRAME_RATE):
    model = Model("Vosk_Models/vosk-model-small-en-us-0.15")
    rec = KaldiRecognizer(model, FRAME_RATE)
    rec.SetWords(True)
    rec.AcceptWaveform(audio.raw_data)
    result = rec.Result()
    transcript = json.loads(result)  # ["text"]
    #return transcript
    queue.put(transcript)

def diarization(queue, audio):
    wav = preprocess_wav(audio)
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
    print(cont_embeds.shape)
    clusterer = SpectralClusterer(
        min_clusters=2,
        max_clusters=100,
        p_percentile=0.90,
        gaussian_blur_sigma=1)
    labels = clusterer.predict(cont_embeds)

    def create_labelling(labels, wav_splits):
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        labelling = []
        start_time = 0
        for i, time in enumerate(times):
            if i > 0 and labels[i] != labels[i - 1]:
                temp = [str(labels[i - 1]), start_time, time]
                labelling.append(tuple(temp))
                start_time = time
            if i == len(times) - 1:
                temp = [str(labels[i]), start_time, time]
                labelling.append(tuple(temp))
        return labelling
    #return

    labelling = create_labelling(labels, wav_splits)
    queue.put(labelling)

def identify_speaker(queue1, queue2):
    transcript = queue1.get()
    labelling = queue2.get()
    for speaker in labelling:
        speakerID = speaker[0]
        speakerStart = speaker[1]
        speakerEnd = speaker[2]
        result = transcript['result']
        words = [r['word'] for r in result if speakerStart < r['start'] < speakerEnd]
        #return
        print("Speaker", speakerID, ":", ' '.join(words), "\n")

def main():
    queue1 = queue.Queue()
    queue2 = queue.Queue()
    FRAME_RATE = 16000
    CHANNELS = 1

    podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
    podcast = podcast.set_channels(CHANNELS)
    podcast = podcast.set_frame_rate(FRAME_RATE)

    first_thread = threading.Thread(target=recognition, args=(queue1, podcast, FRAME_RATE))
    second_thread = threading.Thread(target=diarization, args=(queue2, podcast))
    third_thread = threading.Thread(target=identify_speaker, args=(queue1, queue2))

    first_thread.start()
    first_thread.join()
    gc.collect()
    second_thread.start()
    second_thread.join()
    gc.collect()
    third_thread.start()
    third_thread.join()
    gc.collect()

    # transcript = recognition(podcast, FRAME_RATE)
    #
    # labelling = diarization(podcast)
    #
    # print(identify_speaker(transcript, labelling))

if __name__ == '__main__':
    main()
When I say crash, I mean everything freezes: I have to hold down the power button on the desktop and turn it back on again. No blue/blank screen, just frozen in my IDE looking at my code. Any help in resolving this issue would be greatly appreciated.
Pydub's AudioSegment was not returning a suitable type for the Resemblyzer function preprocess_wav.
podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
preprocess_wav instead requires a NumPy array or a Path:
audio_file_path = 'Podcast_Audio/WAV-Film-Release-Clip.wav'
wav_fpath = Path(audio_file_path)
wav = preprocess_wav(wav_fpath)
Additionally, the preprocess_wav functionality can be reproduced with librosa if desired:
import librosa
import numpy as np

def preprocess_wav(waveform, sr):
    waveform = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
    waveform = waveform.astype(np.float32) / np.max(np.abs(waveform))
    return waveform

waveform, sr = librosa.load('Podcast_Audio/WAV-Film-Release-Clip.wav')
wav = preprocess_wav(waveform, sr)
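If you would rather keep everything in memory, the AudioSegment itself can be converted to the NumPy array preprocess_wav accepts, skipping the intermediate WAV file. A rough sketch, assuming 16-bit samples (pydub's default for MP3 decode; other sample widths would need a different scale factor):

import numpy as np
from pydub import AudioSegment
from resemblyzer import preprocess_wav

podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
podcast = podcast.set_channels(1).set_frame_rate(16000)

# pydub stores raw integer samples; normalize them to float32 in [-1, 1]
samples = np.array(podcast.get_array_of_samples()).astype(np.float32)
samples /= np.iinfo(np.int16).max  # assumes sample_width == 2 (16-bit)

wav = preprocess_wav(samples, source_sr=16000)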
I am using Streamlit.
I want to prepare an audio dataset. I have a .csv file containing n lines of text that should be spoken by a user, and each recording of the user's voice should be saved in a folder.
Structure of the CSV file:

Sr#   Word
1     Hello
2     World
This file should be loaded and the words shown one by one; below each word the user should record their voice, then press a save button to store that audio, after which the next word appears. I have written the code below, but it is not working properly.
import re
from turtle import onclick
from zlib import DEFLATED
from requests import session
from streamlit_option_menu import option_menu
import streamlit as st
import pandas as pd
import os
import sounddevice as sd
import wavio

def record(duration=5, fs=48000):
    sd.default.samplerate = fs
    sd.default.channels = 1
    myrecording = sd.rec(int(duration * fs))
    sd.wait(duration)
    return myrecording

def save_record(path_myrecording, myrecording, fs):
    wavio.write(path_myrecording, myrecording, fs, sampwidth=2)
    return None

def read_audio(file):
    with open(file, "rb") as audio_file:
        audio_bytes = audio_file.read()
    return audio_bytes

st.header("Data Pipeline")
with st.sidebar:
    choose = option_menu("Modules",
                         ["1. Recording", "2. Preprocess Data", "3. Create Dataset"],
                         icons=['headset', 'mic', 'clipboard'],
                         default_index=0)

my_list = ['a', 'b']
if choose == "1. Recording":
    st.header("Voice Recording")
    st.text("Follow the following steps")
    s_name = ''
    s_name = st.text_input("1. Enter Your Name")
    if os.path.isdir("./Recordings/" + s_name):
        pass
    else:
        os.mkdir("./Recordings/" + s_name)

    file = st.file_uploader("2. Upload a csv file", type=['csv'],
                            accept_multiple_files=False, key='file')
    # if file:
    #     df = pd.read_csv(file)
    #     for i in df['Urdu']:
    #         my_list.append(i)

    show_next = st.button("next", disabled=False)
    if "current_index" not in st.session_state:
        st.session_state.current_index = 0
        st.session_state.load_state = True

    # Whenever someone clicks on the button
    if show_next and st.session_state.load_state:
        # Show next element in list
        st.write(my_list[st.session_state.current_index])
        Record = st.button('Record', key='rec')
        if st.session_state.get('rec') != True:
            st.session_state['rec'] = True
        if st.session_state['rec'] == True:
            print('i am here')
            record_state = st.text("Recording...")
            duration = 1  # seconds
            fs = 48000
            myrecording = record(duration, fs)
            path_myrecording = f"./temp/AMAD.wav"
            save_record(path_myrecording, myrecording, fs)
            st.audio(read_audio(path_myrecording))
            path = "./Recordings/" + s_name + "/1.wav"
            os.remove(path_myrecording)
            save_record(path, myrecording, fs)
            st.success("Recording Saved")

        # Update and store the index
        st.session_state.current_index += 1
        if len(my_list) == st.session_state.current_index:
            st.session_state.load_state = False

st.session_state
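A likely cause of the odd behavior: Streamlit reruns the whole script on every widget interaction, so a button created inside another button's if branch vanishes on the rerun its own click triggers. Below is a minimal sketch of the word-by-word flow that keeps only the index in st.session_state; it reuses the record and save_record helpers above, the word list is a placeholder for the CSV contents, and st.experimental_rerun is the rerun call in older Streamlit releases (newer versions use st.rerun).

import streamlit as st

words = ['Hello', 'World']  # placeholder for the words loaded from the CSV

if "idx" not in st.session_state:
    st.session_state.idx = 0

if st.session_state.idx < len(words):
    st.subheader(words[st.session_state.idx])
    if st.button("Record and save"):
        rec = record(duration=3, fs=48000)
        save_record(f"./Recordings/{st.session_state.idx + 1}.wav", rec, 48000)
        st.session_state.idx += 1   # advance to the next word
        st.experimental_rerun()     # redraw the page with the new word
else:
    st.success("All words recorded")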
I'm trying to make a video with the same duration as an audio clip.
This kind of works, but after 2 seconds (the subclip duration) the image just freezes while the audio continues.
I was trying to achieve the same behavior as in this tutorial, where the video appears to repeat itself. My original video is only 2 seconds long.
import moviepy.editor as mp
raw_video = mp.VideoFileClip("videotest.mp4", audio=False)
raw_audio = mp.AudioFileClip("frei.mp3")
raw_video = raw_video.subclip(0, 2)
my_video = raw_video.set_duration(raw_audio.duration)
my_video.audio = raw_audio
my_video.write_videofile('result.mp4')
This is the solution I've found, but I don't really know if there is a better way; it takes too long to write the video.
import moviepy.editor as mp
import math
raw_video = mp.VideoFileClip("videotest.mp4", audio=False)
raw_audio = mp.AudioFileClip("frei.mp3")
# build a list of copies of the video until the audio duration is covered
amount = math.ceil(raw_audio.duration / raw_video.duration)
clips = [raw_video for i in range(amount)]
final_video = mp.concatenate_videoclips(clips, method='compose')
final_video.audio = raw_audio
final_video.write_videofile('result42.mp4')
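For what it's worth, moviepy also ships a loop effect that repeats a clip up to a target duration, which is tidier than concatenating copies by hand (whether it writes any faster is not guaranteed). A sketch, assuming moviepy 1.x where the fx modules live under moviepy.video.fx.all:

import moviepy.editor as mp
import moviepy.video.fx.all as vfx

raw_video = mp.VideoFileClip("videotest.mp4", audio=False).subclip(0, 2)
raw_audio = mp.AudioFileClip("frei.mp3")

# repeat the 2-second clip until it spans the full audio duration
looped = raw_video.fx(vfx.loop, duration=raw_audio.duration)
looped.audio = raw_audio
looped.write_videofile('result.mp4')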
I'm trying to read text from MS Teams and use that text to make inputs on the keyboard.
Right now I work with the threading module to have one thread for the input and one thread for image_to_string. The following is the function for image_to_string.
def imToString():
    global message
    print("Image getting read")
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Users\\gornicec\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'
    while True:
        print("preIMGgrab")
        cap = ImageGrab.grab(bbox=(177, 850, 283, 881))
        grayCap = cv2.cvtColor(np.array(cap), cv2.COLOR_BGR2GRAY)
        print("postIMGgrab")
        t = time.perf_counter()
        print("preMSG" + str(t))
        message = pytesseract.image_to_string(
            grayCap,
            lang='deu', config='--psm 6')
        print(str(message) + "was read" + str(time.perf_counter() - t))
I don't know why, but it takes about 8 seconds to read an image that is about 1000 pixels in size, and I need this to take at most 2 seconds. I'll add the whole code at the end. If there is any way to make it faster, or to do it differently, please tell me.
WHOLE CODE:
import numpy as np
import time
import pytesseract
from win32gui import GetWindowText, GetForegroundWindow
import win32api
import cv2
import pyautogui
from PIL import ImageGrab
import threading
from ahk import AHK
import keyboard

message = ""
ahk = AHK(executable_path='C:\\Program Files\\AutoHotkey\\AutoHotkey.exe')

def Controls():
    global message
    while True:
        booleanVal = True
        if booleanVal:
            #imToString()
            print("message")
            #print("rechts" in message.lower())
            #print(f'LÄNGE: {len(message)}')
            if "vorne" in message.lower():
                # Control(message, 'w')
                ahk.key_press('w')
                #message = ""
            if "hinten" in message.lower():
                # Control(message, 's')
                ahk.key_press('s')
                #message = ""
            if "links" in message.lower():
                # Control(message, 'a')
                ahk.key_press('a')
                #message = ""
            if "rechts" in message.lower():
                # Control(message, 'd')
                #print("HAHAHA")
                ahk.key_press('d')
                #message = ""
            if "greif" in message.lower():
                ahk.key_press('space')
                #message = ""
        time.sleep(0.5)

#IMGTOSTRING---
controls = threading.Thread(target=Controls)
controls.start()
grab = threading.Thread(target=imToString)
grab.start()
pytesseract is not suited for large numbers of images, or for images that are already in memory: it writes them to a file and then passes the file path to the tesseract CLI. If you want to improve the performance of your script, try a library that works directly with the Tesseract API, like this: https://pypi.org/project/tess-py-api/
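tesserocr is another binding that talks to the Tesseract C++ API directly and lets you initialize the engine once, then reuse it for every frame, instead of paying the startup and file round-trip cost on every call. A sketch of that idea (not the tess-py-api package linked above; the bbox and language are taken from the question, and on Windows you may need to pass path= pointing at your tessdata folder):

import numpy as np
import cv2
import tesserocr
from PIL import Image, ImageGrab

# Initialize the engine once; reusing the API object across frames avoids
# the per-image process spawn that pytesseract performs.
with tesserocr.PyTessBaseAPI(lang='deu', psm=tesserocr.PSM.SINGLE_BLOCK) as api:
    while True:
        cap = ImageGrab.grab(bbox=(177, 850, 283, 881))
        gray = cv2.cvtColor(np.array(cap), cv2.COLOR_BGR2GRAY)
        api.SetImage(Image.fromarray(gray))
        message = api.GetUTF8Text()
        print(message)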
The overall problem I'm trying to tackle: I want to find out whether videoA is a subset of videoB. If videoA is a 2-second clip and videoB is 40 seconds long, do those 2 seconds occur somewhere in those 40 seconds? To do this I'm reading the video files with VideoCapture and saving each frame of each video in a dictionary. But while reading the videos I see this:
Videos/viper_SpineyBitterGoatOhMyDog.mp4
Videos/tsm_theoddone_CoyThirstyAlmondDoggo.mp4
Videos/wingsofdeath_HelplessOilyYogurtGOWSkull.mp4
Videos/rinnieriot_AbstruseEnjoyableOkapiCopyThis.mp4
Videos/imls_UnusualHelplessDogFloof.mp4
Videos/4.mp4
Videos/solorenektononly_LaconicFamousEggplantTwitchRaid.mp4
Videos/gripex90_CheerfulNeighborlyJayTheThing.mp4
Videos/tarzaned_GrossFlirtyMooseFutureMan.mp4
Videos/imaqtpie_LightPleasantPhoneSquadGoals.mp4
Killed: 9
from moviepy.editor import *
import imageio
import json
from moviepy.video.io.VideoFileClip import VideoFileClip
import random
import cv2
import os
import threading
import datetime
from skimage.measure import compare_ssim
import argparse
import imutils

start = datetime.datetime.now()
videoPaths = self.getListOfDownloadedVideos()
videosByFrame = {}
for path in videoPaths:
    print(path)
    images = []
    vidcap = cv2.VideoCapture(path)
    success = True
    while success:
        #print("Before READ")
        success, image = vidcap.read()
        #print("After READ")
        if success:  # read() returns (False, None) at end of stream
            images.append(image)
    videosByFrame[path] = images
    cv2.destroyAllWindows()
    vidcap.release()
Is there something in vidcap.read() that would cause it to kill my Python script?
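"Killed: 9" usually means the operating system killed the process for exhausting memory, which fits here: every decoded frame of every video (width x height x 3 bytes each) is kept in RAM. One way to keep the subset check feasible is to store a small per-frame signature instead of the frame itself. A sketch of that idea: frame_signatures and the file names are hypothetical, and exact hashes only match when both videos decode to byte-identical frames, so re-encoded clips would need a perceptual hash instead of MD5.

import cv2
import hashlib

def frame_signatures(path):
    sigs = []
    vidcap = cv2.VideoCapture(path)
    success, image = vidcap.read()
    while success:
        small = cv2.resize(image, (16, 16))  # shrink before hashing
        sigs.append(hashlib.md5(small.tobytes()).hexdigest())
        success, image = vidcap.read()
    vidcap.release()
    return sigs

sigs_a = frame_signatures("Videos/videoA.mp4")  # hypothetical file names
sigs_b = frame_signatures("Videos/videoB.mp4")

# videoA is a candidate subset of videoB if its signature sequence appears in B's
print(",".join(sigs_a) in ",".join(sigs_b))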