I am very green at programming but wish to learn and develop.
I want to write a simple application that will be useful in linguistic treatments - but at first it is a simple demo.
The application should display an image and record sound while the image is shown.
There are a few variables - the interval and the image/sound/movie clip paths - taken from an external txt file (for the beginning - later I would like to build a creator with presaved configurations).
The config file now looks like:
10
path1
path2
...
The first line holds the interval in seconds; the following lines are paths to images, sounds or movie clips (I have tried images for now).
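Reading that format takes only a few lines; here is a minimal sketch (assuming exactly the layout above - an integer on the first line, one path per line after it):
with open("conf.txt") as f:
    lines = [line.strip() for line in f]

interval = int(lines[0])  # first line: interval in seconds
paths = lines[1:]         # remaining lines: stimulus file paths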
#!/usr/bin/python
# main.py
import time
import wave
from threading import Thread
from Tkinter import Tk, Button, Label

import pyaudio
from PIL import Image, ImageTk

master = Tk()

with open("conf.txt", "r") as conf_file:  # open conf file read-only
    conf_lines = [line.strip() for line in conf_file]

interwal = int(conf_lines[0])  # interval value (in seconds) from the conf.txt file
bodziec1 = conf_lines[1]  # paths to stimulus files (img / audio / video)
bodziec2 = conf_lines[2]
bodziec3 = conf_lines[3]

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = interwal  # every stimulus gets its own audio record file for further work

def nagrywanie():  # recording action - adapted from the all-knowing web
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    print("* nagrywanie")  # info that recording has started
    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("* koniec nagrywania")  # info that recording has ended
    stream.stop_stream()
    stream.close()
    # filename is year/month/day-hour/minute/second for easier systematization
    timestr = time.strftime("%Y%m%d-%H%M%S")
    wf = wave.open(timestr + ".wav", 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    p.terminate()

label = Label(master)  # widget that shows the current stimulus
label.pack()

def bod1():  # stimulus 1st to display / play
    image = Image.open(bodziec1)
    photo = ImageTk.PhotoImage(image)
    label.config(image=photo)
    label.image = photo  # keep a reference so the image is not garbage-collected

def bod2():  # stimulus 2nd to display / play
    image = Image.open(bodziec2)
    photo = ImageTk.PhotoImage(image)
    label.config(image=photo)
    label.image = photo

def bod3():  # stimulus 3rd to display / play
    image = Image.open(bodziec3)
    photo = ImageTk.PhotoImage(image)
    label.config(image=photo)
    label.image = photo

def odpal():  # run display and recording at the same time
    Thread(target=nagrywanie).start()  # record in a background thread
    bod1()  # display in the GUI thread (Tkinter widgets are not thread-safe)
    # clear the stimulus after interwal seconds; the recording thread
    # stops by itself once RECORD_SECONDS of audio have been captured
    master.after(interwal * 1000, lambda: label.config(image=''))

# a button so the program is easier for non-programmers to operate
b = Button(master, text="OK", command=odpal)
b.pack()
master.mainloop()
When I asked a few programmers about this code, they said it is as simple as riding a bike, so I wanted to learn how to write it by myself.
I guess it is a piece of cake for professionals - a thousand thanks to those who are willing even to read this junk.
It takes me a lot of time to understand and figure out the exact commands, which is why I am politely asking for help - not only for my education but also for better diagnosis.
Excuse my language - English is not my native language.
I am attempting to use the speech recognition toolkit VOSK and the speaker diarization package Resemblyzer to transcribe audio and then identify the speakers in the audio.
Tools:
https://github.com/alphacep/vosk-api
https://github.com/resemble-ai/Resemblyzer
I can do both things individually but run into issues when trying to do them when running the one python script.
I used the following guide when setting up the diarization system:
https://medium.com/saarthi-ai/who-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279
Computer specs are as follows:
Intel(R) Core(TM) i3-7100 CPU @ 3.90GHz, 3912 MHz, 2 Core(s), 4 Logical Processor(s)
32GB RAM
The following is my code. I am not too sure if using threading is appropriate, or if I even implemented it correctly. How can I best optimize this code so that it achieves the results I am looking for and does not crash?
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
from resemblyzer import preprocess_wav, VoiceEncoder
from resemblyzer.hparams import sampling_rate
from spectralcluster import SpectralClusterer
import json
import threading
import queue
import gc


def recognition(queue, audio, FRAME_RATE):
    model = Model("Vosk_Models/vosk-model-small-en-us-0.15")
    rec = KaldiRecognizer(model, FRAME_RATE)
    rec.SetWords(True)
    rec.AcceptWaveform(audio.raw_data)
    result = rec.Result()
    transcript = json.loads(result)
    queue.put(transcript)


def diarization(queue, audio):
    wav = preprocess_wav(audio)
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
    print(cont_embeds.shape)

    clusterer = SpectralClusterer(
        min_clusters=2,
        max_clusters=100,
        p_percentile=0.90,
        gaussian_blur_sigma=1)
    labels = clusterer.predict(cont_embeds)

    def create_labelling(labels, wav_splits):
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        labelling = []
        start_time = 0
        for i, time in enumerate(times):
            if i > 0 and labels[i] != labels[i - 1]:
                temp = [str(labels[i - 1]), start_time, time]
                labelling.append(tuple(temp))
                start_time = time
            if i == len(times) - 1:
                temp = [str(labels[i]), start_time, time]
                labelling.append(tuple(temp))
        return labelling

    labelling = create_labelling(labels, wav_splits)
    queue.put(labelling)


def identify_speaker(queue1, queue2):
    transcript = queue1.get()
    labelling = queue2.get()
    for speaker in labelling:
        speakerID = speaker[0]
        speakerStart = speaker[1]
        speakerEnd = speaker[2]
        result = transcript['result']
        words = [r['word'] for r in result if speakerStart < r['start'] < speakerEnd]
        print("Speaker", speakerID, ":", ' '.join(words), "\n")


def main():
    queue1 = queue.Queue()
    queue2 = queue.Queue()
    FRAME_RATE = 16000
    CHANNELS = 1

    podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
    podcast = podcast.set_channels(CHANNELS)
    podcast = podcast.set_frame_rate(FRAME_RATE)

    first_thread = threading.Thread(target=recognition, args=(queue1, podcast, FRAME_RATE))
    second_thread = threading.Thread(target=diarization, args=(queue2, podcast))
    third_thread = threading.Thread(target=identify_speaker, args=(queue1, queue2))

    first_thread.start()
    first_thread.join()
    gc.collect()
    second_thread.start()
    second_thread.join()
    gc.collect()
    third_thread.start()
    third_thread.join()
    gc.collect()

    # sequential equivalent, since each thread is joined before the next starts:
    # transcript = recognition(podcast, FRAME_RATE)
    # labelling = diarization(podcast)
    # print(identify_speaker(transcript, labelling))


if __name__ == '__main__':
    main()
When I say crash, I mean everything freezes: I have to hold down the power button on the desktop and turn it back on again. No blue/blank screen, just frozen in my IDE looking at my code. Any help in resolving this issue would be greatly appreciated.
Pydub's AudioSegment was not returning a suitable type for the Resemblyzer function preprocess_wav.
podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
preprocess_wav instead requires a NumPy array or a Path.
audio_file_path = 'Podcast_Audio/WAV-Film-Release-Clip.wav'
wav_fpath = Path(audio_file_path)
wav = preprocess_wav(wav_fpath)
Additionally, the preprocess_wav functionality can be achieved using librosa if desired:
import librosa
import numpy as np

def preprocess_wav(waveform, sr):
    waveform = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
    waveform = waveform.astype(np.float32) / np.max(np.abs(waveform))
    return waveform
waveform, sr = librosa.load('Podcast_Audio/WAV-Film-Release-Clip.wav')
wav = preprocess_wav(waveform, sr)
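For completeness, a minimal sketch of wiring this fix into the original pipeline (assuming the WAV file is a converted copy of the same clip): VOSK still consumes the pydub AudioSegment, while the diarization side gets a Path (or the librosa-preprocessed array) instead of the AudioSegment.
from pathlib import Path
from pydub import AudioSegment
from resemblyzer import preprocess_wav

# AudioSegment input for recognition(), as before
podcast = AudioSegment.from_mp3("Podcast_Audio/Film-Release-Clip.mp3")
podcast = podcast.set_channels(1).set_frame_rate(16000)

# Path-based input for diarization(); preprocess_wav accepts a Path or a NumPy array
wav = preprocess_wav(Path("Podcast_Audio/WAV-Film-Release-Clip.wav"))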
I'm trying to make a video the same duration as an audio clip.
This kinda works, but after 2 seconds (the subclip duration) the image just freezes while the audio continues.
I was trying to achieve the same behavior as in this tutorial, where the video seems to repeat itself. My original video is only 2 seconds long.
import moviepy.editor as mp
raw_video = mp.VideoFileClip("videotest.mp4", audio=False)
raw_audio = mp.AudioFileClip("frei.mp3")
raw_video = raw_video.subclip(0, 2)
my_video = raw_video.set_duration(raw_audio.duration)
my_video.audio = raw_audio
my_video.write_videofile('result.mp4')
This is the solution I've found, but I don't really know if there is a better way. It takes too long to write the video:
import moviepy.editor as mp
import math

raw_video = mp.VideoFileClip("videotest.mp4", audio=False)
raw_audio = mp.AudioFileClip("frei.mp3")

# list of copies of the video until the audio duration is covered
amount = math.ceil(raw_audio.duration / raw_video.duration)
clips = [raw_video for i in range(amount)]

final_video = mp.concatenate_videoclips(clips, method='compose')
final_video.audio = raw_audio
final_video.write_videofile('result42.mp4')
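A possibly simpler alternative (a sketch, assuming moviepy's built-in vfx.loop effect does what the manual concatenation does; whether it writes any faster would need testing):
import moviepy.editor as mp

raw_video = mp.VideoFileClip("videotest.mp4", audio=False).subclip(0, 2)
raw_audio = mp.AudioFileClip("frei.mp3")

# repeat the 2-second clip until it covers the full audio duration
looped = raw_video.fx(mp.vfx.loop, duration=raw_audio.duration)
looped.audio = raw_audio
looped.write_videofile('result.mp4')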
I am creating a Raspberry Pi timelapse camera, encoding video with the CV2 VideoWriter.
Each image captured with picamera is added to the VideoWriter, and once the intended number of images has been taken the VideoWriter closes.
However - while this works for a few thousand images - it stops at some limit with a file size of 366MB, which is now frustrating me, and I ask you - the internet and its hoard of coders - to tell me why I am bad at coding and how to fix this - you must be tempted by this..
Here is my offering of garbage for you to laugh pitifully at:
import os, cv2
from picamera import PiCamera
from picamera.array import PiRGBArray
from datetime import datetime
from time import sleep

now = datetime.now()
x = now.strftime("%Y-%m-%d-%H-%M")  # string of date and time at start
print(x)


def main():
    imagenum = 10000  # how many images
    period = 1  # seconds between images
    os.chdir("/home/pi/t_lapse")
    os.mkdir(x)
    os.chdir(x)
    filename = x + ".avi"

    camera = PiCamera()
    camera.resolution = (1920, 1088)
    camera.vflip = True
    camera.hflip = True
    camera.color_effects = (128, 128)  # makes a black and white image for the IR camera
    sleep(0.1)

    out = cv2.VideoWriter(filename, cv2.cv.CV_FOURCC(*'XVID'), 30, (1920, 1088))
    for c in range(imagenum):
        with PiRGBArray(camera, size=(1920, 1088)) as output:
            camera.capture(output, 'bgr')
            imagec = output.array
            out.write(imagec)
            output.truncate(0)  # trying to get more than 300MB files..
        sleep(period - 0.5)

    camera.close()
    out.release()


if __name__ == '__main__':
    main()
This example is part of the whole code I've written (https://github.com/gchennell/RPi-PiLapse), which has an OLED display, buttons, and a selection of how many images to take, as I have this all in an enclosure. The number of images seems to be limited to about 3000-4000, and then it just gives up and goes home... I tried adding the output.truncate(0).
I have also recreated this in python3 before you cry "BUT CV2.CV2.VIDEOWRITER!!!!", and that hasn't changed a thing - I'm missing something here...
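One workaround to try (a sketch, not a diagnosis of the underlying limit): roll over to a new output file every few thousand frames so no single container grows past whatever the writer can handle, then join the segments afterwards (e.g. with ffmpeg). The SEGMENT value here is an assumption to tune.
import cv2
from picamera import PiCamera
from picamera.array import PiRGBArray
from time import sleep

SEGMENT = 2000  # frames per output file (assumed safe; tune as needed)

def timelapse(imagenum=10000, period=1):
    camera = PiCamera()
    camera.resolution = (1920, 1088)
    sleep(0.1)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # python3 / OpenCV 3+ spelling
    out = None
    for c in range(imagenum):
        if c % SEGMENT == 0:  # start a new file every SEGMENT frames
            if out is not None:
                out.release()
            out = cv2.VideoWriter("part%03d.avi" % (c // SEGMENT), fourcc, 30, (1920, 1088))
        with PiRGBArray(camera, size=(1920, 1088)) as output:
            camera.capture(output, 'bgr')
            out.write(output.array)
        sleep(period)
    if out is not None:
        out.release()
    camera.close()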
I want to merge both audio files, but there seems to be a pause of about 2 seconds between them. Can anyone look into it further? It would be a great help.
import simpleaudio as sa
filename = '3.wav'
wave_obj = sa.WaveObject.from_wave_file(filename)
play_obj = wave_obj.play()
play_obj.wait_done()
filename = '4.wav'
wave_obj = sa.WaveObject.from_wave_file(filename)
play_obj = wave_obj.play()
play_obj.wait_done()
I believe the problem is that after 3.wav ends, it takes a little time for the program to process the rest of the code. Let the program load both files before starting playback:
import simpleaudio as sa
filename1 = '3.wav'
filename2 = '4.wav'
wave_obj1 = sa.WaveObject.from_wave_file(filename1)
wave_obj2 = sa.WaveObject.from_wave_file(filename2)
play_obj1 = wave_obj1.play()
play_obj1.wait_done()
play_obj2 = wave_obj2.play()
play_obj2.wait_done()
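Even with both files preloaded, there can still be a tiny gap between wait_done() and the next play(). For truly gapless output, a sketch (assuming both files share the same sample rate, sample width, and channel count) is to concatenate the raw frames and play them as one buffer:
import wave
import simpleaudio as sa

frames = b''
for name in ('3.wav', '4.wav'):
    with wave.open(name, 'rb') as w:
        params = w.getparams()  # assumed identical for both files
        frames += w.readframes(w.getnframes())

# play the combined frames as a single WaveObject-style buffer
play_obj = sa.play_buffer(frames, params.nchannels, params.sampwidth, params.framerate)
play_obj.wait_done()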
This is my first post. Is it possible to change the speed of playback during playback? I want to simulate a car engine sound, and for this the first step is to change the speed of a looped sample according to the RPM of the engine. I know how to change the speed of a complete sample with pyaudio by changing the rate of the wave file, but I want a continuous change of the rate. Is this possible without using the scikits.samplerate package, which allows resampling (and is quite old), or pysonic, which is super old?
This is what I have at the moment:
import pygame
import pyaudio
import wave

pygame.init()


class AudioFile:
    chunk = 1024

    def __init__(self, file, speed):
        """ Init audio stream """
        self.wf = wave.open(file, 'rb')
        self.speed = speed
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=self.p.get_format_from_width(self.wf.getsampwidth()),
            channels=1,
            rate=speed,
            output=True)

    def play(self):
        """ Play entire file """
        data = self.wf.readframes(self.chunk)
        while data != '':
            self.stream.write(data)

    def close(self):
        """ Graceful shutdown """
        self.stream.close()
        self.p.terminate()


a = AudioFile("wave.wav", 44100)  # speed is the output sample rate
a.play()
You should be able to do something with numpy. I'm not really familiar with wave etc., and I would expect your play() method to include a readframes() inside the loop in some way (as I attempt to do here), but you can probably get the idea from this:
def play(self):
    """ Play entire file, resampled by self.factor via linear interpolation """
    x0 = np.arange(self.chunk, dtype=np.float64)
    # i.e. a factor of 0.5 will play twice as fast
    x1 = np.linspace(0.0, self.chunk - 1.0, int(self.chunk * self.factor))
    while True:
        raw = self.wf.readframes(self.chunk)
        if not raw:
            break
        # need floats for interpolation; 16-bit samples assumed
        f_data = np.frombuffer(raw, dtype=np.int16).astype(np.float64)
        if len(f_data) < self.chunk:  # last, partial chunk
            x0 = x0[:len(f_data)]
            x1 = x1[:int(len(f_data) * self.factor)]
        data = np.interp(x1, x0, f_data).astype(np.int16)
        self.stream.write(data.tobytes())
Obviously this uses the same speed-up or slow-down factor for the whole playback. If you wanted to change it mid-play, you would have to modify x1 inside the while loop.
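For example, a sketch of that idea (assuming something else, e.g. the engine RPM logic, updates self.factor while playback runs) would recompute x1 on every chunk:
def play(self):
    """ Play the file, re-reading self.factor each chunk so speed can change mid-play """
    while True:
        raw = self.wf.readframes(self.chunk)
        if not raw:
            break
        f_data = np.frombuffer(raw, dtype=np.int16).astype(np.float64)
        x0 = np.arange(len(f_data), dtype=np.float64)
        # recompute the resampling grid from the current factor
        x1 = np.linspace(0.0, len(f_data) - 1.0, int(len(f_data) * self.factor))
        data = np.interp(x1, x0, f_data).astype(np.int16)
        self.stream.write(data.tobytes())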