Purge script with multiple conditions - Python

I'm writing yet another Python purge script. It replaces a very old bash script full of find -delete calls that takes up to 9 hours to purge our video backend.
I know there are tons of these scripts on Stack Overflow and around Google, but I have a few extra constraints, which led me to write what I consider poor/inefficient code.
Consider the following directory structure:
/data/channel1/video_800/0001/somefile_800_001.ts
/data/channel1/video_800/0001/somefile_800_002.ts
/data/channel1/video_800/0002/somediffile_800_001.ts
/data/channel1/video_800/0002/somediffile_800_002.ts
/data/channel1/video_800.m3u8
/data/channel1/video_900/0001/someotherfile_900_001.ts
/data/channel1/video_900/0002/afile_900_001.ts
/data/channel1/video_900/0003/bfile_900_001.ts
/data/channel1/video_900/0003/cfile_900_001.ts
/data/channel1/video_900.m3u8
/data/channel2/video_800/0001/againsomefile_800_001.ts
/data/channel2/video_800/0001/againsomefile_800_002.ts
/data/channel2/video_800.m3u8
/data/sport_channel/video_1000/0001/somefile.ts
/data/sport_channel/video_1000/0001/somefile2.ts
The first thing that interests me is the channel name, since there is one rule for channel* and another for sport*.
The second is the end of the video dir names, which equals the bitrate: 800, 900, 1000. These can have different retention days.
Finally, I go through everything and remove files based on bitrate and extension.
The code below works but is overly complicated and, I'm sure, not very Pythonic. Since what I care about most here is performance, I'm sure there is a more efficient way to do this. Stacking for loops inside for loops is not only poor design, it also gets me a 'find_files' is too complex [mccabe] warning in my pymode.
** I left the remove function out of the code example, but it's just a plain try/except using os.rmdir and os.remove.
I'm open to all suggestions for improving my code.
Thanks!
#!/usr/bin/python
import os
import time
import fnmatch

path = '/data'
debits_short = ['200', '700', '1000', '1300', '2500']
debits_long = ['400', '1800']

def find_files(chan_name, debits, duration):
    time_in_secs = time.time() - (duration * 24 * 60 * 60)
    # List channels
    for channel in os.listdir(path):
        # Match category channels
        if fnmatch.fnmatch(channel, chan_name):
            # Go through bitrates
            for debit in debits:
                # Channel path is now the default search path
                channel_path = os.path.join(path, channel)
                # Walk through the channel path to match bitrate files
                for root, dirs, files in os.walk(channel_path, topdown=False):
                    for filename in files:
                        # Remove files that contain _bitrate_ and end with .ts
                        if '_' + debit + '_' in filename:
                            if filename.endswith('.ts'):
                                if os.path.isfile(os.path.join(root, filename)):
                                    if os.stat(os.path.join(root, filename)).st_mtime <= time_in_secs:
                                        remove(os.path.join(root, filename))
                        # Remove playlist files that end with bitrate.m3u8
                        if filename.endswith(debit + '.m3u8'):
                            if os.path.isfile(os.path.join(root, filename)):
                                if os.stat(os.path.join(root, filename)).st_mtime <= time_in_secs:
                                    remove(os.path.join(root, filename))
                    # Remove empty dirs
                    for dir in dirs:
                        if not os.listdir(os.path.join(root, dir)):
                            remove(os.path.join(root, dir))

find_files('channel*', debits_long, 3)
find_files('sport*', debits_short, 7)

Here's a possible approach:
import os
import glob
import time

class Purge(object):
    removable_extensions = ['ts', 'm3u8']

    def __init__(self, basedir, channel_pattern, debits,
                 older_than_days, test_mode=False):
        self.basedir = basedir
        self.channel_pattern = channel_pattern
        self.debits = debits
        self.older_than_secs = time.time() - 24*60*60*older_than_days
        self.test_mode = test_mode  # If `True`, do not delete files.

    def delete_file(self, filepath):
        try:
            os.remove(filepath)
        except OSError:
            pass

    def file_for_deletion(self, filepath):
        # Return `True` if a file meets all conditions for deletion.
        filename, ext = os.path.splitext(os.path.basename(filepath))
        condition_ext = ext[1:] in self.removable_extensions
        condition_old = os.stat(filepath).st_mtime <= self.older_than_secs
        condition_deb = any(
            '_{}_'.format(d) in filename or filename.endswith(d)
            for d in self.debits
        )
        return all((condition_ext, condition_old, condition_deb))

    def purge_channel(self, channel_dir):
        for root, dirs, files in os.walk(channel_dir):
            for name in files:
                filepath = os.path.join(root, name)
                if self.file_for_deletion(filepath):
                    print(filepath)
                    if not self.test_mode:
                        self.delete_file(filepath)
        # TODO: delete empty directories here.

    def purge(self):
        channels = glob.glob(os.path.join(self.basedir, self.channel_pattern))
        for channel_dir in channels:
            self.purge_channel(channel_dir)

if __name__ == '__main__':
    purge_job_info = dict(
        basedir=r'path/to/data',     # All channel folders live here.
        channel_pattern='channel*',  # `glob` pattern.
        debits=['400', '1800'],
        older_than_days=7,
    )
    p = Purge(**purge_job_info)
    p.test_mode = True
    p.purge()
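For the TODO in purge_channel, here is a possible sketch of an extra method for the class, removing empty directories bottom-up (assuming deletion failures should be ignored silently, as in delete_file):

def delete_empty_dirs(self, channel_dir):
    # Walk bottom-up so empty children are removed before their
    # parents are inspected.
    for root, dirs, _ in os.walk(channel_dir, topdown=False):
        for d in dirs:
            dirpath = os.path.join(root, d)
            try:
                os.rmdir(dirpath)  # os.rmdir only removes empty directories
            except OSError:
                pass  # not empty, or already gone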

Related

Find files within a changed subdirectory in Python

I have a text file full of filenames, like:
C:\Folder\Subfolder_01\file_1001.csv
C:\Folder\Subfolder_02\file_3030.xls
...
I want to check whether the files still exist (which is easy) or whether the name of the subfolder has changed. The names of some subfolders changed by having a string prepended, starting with a 4-digit number (e.g. C:\Folder\Subfolder_02\file_3030.xls changed to C:\Folder\2019 - Subfolder_02\file_3030.xls).
I tried to solve this with pathlib.glob(). It's possible to do this for one specific file 'by hand', like
list(file.parent.parent.glob('* - Subfolder_02\file_3030.xls'))
which returns a list with the new file name. But I failed to do this in a loop, with parameters inside the glob.
This is what I got so far, but my attempt to concatenate the glob with other variables (using +) fails for obvious reasons:
import pathlib

file = pathlib.Path('file_names.txt')
lines = []
with open(file, 'r') as f:
    # reading the txt-file line by line
    for line in f:
        line = line.replace("\r", "").replace("\n", "")
        lines.append(line)

for file in lines:
    file = pathlib.Path(file)
    # check if file exists ...
    if file.exists():
        print('OK - ' + file.name)
    # ... if not, find new location
    else:
        new_files = list(file.parent.parent.glob('* - ') + file.name)  # fails here
        print(new_files)
I would set your top directory as a path and use that to glob for the file under the directory whenever you can't find the file in its original location. Using ** in the glob will search all subfolders.
from pathlib import Path

# Set top level directory as desired.
parent_dir = Path('.')

# you can use splitlines() to parse the file into a list
with Path('file_names.txt').open() as f:
    files = f.read().splitlines()

for f in files:
    orig = Path(f)
    # Still in location, no need to look further
    if orig.exists():
        print(f"{orig.absolute()} is still in place.")
        continue
    # See if we can find it under parent_dir
    matches = [*parent_dir.glob(f"**/{orig.name}")]
    if len(matches) > 1:
        print("Multiple Matches Found")
    for match in matches:
        print(f"{orig.absolute()} might be in {match.absolute()}")
Try watchdog
For example:
import os
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

RESOURCES_PATH = r"C:\Folder"

class dirs_watcher(FileSystemEventHandler):
    def __init__(self):
        self.observe()
        self.cur_dirs = os.listdir(RESOURCES_PATH)

    def observe(self):
        self.observer = Observer()
        self.my_watch = self.observer.schedule(self, path=RESOURCES_PATH, recursive=True)
        self.observer.start()

    def on_modified(self, event=None):
        # A folder was modified:
        self.new_dirs = os.listdir(RESOURCES_PATH)
        old = set(self.cur_dirs) - set(self.new_dirs)
        new = set(self.new_dirs) - set(self.cur_dirs)
        print("{} changed to {}".format(old, new))
        self.cur_dirs = self.new_dirs  # update cur_dirs
on_modified is triggered whenever a subdirectory changes, and you can extract the names of the changed folders by keeping a list of the current subdirectories.
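Note that the Observer runs in a background thread, so the script must keep the main thread alive for on_modified to ever fire. A minimal sketch, assuming the dirs_watcher class above:

import time

if __name__ == "__main__":
    watcher = dirs_watcher()
    try:
        while True:
            time.sleep(1)  # keep the main thread alive
    except KeyboardInterrupt:
        watcher.observer.stop()
    watcher.observer.join()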

Need to upload sub-dirs and their contents, not just files in current dir

A script was supplied to me in order to upload files to a cloud bucket. You input the dir where the files you want to upload are and, bingo bango, done.
What needs to happen is that additional sub dirs, with their own files in them, get transferred as well, based on the input of the root dir. They need to retain their tree structure relative to that root dir.
Using the current code, I get a write error/access denied failure. I know this is because the for loop uses os.listdir, which can't descend into the extra sub dirs and files, but I'm not sure how to modify it.
I attempted to get all the information I needed using os.walk and parsing that out. I verified with some print tests that it was looking in the right place for everything. However, I hit a wall when I got this error while running the script:
folder\folder\lib\ntpath.py", line 76, in join
path = os.fspath(path)
TypeError: expected str, bytes or os.PathLike object, not list
I understand that something is being generated as a list when it shouldn't be, but I'm not sure how to go about fixing this...
This is the original script provided to me, below. I have added the variable at the top just to make it a little less abstract.
local_directory_path = r'C:\folder\folder\sync\FROM_LOCAL_UPLOAD'

def upload_folder_to_cloud(self, mount_id, local_directory_path):
    ''' This method will list every file at the local_directory_path and then, for each,
    it will call the api method athera.sync.upload_file for every file in your local directory
    '''
    _, destination_folder = os.path.split(local_directory_path)
    if not destination_folder:
        self.logger.error("Make sure the provided 'local_directory_path' does not end with a '/' or a '\\'")
        sys.exit(2)
    destination_folder = destination_folder + "/"
    self.logger.info("Folder = {}".format(destination_folder))
    for filename in os.listdir(local_directory_path):
        destination_path = destination_folder + filename
        filepath = os.path.join(local_directory_path, filename)
        with open(filepath, "rb") as f:
            _, err = self.client.upload_file(self.group_id, mount_id, f, destination_path=destination_path,)
            if err != None:
                self.logger.error(err)
                sys.exit(4)
    return destination_folder
This is what I modified it to as a test:
for root, dirs, files in os.walk(local_directory_path):
    srcFile = (os.path.join(files))
    srcRoot = (os.path.join(root))
    rootSplit = os.path.normpath(srcRoot).split(os.path.sep)
    srcDirs = '/'.join(rootSplit[4:])
    src = str('fixLocalFolder') + '/' + str(srcDirs) + '/' + (files)
    dst = str(srcDirs) + '/' + (files)
    destination_folder = str(srcRoot) + "/"
    destination_path = str(destination_folder) + str(srcFile)
    filepath = os.path.join((str(srcDirs), str(srcFile)))
    with open(filepath, "rb") as f:
        _, err = self.client.upload_file(
            self.group_id,
            mount_id,
            f,
            destination_path=destination_path,
        )
        if err != None:
            self.logger.error(err)
            sys.exit(4)
return destination_folder
I do not code for a living so I am sure I am not going about this the right way. I apologize for any code atrocities in advance. Thank you!
I do see some issues in that code, even without testing it. Something like the following might work for that loop. (Note! Untested!).
for root, dirs, files in os.walk(local_directory_path):
    # Iterate through files in the currently processed directory
    for current_file in files:
        # Full path to file
        src_file = os.path.join(root, current_file)
        # Get the sub-path relative to the original root.
        sub_path = os.path.relpath(root, start=local_directory_path)
        # Get the destination path
        destination_path = os.path.join(sub_path, current_file)
        with open(src_file, "rb") as f:
            _, err = self.client.upload_file(
                self.group_id,
                mount_id,
                f,
                destination_path=destination_path,
            )
            if err != None:
                self.logger.error(err)
                sys.exit(4)
I believe your central problem was a misunderstanding of what os.walk gives you. It yields a listing of each directory (and subdirectory), one after another.
Thus the values might look like this across iterations (when listing /mydir):
# First iteration:
root = "/mydir"
dirs = ["subdir", ...]
files = ["something.doc", "something else.txt"]
# Second iteration:
root = "/mydir/subdir"
dirs = ["sub-sub-dir1", ...]
files = ["file1.txt", "file2.txt", ...]

Finding large files, but getting unexpected results

I have the following program to find large files:
import os, time, shelve

start = time.time()
root = '/'
# errors = set()
# dirs = set()
while True:
    try:
        root = os.path.abspath(root)  # ensure it's an abspath
        # set the baseline as 100M
        # consider the shift
        baseline = 100 * 2**20  # 2**20 is 1M
        # set up to collect the large files
        large_files = []
        # root is a better choice as a concept
        for foldername, subfolders, files in os.walk(root):
            for f in files:
                # print(f"{foldername}, {f}")
                abspath = os.path.join(foldername, f)
                size = os.path.getsize(abspath)
                if size >= baseline:
                    large_files.append((os.path.basename(abspath), size))
                    print(abspath, size/(2**20))
        # write the large files to a shelf
        shelf = shelve.open('/root/large_files.db')
        shelf["large_files"] = large_files
        shelf.close()
        if subfolders == []:
            end = time.time()
            break
    except (PermissionError, FileNotFoundError) as e:
        # errors.add(e)
        pass
It consistently outputs identical results:
[root#iz2ze9wve43n2nyuvmsfx5z ~]# python3 search_large_files.py
/dev/core 134217726.0078125
/dev/core 134217726.0078125
/dev/core 134217726.0078125
....
However, I can find no reason why
print(abspath, size/(2**20))
would keep printing this constantly.
What might the problem be in my code?
You have an infinite outer loop with while True:, and apparently /dev/core is the only file in your filesystem that exceeds the file size specified by baseline, so it would keep outputting the same file over and over again.
Remove while True:, un-indent the block inside, and your code will work.
Note that your if subfolders == []: condition is outside your for foldername, subfolders, files in os.walk(root): loop and is therefore not useful. You should record the end time unconditionally anyway, so simply remove the if condition and the break statement as well.
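Putting that together, a corrected sketch (same paths and baseline as the original; the per-file try/except is an addition so a single unreadable file doesn't abort the whole walk):

import os, time, shelve

start = time.time()
root = os.path.abspath('/')
baseline = 100 * 2**20  # 100 MiB

large_files = []
# A single os.walk pass already visits every subdirectory.
for foldername, subfolders, files in os.walk(root):
    for f in files:
        abspath = os.path.join(foldername, f)
        try:
            size = os.path.getsize(abspath)
        except (PermissionError, FileNotFoundError):
            continue  # skip unreadable or vanished files
        if size >= baseline:
            large_files.append((os.path.basename(abspath), size))
            print(abspath, size / 2**20)

shelf = shelve.open('/root/large_files.db')
shelf["large_files"] = large_files
shelf.close()
end = time.time()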

How to delete subdirectories in a directory based on their modification time using Python?

Based on this script:
#!/usr/bin/python
# run by crontab
# removes any files in /tmp/ older than 7 days
import os, sys, time
from subprocess import call

now = time.time()
cutoff = now - (7 * 86400)

files = os.listdir("/tmp")
for xfile in files:
    if os.path.isfile("/tmp/" + xfile):
        t = os.stat("/tmp/" + xfile)
        c = t.st_ctime
        # delete file if older than a week
        if c < cutoff:
            os.remove("/tmp/" + xfile)
we can delete files in a path based on their modification time. But how can we delete folders inside other folders based on their modification time?
That is, there are many folders in the main folder; we need to keep the main folder itself and only delete the subfolders whose modification time is older than a specific cutoff.
You can try something along these lines:
import shutil, os, time

top_dir = '/tmp'
now = time.time()
cutoff = now - (7 * 86400)

def del_old_files_and_dirs(top_dir, cutoff_time):
    for root, dirs, files in os.walk(top_dir, topdown=False):
        for cdir in dirs:
            fdir = os.path.join(root, cdir)
            if os.path.getmtime(fdir) < cutoff_time:
                shutil.rmtree(fdir)
            else:
                # Process this dir again recursively
                del_old_files_and_dirs(fdir, cutoff_time)
        for cfile in files:
            ffile = os.path.join(root, cfile)
            if os.path.getmtime(ffile) < cutoff_time:
                os.remove(ffile)

del_old_files_and_dirs(top_dir, cutoff)
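If you only need to remove the immediate subfolders of the main folder (keeping the main folder itself, as the question describes), a simpler sketch along the same lines:

import os, shutil, time

main_dir = '/tmp'  # the main folder; adjust as needed
cutoff = time.time() - (7 * 86400)

for name in os.listdir(main_dir):
    full = os.path.join(main_dir, name)
    # Only directories, and only those not modified since the cutoff
    if os.path.isdir(full) and os.path.getmtime(full) < cutoff:
        shutil.rmtree(full)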

How to batch process a folder of videos using MoviePy

I wrote a MoviePy script that takes an input video, does some processing, and outputs a video file. I want to run this through an entire folder of videos. Any help or direction is appreciated.
Here's what I tried...
for f in *; do python resize.py $f; done
and resize.py source code here:
from moviepy.editor import *
clip = VideoFileClip(input)
clip1 = clip.rotate(270)
clip2 = clip1.crop(x_center=540,y_center=960,width=1080,height=608)
clip3 = clip2.resize(width=1920)
clip3.write_videofile(output,codec='libx264')
Really wasn't sure what to put for "input" and "output" in my .py file.
Thanks,
Evan
I know you have an answer on Github, but I'll add my own solution.
First, you'll want to put your code inside a function:
def process_video(input, output):
    """Parameter input should be a string with the full path for a video"""
    clip = VideoFileClip(input)
    clip1 = clip.rotate(270)
    clip2 = clip1.crop(x_center=540, y_center=960, width=1080, height=608)
    clip3 = clip2.resize(width=1920)
    clip3.write_videofile(output, codec='libx264')
Then you can have a function that returns a list of file paths and a list of final file names to use with the function above (note that the final file names will be the same as the original file names, but with "output" prepended):
import os

def get_video_paths(folder_path):
    """
    Parameter folder_path should look like "Users/documents/folder1/"
    Returns a list of complete paths
    """
    file_name_list = os.listdir(folder_path)
    path_name_list = []
    final_name_list = []
    for name in file_name_list:
        # Put any sanity checks here, e.g.:
        if name == ".DS_Store":
            pass
        else:
            path_name_list.append(folder_path + name)
            # Change the format of the output file names below
            final_name_list.append(folder_path + "output" + name)
    return path_name_list, final_name_list
Finally, at the bottom, we get the input folder, and utilise the above two functions:
if __name__ == "__main__":
    video_folder = input("What folder would you like to process? ")
    path_list, final_name_list = get_video_paths(video_folder)
    for path, name in zip(path_list, final_name_list):
        process_video(path, name)
    print("Finished")
Just watch out, because this will crash if there are any files in the folder that can't be read as a movie. For instance, on mac, the OS puts a ".DS_Store" file in each folder, which will crash the program. I've put an area for a sanity check to ignore certain filenames.
Complete code:
import os
from moviepy.editor import *

def process_video(input, output):
    """Parameter input should be a string with the full path for a video"""
    clip = VideoFileClip(input)
    clip1 = clip.rotate(270)
    clip2 = clip1.crop(x_center=540, y_center=960, width=1080, height=608)
    clip3 = clip2.resize(width=1920)
    clip3.write_videofile(output, codec='libx264')

def get_video_paths(folder_path):
    """
    Parameter folder_path should look like "Users/documents/folder1/"
    Returns a list of complete paths
    """
    file_name_list = os.listdir(folder_path)
    path_name_list = []
    final_name_list = []
    for name in file_name_list:
        # Put any sanity checks here, e.g.:
        if name == ".DS_Store":
            pass
        else:
            path_name_list.append(folder_path + name)
            final_name_list.append(folder_path + "output" + name)
    return path_name_list, final_name_list

if __name__ == "__main__":
    video_folder = input("What folder would you like to process? ")
    path_list, final_name_list = get_video_paths(video_folder)
    for path, name in zip(path_list, final_name_list):
        process_video(path, name)
    print("Finished")
I responded on your Github issue #542, but I copied it here for future reference!
First off, the below example isn't ironclad, but it should do what you need.
You can achieve this via something like this:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert all media assets located in a specified directory."""
import glob
import os
from optparse import OptionParser

from moviepy.editor import VideoFileClip

def get_dir_files(dir_path, patterns=None):
    """Get all absolute paths for pattern matched files in a directory.

    Args:
        dir_path (str): The path of the directory containing media assets.
        patterns (list of str): The list of patterns/file extensions to match.

    Returns:
        (list of str): A list of all pattern-matched files in a directory.
    """
    if not patterns or type(patterns) != list:
        print('No patterns list passed to get_dir_files, defaulting to patterns.')
        patterns = ['*.mp4', '*.avi', '*.mov', '*.flv']
    files = []
    for pattern in patterns:
        # Use a separate variable so dir_path is not clobbered between patterns.
        pattern_path = os.path.join(os.path.abspath(dir_path), pattern)
        files.extend(glob.glob(pattern_path))
    return files

def modify_clip(path, output):
    """Handle conversion of a video file.

    Args:
        path (str): The path to the video file to be converted.
        output (str): The filename to associate with the converted file.
    """
    clip = VideoFileClip(path)
    clip = clip.rotate(270)
    clip = clip.crop(x_center=540, y_center=960, width=1080, height=608)
    clip = clip.resize(width=1920)
    clip.write_videofile(output, codec='libx264')
    print('File: {} should have been created.'.format(output))

if __name__ == '__main__':
    status = 'Failed!'
    parser = OptionParser(version='%prog 1.0.0')
    parser.add_option('-p', '--path', action='store', dest='dir_path',
                      default='.', type='string',
                      help='the path of the directory of assets, defaults to .')
    options, args = parser.parse_args()
    print('Running against directory path: {}'.format(options.dir_path))
    path_correct = raw_input('Is that correct?').lower()
    if path_correct.startswith('y'):
        dir_paths = get_dir_files(options.dir_path)
        for dir_path in dir_paths:
            output_filename = 'converted_' + os.path.basename(dir_path)
            modify_clip(path=dir_path, output=output_filename)
        status = 'Successful!'
    print('Conversion {}'.format(status))
With the above example, you can simply drop that file into the directory of assets you wish to convert and run python this_file.py, and it should convert the files for you in the same directory, with each name prepended with converted_.
Likewise, you can drop that file anywhere and run it against an absolute path:
python this_file.py -p /Users/thisguy/media and it will convert all files with the extensions ['*.mp4', '*.avi', '*.mov', '*.flv'].
Either way, let me know if you have any questions (or if this resolves your issue) and I'll do my best to help you out!
Thanks for using moviepy!
