I have this program to find large files:
import os, time, shelve

start = time.time()
root = '/'
# errors = set()
# dirs = set()
while True:
    try:
        root = os.path.abspath(root)  # ensure it's an absolute path
        # set the baseline at 100 MB (2**20 bytes is 1 MB)
        baseline = 100 * 2**20
        # set up a list to collect the large files
        large_files = []
        # root is a better choice as the starting point
        for foldername, subfolders, files in os.walk(root):
            for f in files:
                # print(f"{foldername}, {f}")
                abspath = os.path.join(foldername, f)
                size = os.path.getsize(abspath)
                if size >= baseline:
                    large_files.append((os.path.basename(abspath), size))
                    print(abspath, size/(2**20))
        # write the large files to a shelf
        shelf = shelve.open('/root/large_files.db')
        shelf["large_files"] = large_files
        shelf.close()
        if subfolders == []:
            end = time.time()
            break
    except (PermissionError, FileNotFoundError) as e:
        # errors.add(e)
        pass
It consistently outputs identical results:
[root#iz2ze9wve43n2nyuvmsfx5z ~]# python3 search_large_files.py
/dev/core 134217726.0078125
/dev/core 134217726.0078125
/dev/core 134217726.0078125
....
However, I see no reason why
print(abspath, size/(2**20))
should keep printing the same line over and over.
What might the problem be in my code?
You have an infinite outer loop with while True:, and apparently /dev/core is the only file in your filesystem that exceeds the size specified by baseline, so the script keeps finding (and printing) the same file over and over again.
Remove while True: and un-indent the block inside, and your code should work.
Note that your if subfolders == []: condition is outside your for foldername, subfolders, files in os.walk(root): loop and would therefore not be useful. You should record the end time unconditionally anyway, so simply remove the if condition and the break statement as well.
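Putting those fixes together, a corrected sketch (untested; I also moved the try/except down to the one call that can actually raise, so a permission error skips a single file instead of abandoning the walk and starting it over):

import os, time, shelve

start = time.time()
root = os.path.abspath('/')
baseline = 100 * 2**20  # 100 MB; 2**20 bytes is 1 MB
large_files = []
for foldername, subfolders, files in os.walk(root):
    for f in files:
        abspath = os.path.join(foldername, f)
        try:
            size = os.path.getsize(abspath)
        except (PermissionError, FileNotFoundError):
            continue  # unreadable, or the file vanished mid-walk
        if size >= baseline:
            large_files.append((os.path.basename(abspath), size))
            print(abspath, size/(2**20))
# write the large files to the shelf once, after the walk
shelf = shelve.open('/root/large_files.db')
shelf["large_files"] = large_files
shelf.close()
end = time.time()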
Well, there's a thing I have to do: count files with or without hidden files, with or without recursion, and with or without a certain extension (it's up to the user; this is a CLI tool). The problem is with hidden files.
My method:
if namespace.recursive == True:
    for files in os.walk(top=namespace.path, topdown=True):
        for i in files[2]:
            countF += 1
    print('Number of files in directory (with recursion): ', countF)
else:
    p = Path(namespace.path)
    for subdirs in p.iterdir():
        if (subdirs.is_file()):
            count += 1
    print('Number of files in directory (without recursion): ', count)
counts files WITH the hidden ones.
What I want to do: I want this method to count files WITHOUT the hidden ones. But if a user inputs -h parameter, I want to count ONLY hidden files. So I tried to do a check-method for it:
def check_attributes(filename):
    if(os.path.isfile(filename)):
        return win32api.GetFileAttributes(filename) & win32con.FILE_ATTRIBUTE_HIDDEN
    else:
        return 0
and then I tried to modify my method and add after
for i in files[2]:
something like:
if check_attributes(f) == 0: #if it's not hidden - then count
But it still counts with hidden files. I want to understand how to do it right.
Thank you so much in advance for every answer!
EDIT: full function with checking
def countFiles():
    countF = int(0)
    count = int(0)
    c = int(0)
    try:
        if namespace.extension == '.':
            if namespace.recursive == True:
                if namespace.hidden == False:
                    for files in os.walk(top=namespace.path, topdown=True):
                        for i in files[2]:
                            if check_attributes(i) == 0:
                                countF += 1
                    print('Number of files in directory (with recursion): ', countF)
            else:
                if namespace.hidden == False:
                    p = Path(namespace.path)
                    for subdirs in p.iterdir():
                        if (subdirs.is_file()):
                            count += 1
                    print('Number of files in directory (without recursion): ', count)
        else:
            if namespace.recursive == True:
                for files in os.walk(namespace.path):
                    for f in files[2]:
                        if os.path.splitext(f)[1] == namespace.extension:
                            c += 1
                print('Number of files with extension ' + namespace.extension + ' in directory (with recursion):', c)
            else:
                for files in os.listdir(namespace.path):
                    if os.path.splitext(files)[1] == namespace.extension:
                        c += 1
                print('Number of files with extension ' + namespace.extension + ' in directory (without recursion): ', c)
    except Exception as e:
        print('Error:\n', e)
        sys.exit(0)
In your original code, there are multiple boolean args creating different paths. Your extension == '.' path was the only one where check_attributes was being called, from what I can tell, so that might have been the issue. I decided to take a crack at rewriting it. My rewrite has two phases: 1. get the files, either recursively or not, then 2. filter the files with the args provided. Here's what I came up with:
import argparse
import os

import win32api
import win32con

def count_files(args):
    files = []
    # Get the files differently based on whether recursive or not.
    if args.recursive:
        # Note here I changed how you're iterating. os.walk yields tuples, so
        # you can unpack the tuple in your for. current_dir is the current dir it's in
        # while walking and found_files are all the files in that dir.
        for current_dir, dirs, found_files in os.walk(top=args.path, topdown=True):
            files += [os.path.join(current_dir, found_file) for found_file in found_files]
    else:
        # Note the os.path.join with the dir each file is in. It's important to store the
        # absolute path of each file.
        files += [os.path.join(args.path, found_file) for found_file in os.listdir(args.path)
                  if os.path.isfile(os.path.join(args.path, found_file))]

    filtered_files = []
    for found_file in files:
        print(found_file)
        if not args.hidden and (win32api.GetFileAttributes(found_file) & win32con.FILE_ATTRIBUTE_HIDDEN):
            continue  # hidden == False and file has hidden attribute, go to next one
        if args.extension and not found_file.endswith(args.extension):
            continue  # File doesn't end in provided extension
        filtered_files.append(found_file)

    print(f'Length: {len(filtered_files)}')
    return len(filtered_files)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Count files, optionally filtered.')
    # Note that I took advantage of some other argparse features here like
    # required vs optional arguments and boolean flags.
    parser.add_argument('path')
    parser.add_argument('--recursive', action='store_true', default=False)
    parser.add_argument('--hidden', action='store_true', default=False)
    parser.add_argument('--extension', type=str)
    args = parser.parse_args()
    count_files(args)
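Assuming the script is saved as count_files.py (name hypothetical), usage would look something like:

python count_files.py C:\Users\me\Documents --recursive --extension .txt
python count_files.py C:\Users\me\Documents --hidden

Two things to note. As written, --hidden means "also include hidden files in the count" rather than "count only hidden files", so for your -h behavior you would invert the attribute check instead of skipping it. And because every stored path is joined with its directory, GetFileAttributes always receives a path it can actually resolve, which a bare filename out of files[2] is not.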
A script was supplied to me in order to upload files to a cloud bucket. You input the dir where the files you want to upload are and bingo bango, done.
What needs to happen is that there are additional sub dirs with their own files in them that I would like to transfer as well based on the input of the root dir. They would need to retain their tree structure relative to the root dir input.
Using the current code I get a write error/access denied failure. I know this is because the for loop uses os.listdir, which doesn't descend into the extra sub dirs and files, but I'm not sure how to modify it.
I attempted to get all the information I needed using os.walk and parsing that out. I verified with some print tests that it was looking in the right place for everything. However I hit a wall when I got this error when running the script:
folder\folder\lib\ntpath.py", line 76, in join
path = os.fspath(path)
TypeError: expected str, bytes or os.PathLike object, not list
I understand that something is being generated as a list when it shouldn't be but I'm not sure how to go about this...
This is the original script provided to me below. I have added the variable at the top just to be a little less abstract.
local_directory_path = r'C:\folder\folder\sync\FROM_LOCAL_UPLOAD'

def upload_folder_to_cloud(self, mount_id, local_directory_path):
    ''' This method will list every file at the local_directory_path and then, for each,
    it will call the api method athera.sync.upload_file for every file in your local directory
    '''
    _, destination_folder = os.path.split(local_directory_path)
    if not destination_folder:
        self.logger.error("Make sure the provided 'local_directory_path' does not end with a '/' or a '\\'")
        sys.exit(2)
    destination_folder = destination_folder + "/"
    self.logger.info("Folder = {}".format(destination_folder))
    for filename in os.listdir(local_directory_path):
        destination_path = destination_folder + filename
        filepath = os.path.join(local_directory_path, filename)
        with open(filepath, "rb") as f:
            _, err = self.client.upload_file(self.group_id, mount_id, f, destination_path=destination_path,)
            if err != None:
                self.logger.error(err)
                sys.exit(4)
    return destination_folder
This is what I modified it to as a test:
for root, dirs, files in os.walk(local_directory_path):
srcFile = (os.path.join(files))
srcRoot = (os.path.join(root))
rootSplit = os.path.normpath(srcRoot).split(os.path.sep)
srcDirs = '/'.join(rootSplit[4:])
src = str('fixLocalFolder') + '/' + str(srcDirs) +'/'+ (files)
dst = str(srcDirs) + '/' + (files)
destination_folder = str(srcRoot) + "/"
destination_path = str(destination_folder) + str(srcFile)
filepath = os.path.join((str(srcDirs), str(srcFile)))
with open(filepath, "rb") as f:
_, err = self.client.upload_file(
self.group_id,
mount_id,
f,
destination_path=destination_path,
)
if err != None:
self.logger.error(err)
sys.exit(4)
return destination_folder
I do not code for a living so I am sure I am not going about this the right way. I apologize for any code atrocities in advance. Thank you!
I do see some issues in that code, even without testing it. Something like the following might work for that loop. (Note! Untested!).
for root, dirs, files in os.walk(local_directory_path):
    # Iterate through files in the currently processed directory
    for current_file in files:
        # Full path to the file
        src_file = os.path.join(root, current_file)
        # Get the sub-path relative to the original root
        sub_path = os.path.relpath(root, start=local_directory_path)
        # Get the destination path
        destination_path = os.path.join(sub_path, current_file)
        with open(src_file, "rb") as f:
            _, err = self.client.upload_file(
                self.group_id,
                mount_id,
                f,
                destination_path=destination_path,
            )
            if err != None:
                self.logger.error(err)
                sys.exit(4)
I believe your central problem was misunderstanding what os.walk gives you. It gives you a listing of each directory (and subdirectory), one after another.
Thus the values of the first few iterations might look like this (when listing /mydir):
# First iteration:
root = "/mydir"
dirs = ["subdir", ...]
files = ["something.doc", "something else.txt"]
# Second iteration:
root = "/mydir/subdir"
dirs = ["sub-sub-dir1", ...]
files = ["file1.txt", "file2.txt", ...]
I work on several computers and need to continuously update text and .py files on multiple devices.
I would like to run a program that scans the thumb drive I carry my work on and compare it to the folders on the parent directory at the main office. If there were any edits since last port, I want to upload the new file.
Comparing the sets of the two folders' files was my first idea. But those lists are way longer than expected, since many of the files haven't changed since I moved them. I'm worried that if I go with this approach, I would be recopying far more than needed each time.
It seems like an easier approach either already exists (I don't know where to look) or I'm looking at the wrong piece of data (I don't know what to inspect for each file). Can you help?
My best attempt so far is as follows:
import os
import fnmatch
from os.path import basename as base

def fileWalkIn(path='.', matches=[], filt='*.csv'):
    for root, dirnames, filenames in os.walk(path):
        for filename in fnmatch.filter(filenames, filt):
            matches.append(os.path.join(root, filename))
            yield os.path.join(root, filename)

def main():
    pys = r'K:\location\of\code\Python'
    drives = ['D', 'E', 'F', 'G', 'H']
    for usb in drives:
        loc = usb + ":\\SDI"
        if os.path.exists(loc): zDr = loc
    ls = fileWalkIn(pys, [], filt='*.py')
    timesHm = []
    mchHm = []
    for send in ls:
        part = os.path.getmtime(send)
        timesHm.append((base(send), part))
        mchHm.append(send)
    ls = fileWalkIn(zDr, [], filt='*.py')
    timesDr = []
    mchDr = []
    for send in ls:
        part = os.path.getmtime(send)
        timesDr.append((base(send), part))
        mchDr.append(send)
    # (basename, mtime) pairs on the drive that don't match any pair at home
    ls = list(set(timesDr) - set(timesHm))
    newDr = []
    newHm = []
    for piece in ls:
        for elem in timesHm:
            if piece[0] == elem[0]: a = elem
        for elem in timesDr:
            if piece[0] == elem[0]: b = elem
        if a[1] > b[1]:
            newHm.append(piece)
        else:
            newDr.append(piece)
    return newHm, newDr

if __name__ == "__main__":
    newHm, newDr = main()
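For what it's worth, the standard library already targets exactly this "has it changed?" question: filecmp. With shallow=True it compares os.stat() signatures (type, size, mtime) instead of reading file contents, so unchanged files are cheap to skip. A minimal non-recursive sketch (both directory paths hypothetical):

import filecmp
import os

office = r'K:\location\of\code\Python'  # hypothetical
thumb = r'E:\SDI'                       # hypothetical

# Names present as files in both directories
common = [f for f in os.listdir(thumb)
          if os.path.isfile(os.path.join(thumb, f))
          and os.path.isfile(os.path.join(office, f))]

# shallow=True: compare stat signatures, not contents
match, mismatch, errors = filecmp.cmpfiles(office, thumb, common, shallow=True)
print('Changed since last port:', mismatch)

Only the files in mismatch would need recopying.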
I'm writing yet another Python purge script. This replaces a very old bash script with tons of find -delete, which takes up to 9h to purge our video backend.
I know there are tons of these either on Stack or right in Google, but the thing is I have a few more constraints, which left me writing what I find to be poor/inefficient code.
consider the following dir structure:
/data/channel1/video_800/0001/somefile_800_001.ts
/data/channel1/video_800/0001/somefile_800_002.ts
/data/channel1/video_800/0002/somediffile_800_001.ts
/data/channel1/video_800/0002/somediffile_800_002.ts
/data/channel1/video_800.m3u8
/data/channel1/video_900/0001/someotherfile_900_001.ts
/data/channel1/video_900/0002/afile_900_001.ts
/data/channel1/video_900/0003/bfile_900_001.ts
/data/channel1/video_900/0003/cfile_900_001.ts
/data/channel1/video_900.m3u8
/data/channel2/video_800/0001/againsomefile_800_001.ts
/data/channel2/video_800/0001/againsomefile_800_001.ts
/data/channel2/video_800.m3u8
/data/sport_channel/video_1000/0001/somefile.ts
/data/sport_channel/video_1000/0001/somefile2.ts
The first thing that interests me is the channel name, since there is a rule for channel* and one for sport*.
The second thing is the end of the video dirs, which equals the bitrate... 800, 900, 1000, since these can have different retention days.
Finally, I go through everything and remove files based on bitrate and extension.
The code below works but is overly complicated, and I'm sure it's not very pythonic. Since what I care about most in this case is performance, I'm sure there is a more efficient way to do this. Stacking for loop inside for loop is not only poor design but also gets me a 'find_files' is too complex [mccabe] warning in pymode.
** Left the remove function out of the code example, but it's just a plain try/except using os.rmdir and os.remove.
I'm open to all suggestions for improving my code.
Thanks!
#!/usr/bin/python
import os
import time
import fnmatch

path = '/data'
debits_short = ['200', '700', '1000', '1300', '2500']
debits_long = ['400', '1800']

def find_files(chan_name, debits, duration):
    time_in_secs = time.time() - (duration * 24 * 60 * 60)
    # List channels
    for channel in os.listdir(path):
        # Match category channels
        if fnmatch.fnmatch(channel, chan_name):
            # Go through bitrates
            for debit in debits:
                # Channel path is now the default search path
                channel_path = os.path.join(path, channel)
                # Walk the channel path to match bitrate files
                for root, dirs, files in os.walk(channel_path, topdown=False):
                    for filename in files:
                        # Remove files that contain _bitrate_ and end with .ts
                        if '_' + debit + '_' in filename:
                            if filename.endswith('.ts'):
                                if os.path.isfile(os.path.join(root, filename)):
                                    if os.stat(os.path.join(root, filename)).st_mtime <= time_in_secs:
                                        remove(os.path.join(root, filename))
                        # Remove playlist files that contain bitrate.m3u8
                        if filename.endswith(debit + '.m3u8'):
                            if os.path.isfile(os.path.join(root, filename)):
                                if os.stat(os.path.join(root, filename)).st_mtime <= time_in_secs:
                                    remove(os.path.join(root, filename))
                    # Remove empty dirs
                    for dir in dirs:
                        if not os.listdir(os.path.join(root, dir)):
                            remove(os.path.join(root, dir))

find_files('channel*', debits_long, 3)
find_files('sport*', debits_short, 7)
Here's a possible approach:
import os
import glob
import time

class Purge(object):

    removable_extensions = ['ts', 'm3u8']

    def __init__(self, basedir, channel_pattern, debits,
                 older_than_days, test_mode=False):
        self.basedir = basedir
        self.channel_pattern = channel_pattern
        self.debits = debits
        self.older_than_secs = time.time() - 24*60*60*older_than_days
        self.test_mode = test_mode  # If `True`, do not delete files.

    def delete_file(self, filepath):
        try:
            os.remove(filepath)
        except OSError:
            pass

    def file_for_deletion(self, filepath):
        # Return `True` if a file meets all conditions for deletion.
        filename, ext = os.path.splitext(os.path.basename(filepath))
        condition_ext = ext[1:] in self.removable_extensions
        condition_old = os.stat(filepath).st_mtime <= self.older_than_secs
        condition_deb = any(
            '_{}_'.format(d) in filename or filename.endswith(d)
            for d in self.debits
        )
        return all((condition_ext, condition_old, condition_deb))

    def purge_channel(self, channel_dir):
        for root, dirs, files in os.walk(channel_dir):
            for name in files:
                filepath = os.path.join(root, name)
                if self.file_for_deletion(filepath):
                    print(filepath)
                    if not self.test_mode:
                        self.delete_file(filepath)
        #TODO: delete empty directories here.

    def purge(self):
        channels = glob.glob(os.path.join(self.basedir, self.channel_pattern))
        for channel_dir in channels:
            self.purge_channel(channel_dir)

if __name__ == '__main__':
    purge_job_info = dict(
        basedir=r'path/to/data',  # All channel folders live here.
        channel_pattern='channel*',  # `glob` pattern.
        debits=['400', '1800'],
        older_than_days=7,
    )
    p = Purge(**purge_job_info)
    p.test_mode = True
    p.purge()
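For the #TODO, one way to remove the directories a purge leaves empty (a sketch along the same lines; walking bottom-up so each directory is visited after its children):

import os

def delete_empty_dirs(channel_dir):
    # topdown=False visits leaf directories first, so a parent that only
    # contained now-deleted subdirectories is itself seen as empty.
    for root, dirs, files in os.walk(channel_dir, topdown=False):
        for d in dirs:
            dirpath = os.path.join(root, d)
            if not os.listdir(dirpath):
                try:
                    os.rmdir(dirpath)  # only succeeds on empty directories
                except OSError:
                    pass

This could be called at the end of purge_channel, guarded by the same test_mode flag.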
I have a Python script which transfers files from folder A to folder B. If there are many files in that folder (30, for example), I need to transfer only 5 files at a time.
Here is the code:
#!/usr/bin/python
import os, sys, time

src_path = "/opt/tst1/"
dst_path = "/opt/tst1/consume/"
now = time.time()
cutoff = now - (5 * 60)

count = 5
files = os.listdir(src_path)
for f in files:
    fn = src_path + f
    if not os.path.isfile(fn):
        continue
    t = os.stat(fn)
    c = t.st_ctime
    if c > cutoff:
        continue
    # move the file
    dfn = dst_path + f
    os.rename(fn, dfn)
    count -= 1
    if count == 0:
        break
It copies the entire contents of the folder from one folder to another, as opposed to moving only 5 files at a time. Is there anything that needs to be added?
This code will send 5 files at a time until all files are exhausted:
files = os.listdir(".")
while files:
    print("COPY 5")
    for i in range(5):
        try:
            next_file = files.pop()  # get the next file if we can
        except IndexError:
            print("DONE!")  # if we can't, we are done
            break
        print(next_file)
        do_something(next_file)
    print("Resting")
    time.sleep(however_long)
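An equivalent way to batch without mutating the list is to step through it in slices. A self-contained sketch (the do_something body and the one-second pause are stand-ins for the real move logic and rest interval):

import os
import time

def do_something(path):
    print(path)  # stand-in for the actual move/copy

files = os.listdir(".")
for i in range(0, len(files), 5):
    batch = files[i:i + 5]  # at most 5 files per pass
    for next_file in batch:
        do_something(next_file)
    time.sleep(1)  # rest between batches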