I have a folder with multiple PDFs with datestamps at the end of their names e.g.
hello_20200820.pdf
hello_20200821.pdf
hello_20200822.pdf
hello_20200717.pdf
I am trying to write a function to remove all the PDFs in the folder other than the TWO most recent pdf's.
The code I have written however is deleting the only the 3rd most recent file or the oldest file if there are less than 3. How can I fix this and remove ALL pdfs with the name 'hello' other than the two most recent?
Here is my code so far:
def remove_old_pdf(wsp, folder):
date_diff = float('inf')
today = datetime.now()
filename = ''
files = os.listdir('PDFs/' + folder)
# print(files)
for file in files:
if file.endswith('.pdf') and wsp in file:
date_str = file.split('_')[1].split('.')[0]
curr_diff = today - datetime.strptime(date_str, '%Y%m%d')
if date_diff == float('inf') or curr_diff < date_diff:
date_diff = curr_diff
filename = file
# print(filename)
else:
pass
print(filename)
files.remove(filename)
# print(files)
for file in files:
if file.endswith('.pdf') and wsp in file:
date_str = file.split('_')[1].split('.')[0]
curr_diff = today - datetime.strptime(date_str, '%Y%m%d')
filename = file
else:
pass
if filename in files:
files.remove(filename)
print(filename)
else:
print('lol')
# print(files)
for file in files:
if file.endswith('.pdf') and wsp in file:
date_str = file.split('_')[1].split('.')[0]
curr_diff = today - datetime.strptime(date_str, '%Y%m%d')
filename = file
else:
pass
delFile = 'PDFs/' + folder + '/' + filename
finalFiles = os.listdir('PDFs/' + folder)
if filename in finalFiles:
os.remove('PDFs/' + folder + '/' + filename)
print('Deleted ' + filename +'.')
else:
print("No PDFs deleted")
You can use glob to list all the files that match, restrict to the first n-2 and delete those:
import os
from glob import glob
dryrun = True # change this to False to actually delete
wc = 'hello_????????.pdf'
for name in sorted(glob(wc))[:-2]:
print(f'delete {name}{" (DRY-RUN)" if dryrun else ""}')
if not dryrun:
os.unlink(name)
Note: personally I always prefer to have globs that are as strict as possible. So I often define something like:
wildcards = {
'Y': '[12][0-9][0-9][0-9]',
'm': '[01][0-9]',
'd': '[0-3][0-9]',
'H': '[0-2][0-9]',
'M': '[0-5][0-9]',
'S': '[0-5][0-9]',
}
# and then:
ymdglob = ''.join([wildcards[datepart] for datepart in 'Ymd'])
wc = f'hello_{ymdglob}.pdf'
# etc.
Related
I have the following files in txt format:
Expected File Format: I want to remove prefix from file name that is 1. a1. and while renaming if the file already present with same name then append _1, _2 to the file as given below in example.
My try:
import os
import re
import shutil
import argparse
pattern = "a1"
path = "/Users/a1/Documents/Files"
count = 0
p = ".* "+str(pattern)+".(.+)"
for root, dirs, files in os.walk(path):
for file in files:
m = re.match(p, file)
if m is not None:
file_new = m.group(1)
if not os.path.exists(os.path.join(root,file_new)):
os.rename(os.path.join(root, file), os.path.join(root,file_new))
else:
count = count + 1
file_new = m.group(1)+"_"+str(count)
os.rename(os.path.join(root, file), os.path.join(root,file_new))
And this is what the output I'm getting:
You can use Dict for saving the count of repeating each file_name and use saving count in Dict for renaming.
import os
import re
pattern = "a1"
path = "Files/"
dct = {} # <- adding this
for root, dirs, files in os.walk(path):
for file in files:
if pattern in file:
file_new = file.split(pattern, 1)[1]
if not file_new in dct: # <- adding this
os.rename(os.path.join(root, file),
os.path.join(root,file_new[1:]))
dct[file_new] = 1 # <- adding this
else:
num = dct[file_new] # <- adding this
dct[file_new] += 1 # <- adding this
file_name, file_type = file_new[1:].split('.')
os.rename(os.path.join(root, file),
os.path.join(root, f'{file_name}_{num}.{file_type}'))
Filename before renaming:
Filename after renaming:
I have a python script that list folder and files existing in a given path.
What i want is to be able to check if the existing folder start with string "pdf" + date
like this: pdf 18-19-06-2020. If the folder start with just pdf and the date is not in the format "dd-dd-mm-yyyy" i need to convert the name to the required format.
I am getting the current date and the date of yesterday.
code:
#packages for list and copy folders & files.
import calendar
import os
import shutil
from os import path
from datetime import date
def main():
copy(src)
'''
FUNCTION THAT calculate current date and 2 dates before
'''
def yesterday():
days=[]
day = int(date.today().strftime("%d"))
month = int(date.today().strftime("%m"))
year = int(date.today().strftime("%Y"))
if day != 1:
p = day -1
p1 = day -2
p2 = day -3
print("******",p)
print("******",p1)
print("******",p2)
days.append(p)
days.append(p1)
days.append(p2)
print("******",days,"********")
return p
else:
p = 32 -1
print("******",p)
return p
long_months = [1, 3, 5, 7, 8, 10, 12]
if month in long_months:
print(32 -1)
return(32-1)
elif month == 2:
if calendar.isleap(year):
return 29
return 28
else:
return 30
dst = "E:/KRD2018_Data"
dst2 = "F:/ABpro"
dst3 = "C:/Users/cvd/Documents"
'''
FUNCTION THAT list the folders and files exist on the USB drive and copy the pdfs and docs to their destinations
and copy the pdfs in the existing folder to the specified destination
'''
def copy(src):
#name = folder pdf yesterday + today
#pdf dd-dd-mm-yyyy ==> 3-04-05-2020
datefile = "pdf " + str(yesterday()) + date.today().strftime("-%d-%m-%Y")
src2 = os.path.join(src, datefile)
ignore_list=["$RECYCLE.BIN","System Volume Information"]
i=0
j=0
z=0
for dirpath, dirnames, files in os.walk(src, topdown=True):
print(f'Found directory: {dirpath}')
if len(dirnames)==0 and len(files)==0:
print("this directory is empty")
continue
# exclude the ignore list from the os.walk
dirnames[:] = [d for d in dirnames if d not in ignore_list]
# check if the path is directory
isdir = os.path.isdir(dirpath)
print(isdir)
for file in files:
full_file_name = os.path.join(dirpath, file)
if os.path.join(dirpath) == src:
if file.endswith("pdf"):
if not os.path.exists(dst2):
os.mkdir(dst2)
else:
print("the path alredy exist")
# shutil.copy(full_file_name, dst2)
i+=1
elif file.endswith("docx") or file.endswith("doc"):
# shutil.copy(full_file_name, dst)
j+=1
elif os.path.join(dirpath)== src2:
if file.endswith("pdf"):
numfile = len(files)
# shutil.copy(full_file_name, dst3)
z+=1
print("*******number of directories = {}".format(len(dirnames)))
print("*******number of files = {}".format(len(files)))
print("{} word file \n".format(j))
print("{} pdf files \n".format(z))
print("{} other files \n".format(i))
print("total copied files {}".format(i+j+z))
if __name__=="__main__":
main()
I wrote a Python script that collects file metadata (filename, creation date, creation time, last modified data, last modified time) from a file directory. However, when the directory is a path that is located in an external hard drive the script doesn't work. I can't figure out why.
Here is the code:
import os
from os.path import basename
import datetime
import time
def getSize(filename):
st = os.stat(filename)
print st
return st.st_size
#get last modified date
def getMTime(filename):
fileModTime = os.path.getmtime(filename)
return fileModTime
#get creation date
def getCTime(filename):
fileModTime = os.path.getctime(filename)
return fileModTime
#get data from directory
MyDirectory = "H:\0_tempfiles\150115_Portfolio\Work\Work\BarBackUp"
MyExtension = ".jpg"
#write to file
WorkingDirectory = "C:\\Users\Admin\Downloads\demo\\"
MyTxtFile = WorkingDirectory + "fileData6.txt"
delim = ";"
with open(MyTxtFile, 'wb') as f:
f.write(delim.join(["FILENAME", "FILESIZE", "mDATE","mTIME",
"cDATE","cTIME"]) + "\n")
for root, dirs, files in os.walk(MyDirectory):
for file in files:
if file.endswith(MyExtension):
#get File Name
a = (os.path.join(root, file))
#print a
filename = a
MyFileName = basename(a)
#get File Size
MyFileSize = getSize(filename) / 1000
print MyFileName + " >>> file size: " + str(MyFileSize) + "Kb"
#get modification time V2
modTimeV2 = getMTime(filename)
modTimeV2 = time.strftime("%Y/%d/%m;%I:%M:%S %p", \
time.localtime(modTimeV2))
print "time modified: " + str(modTimeV2)
#get creation time
creTime = getCTime(filename)
creTime = time.strftime("%Y/%d/%m;%I:%M:%S %p", \
time.localtime(creTime))
print "time created: " + str(creTime)
#--------
#write data to file
entry = delim.join([str(MyFileName), str(MyFileSize), \
str(modTimeV2), str(creTime)]) + "\n"
f.write(entry)
print "<<<<<<everything went fine>>>>>>"
Your code works fine for me. Your "MyDirectory" variable has escape characters in it. Try adding an r in front of the quotations:
MyDirectory = r"H:\0_tempfiles\150115_Portfolio\Work\Work\BarBackUp"
or
MyDirectory = "H:/0_tempfiles/150115_Portfolio/Work/Work/BarBackUp"
or
MyDirectory = "H:\\0_tempfiles\\150115_Portfolio\\Work\\Work\\BarBackUp"
I have the following script -
import os
import stat
import zipfile
from datetime import datetime, timedelta
import logging
logfile = 'D:\\logfiles\\MasterLogsArchive\\archive.log'
logging.basicConfig(filename=logfile, format='%(asctime)s %(message)s', level=logging.DEBUG)
try:
import zlib
compression = zipfile.ZIP_DEFLATED
except:
compression = zipfile.ZIP_STORED
modes = { zipfile.ZIP_DEFLATED: 'deflated',
zipfile.ZIP_STORED: 'stored',
}
def modified_date(filename):
return datetime.fromtimestamp(os.stat(filename)[stat.ST_MTIME])
def find_between( s, first, last ):
try:
start = s.index( first ) + len( first )
end = s.index( last, start )
return s[start:end]
except ValueError:
return ""
move_date = datetime.now() - timedelta(minutes = 2)
src = "D:\\program files (x86)\\TIDAL\\Scheduler\\Master\\log"
for filename in os.listdir(src):
full_filename = os.path.join(src, filename)
scheduler = os.path.join(src, 'scheduler.out')
if modified_date(full_filename) < move_date and filename.startswith('Master'):
filedate = find_between(filename, '-', '.')[:-7]
date = filedate[:-2]
year = filedate[:-6]
month = filedate[4:-4]
day = filedate[6:-2]
hour = filedate[8:]
dest = "D:\\logfiles\\MasterLogsArchive\\" + date
if not os.path.exists(dest):
os.makedirs(dest)
zf = dest + '\\' + 'Master%s%s%s-%s.zip' % (year, month, day, hour)
## add Master Logs
if (os.path.isfile(full_filename)):
if (os.path.isfile(zf)):
try:
logging.info('%s is archived' % full_filename)
zip = zipfile.ZipFile(zf, mode='a')
zip.write(full_filename, compress_type=compression)
os.remove(full_filename)
finally:
zip.close()
else:
try:
logging.info('%s is archived' % full_filename)
zip = zipfile.ZipFile(dest + '\\' + 'Master%s%s%s-%s.zip' % (year, month, day, hour), mode='w')
zip.write(full_filename, compress_type=compression)
os.remove(full_filename)
finally:
zip.close()
The problem I'm having is that in compression it's doing the full path which I don't want. I only want the file in the zip. If I change the zip.write to do 'filename' instead of 'full_filename' it then complains it can't find the file.
So how do I get the write to know what folder to grab the file out of?
The actual write needs to be changed to the following -
zip.write(full_filename, os.path.basename(full_filename), compress_type=compression)
Got the answer from here -
How can I zip file with a flattened directory structure using Zipfile in Python?
I want to download some files and save them in a folder and there may be some duplication in file names, so I want to avoid this to happen.
I think it needs an auto-naming system but now i don't know how to make it.
I used shutil and urllib2 to write my function.
This is a part of my code :
path = 'C:/DL/Others/'+filename+file_ext
with open(path, 'wb') as fp:
shutil.copyfileobj(req, fp)
As you know we can check that if a file exists or not by os.path.exists('path').
I wanna to rename my files and save them to avoid duplicated names using a pattern, for example by adding a number to file name.So if there was 4 files with same name, "fname", I want 4 files in this pattern :
fname - fname(1) - fname(2) - fname(3)
Something like this is probably reasonable:
path = 'c:/DL/Others/%s%s' % (filename, file_ext)
uniq = 1
while os.path.exists(path):
path = 'c:/DL/Others/%s_%d%s' % (filename, uniq, file_ext)
uniq += 1
If the original path doesn't exist you get no _1, but if it does exist it'll count up until it finds one that's free.
Track each filename's count as you create it:
fname_counts = {}
# ... whatever generates filename and file_ext goes here...
if filename + file_ext in fname_counts:
fname_counts[filename + file_ext] += 1
else:
fname_counts[filename + file_ext] = 0
# now check if it's a dupe when you create the path
if fname_counts[filename + file_ext]:
path = 'C:/DL/Others/%s_%s.%s' % (filename, fname_counts[filename + file_ext], file_ext)
else:
path = 'C:/DL/Others/' + filename + file_ext
Example at work with two duplicates ("test.txt"):
>>> filenames_and_exts = [('test', '.txt'), ('test', '.txt'), ('test2', '.txt'), ('test', '.cfg'), ('different_name', '.txt')]
>>> fname_counts = {}
>>> for filename, file_ext in filenames_and_exts:
if filename + file_ext in fname_counts:
fname_counts[filename + file_ext] += 1
else:
fname_counts[filename + file_ext] = 0
if fname_counts[filename + file_ext]:
path = 'C:/DL/Others/%s_%s%s' % (filename, fname_counts[filename + file_ext], file_ext)
else:
path = 'C:/DL/Others/' + filename + file_ext
print path
C:/DL/Others/test.txt
C:/DL/Others/test_1.txt
C:/DL/Others/test2.txt
C:/DL/Others/test.cfg
C:/DL/Others/different_name.txt