Comparing file creation date - python

I am trying to archive old files based on creation date. I have data starting from 12-17-2010, so I am setting this as the base date and incrementing from there. Here is my code:
import os, time, tarfile
from datetime import datetime, date, timedelta
import datetime

path = "/home/appins/.scripts/test/"
count = 0
set_date = '2010-12-17'
date = datetime.datetime.strptime(set_date, '%Y-%m-%d')

while (count < 2):
    date += datetime.timedelta(days=1)
    tar_file = "nas_archive_"+date.strftime('%m-%d-%y')+".tgz"
    log_file = "archive_log_"+date.strftime('%m-%d-%y')
    fcount = 0
    f = open(log_file,'ab+')
    #print date.strftime('%m-%d-%y')
    for root, subFolders, files in os.walk(path):
        for file in files:
            file = os.path.join(root,file)
            file = os.path.join(path, file)
            filecreation = os.path.getctime(file)
            print datetime.fromtimestamp(filecreation)," File Creation Date"
            print date.strftime('%m-%d-%y')," Base Date"
            if filecreation == date:
                tar.add(file)
                f.write(file + '\n')
                print file," is of matching date"
                fcount = fcount + 1
    f.close()
    count += 1
The filecreation variable is getting a float value. How can I use it to compare with my base date?

timestamp = time.mktime(date.timetuple())
The timestamp will contain a value comparable to those returned by getctime. Regarding the comment under the question: on Windows getctime returns the creation time; on Unix it returns the time of the last metadata change (http://docs.python.org/3.1/library/os.path.html).
EDIT (regarding questions in comment):
1) mktime is present in Python 2.x: http://docs.python.org/2/library/time.html#time.mktime
2) Get file creation time with Python on Linux
EDIT2:
Obviously this is stupid, and one should proceed as suggested by tdelaney below:
date.fromtimestamp(filecreation)
and compare dates, not timestamps. I wasn't looking at what the algorithm was actually doing :)
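For reference, a minimal sketch of the date-based comparison (my addition; it assumes the same test directory and base date as in the question, and only shows the matching step, not the tar/log handling):

import os
import datetime

path = "/home/appins/.scripts/test/"
base_date = datetime.date(2010, 12, 17)  # assumed base date from the question

for root, subdirs, files in os.walk(path):
    for name in files:
        full_path = os.path.join(root, name)
        # getctime() returns a float timestamp; convert it to a date first
        file_date = datetime.date.fromtimestamp(os.path.getctime(full_path))
        if file_date == base_date:
            print(full_path + " is of matching date")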

Related

How to select specific CSV files for a specified date range from a folder in Python?

I have a folder (in the same directory as the Python script) with a lot of CSV files, starting from 1st Jan to 31st Dec, and I want to read only the CSV files within a certain date range from the folder into Python, later appending them to a list.
The files are named as below and there are files for each day of multiple months:
BANK_NIFTY_5MINs_2020-02-01.csv, BANK_NIFTY_5MINs_2020-02-02.csv, ... BANK_NIFTY_5MINs_2020-02-28.csv, BANK_NIFTY_5MINs_2020-03-01, .... BANK_NIFTY_5MINs_2020-03-31 and so on.
Currently, I have code that fetches the CSV files for the whole month of March using the 'startswith' and 'endswith' syntax. However, doing this lets me target files for only one month at a time.
I want to be able to read multiple months of CSV files within a specified date range, for example Oct, Nov and Dec, or Feb and March (basically starting and ending at any month).
The following code gets only the files for March. I then fetch the files from the list and merge them into a dataframe.
# Accessing csv files from directory
startdate = datetime.strptime("2022-05-01", "%Y-%m-%d")
enddate = datetime.strptime("2022-06-30", "%Y-%m-%d")
all_files = []
path = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname('__file__')))
for root, dirs, files in os.walk(path):
    for file in files:
        if file.startswith("/BANK_NIFTY_5MINs_") and file.endswith(".csv"):
            file_date = datetime.strptime(os.path.basename(file), "BANK_NIFTY_5MINs_%Y-%m-%d.csv")
            if startdate <= file_date <= enddate:
                all_files.append(os.path.join(root, file))
The output of the above looks like:
'BANK_NIFTY_5MINs_2020-03-01.csv' and so on,
but it should be the entire path, for example:
'c:\Users\User123\Desktop\Myfolder\2020\BANK\BANK_NIFTY_5MINs_2020-03-01.csv'.
The merge function requires the list to contain complete paths in this format to process further.
I would take a different approach for more flexibility:
import os
from datetime import datetime
from pprint import pprint


def quick_str_to_date(s: str) -> datetime:
    return datetime.strptime(s, "%Y-%m-%d")


def get_file_by_date_range(path: str, startdate: datetime or str, enddate: datetime or str) -> list:
    if type(startdate) == str:
        startdate = quick_str_to_date(startdate)
    if type(enddate) == str:
        enddate = quick_str_to_date(enddate)
    result = []
    for root, dirs, files in os.walk(path):
        for filename in files:
            if filename.startswith("BANK_NIFTY_5MINs_") and filename.lower().endswith(".csv"):
                file_date = datetime.strptime(os.path.basename(filename), "BANK_NIFTY_5MINs_%Y-%m-%d.csv")
                if startdate <= file_date <= enddate:
                    # join with root so the result holds the full path
                    result.append(os.path.join(root, filename))
    return result


print("all")
pprint(get_file_by_date_range("/full/path/to/files", "2000-01-01", "2100-12-31"))
print("\nFebruary")
pprint(get_file_by_date_range("/full/path/to/files", "2020-02-01", "2020-02-28"))
print("\none day")
pprint(get_file_by_date_range("/full/path/to/files", "2020-02-01", "2020-02-01"))
output
all
['/full/path/to/files/BANK_NIFTY_5MINs_2020-02-02.csv',
'/full/path/to/files/BANK_NIFTY_5MINs_2020-02-28.csv',
'/full/path/to/files/BANK_NIFTY_5MINs_2020-03-01.csv',
'/full/path/to/files/BANK_NIFTY_5MINs_2020-03-31.csv',
'/full/path/to/files/BANK_NIFTY_5MINs_2020-02-01.csv']
February
['/full/path/to/files/BANK_NIFTY_5MINs_2020-02-02.csv',
'/full/path/to/files/BANK_NIFTY_5MINs_2020-02-28.csv',
'/full/path/to/files/BANK_NIFTY_5MINs_2020-02-01.csv']
one day
['/full/path/to/files/BANK_NIFTY_5MINs_2020-02-01.csv']
If you want to do it with regex, here it is:
# replace `file.startswith(...) and file.endswith(...)`
re.match('BANK_NIFTY_5MINs_2020-(02|03|10|11|12)-[0-9]+', file)
### ^^^^^^^^^^^^^^ Feb, Mar, Oct-Dec
It's the most basic one to get you going; it could be improved.
But in your case I'd go with simple glob:
all_files = glob.glob('./BANK_NIFTY_5MINs_2020-0[2-3]-*.csv')
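If the range does not line up with a single wildcard, a possible follow-up (a sketch of my own, not part of the answer above; the path and range are placeholders) is to glob everything and filter by the parsed date:

import glob
import os
from datetime import datetime

startdate = datetime(2020, 2, 1)
enddate = datetime(2020, 3, 31)

all_files = []
for path in glob.glob('/full/path/to/files/BANK_NIFTY_5MINs_*.csv'):
    # parse the embedded date and keep files inside the range
    file_date = datetime.strptime(os.path.basename(path),
                                  'BANK_NIFTY_5MINs_%Y-%m-%d.csv')
    if startdate <= file_date <= enddate:
        all_files.append(path)  # glob already returns the path as given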

How to check which string is the most recent calendar date in Python

I have a list of filename strings that all end with a date, e.g. hello_20200825.pdf, hello_20200720, etc. How can I strip the end of the string to get the date, and then check which string in the list has the most recent date and return that string? Here is my code so far:
import os
import datetime

def most_recent_file(region, wsp):
    path = r'PDFs/'+region
    files = []
    for i in os.listdir(path):
        if os.path.isfile(os.path.join(path, i)) and wsp in i:
            files.append(i)
    for i in files:
        print(files)
If you don't want to use the datetime component you can play with formatted dates and lists.
import os
import datetime

def most_recent_file(region, wsp):
    path = r'PDFs/'+region
    files = []
    dates = []  # list of dates
    for i in os.listdir(path):
        if os.path.isfile(os.path.join(path, i)) and wsp in i:
            files.append(i)
    for i in files:
        dateFromString = i.split('_')[1].split('.')[0]  # get the numbers from the file names
        dates.append(dateFromString)
    latestFile = max(dates)  # returns the max number from your dates, which will be the latest
    # another approach: sort the dates list and return dates[-1]
    print(latestFile)
Here is the code with the date component:
for i in files:
    dateFromString = i.split('_')[1].split('.')[0]
    date = datetime.datetime.strptime(dateFromString, '%Y%m%d')
    dates.append(date)
print(max(dates))  # it will print the max date value - which will be your latest file
The Delorean module has a parse method that's good at handling random date formats, and/or you could use the regex below to strip non-numerical characters in the above solutions for parsing, too. If you put the filename into a tuple along with the datetime once you get it, you can still use max and return the filename by its tuple index.
import os
import datetime
import re
from delorean import parse

def most_recent_file(region, wsp):
    path = r'PDFs/'+region
    files = []
    for i in os.listdir(path):
        if os.path.isfile(os.path.join(path, i)) and wsp in i:
            files.append(i)
    # create a blank list
    result_list = []
    for i in files:
        # remove everything that's not a digit with regex
        digits_only = re.sub("[^0-9]", "", i)
        # parse the remaining digits and return a datetime
        parsed = parse(digits_only).datetime
        # add a tuple with the datetime and filename to the list
        result_list.append((parsed, i))
    # get the filename element from the max result
    most_recent_filename = max(result_list)[1]
    return most_recent_filename
You can split the file name by _, grab the date from the 1st index and parse it using datetime.strptime - then it's just simple maths
import os
from datetime import datetime

def most_recent_file(region, wsp):
    path = r'PDFs/' + region
    # Set date_diff to the highest value
    date_diff = float('inf')
    today = datetime.now()
    # Variable to store the filename to
    filename = ''
    for file in os.listdir(path):
        # Split the file by _ and grab the result at 1st index
        # Split that by `.` and grab the result at 0 index
        # This will be the date
        date_str = file.split('_')[1].split('.')[0]
        # Parse the string with the format YYYYMMDD
        # Then get the diff between today and the parsed time
        curr_diff = today - datetime.strptime(date_str, '%Y%m%d')
        if date_diff == float('inf') or curr_diff < date_diff:
            # If the difference is less than date_diff - update date_diff
            date_diff = curr_diff
            # Also set the filename
            filename = file
    return filename
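Another way to express the same idea (a sketch of my own, not taken from any answer above): because strptime gives comparable datetime objects, max() with a key function can return the newest filename directly:

import os
from datetime import datetime

def most_recent_file(region, wsp):
    path = 'PDFs/' + region
    files = [f for f in os.listdir(path)
             if os.path.isfile(os.path.join(path, f)) and wsp in f]

    def name_date(name):
        # 'hello_20200825.pdf' -> datetime(2020, 8, 25)
        return datetime.strptime(name.split('_')[1].split('.')[0], '%Y%m%d')

    return max(files, key=name_date) if files else None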

Python - rename files incrementally based on julian day

Problem:
I have a bunch of files that were downloaded from an org. Halfway through their data directory the org changed the naming convention (reasons unknown). I am looking to create a script that will take the files in a directory and rename each file the same way, but simply "go back one day".
Here is a sample of how one file is named: org2015365_res_version.asc
What I need is logic to change only the year-day (2015365 in this case) to 2015364. This logic needs to span a few years, so 2015001 would become 2014365.
I guess I'm not sure this is possible, since it's not working with the current date, so using a module like datetime does not seem applicable.
Here is the partial logic I came up with. I know it is rudimentary at best, but I wanted to take a stab at it.
# open all files
all_data = glob.glob('/somedir/org*.asc')
# empty lists to be appended to
day = []
year = []
# loop through all files
for f in all_data:
    # get first part of string, renders org2015365
    f_split = f.split('_')[0]
    # get only year day - renders 2015365
    year_day = f_split.replace(f_split[:3], '')
    # get only day - renders 365
    days = year_day.replace(year_day[0:4], '')
    day.append(days)
    # get only year - renders 2015
    years = year_day.replace(year_day[4:], '')
    year.append(years)
# convert to int for easier processing
day = [int(i) for i in day]
year = [int(i) for i in year]
if day == 001 & year == 2016:
    day = 365
    year = 2015
elif day == 001 & year == 2015:
    day = 365
    year = 2014
else:
    day = day - 1
Apart from the logic above, I also came across the function below from this post. I am not sure what would be the best way to combine it with the partial logic above. Thoughts?
import glob
import os

def rename(dir, pattern, titlePattern):
    for pathAndFilename in glob.iglob(os.path.join(dir, pattern)):
        title, ext = os.path.splitext(os.path.basename(pathAndFilename))
        os.rename(pathAndFilename,
                  os.path.join(dir, titlePattern % title + ext))

rename(r'c:\temp\xx', r'*.doc', r'new(%s)')
Help me, stackoverflow. You're my only hope.
You can use the datetime module:
import datetime

# First argument - a string like 2015365; second argument - the format
dt = datetime.datetime.strptime(year_day, '%Y%j')
# Time shift
dt = dt + datetime.timedelta(days=-1)
# Year with shift
nyear = dt.year
# Day in year with shift
nday = dt.timetuple().tm_yday
Based on feedback from the community, I was able to get the logic needed to fix the files downloaded from the org! The logic was the biggest hurdle. It turns out that the datetime module can be used; I need to read up more on that.
I combined the logic with batch renaming using the os module. I have put the code below to help future users who may have a similar question!
import datetime
import glob
import os

# open all files
all_data = glob.glob('/some_dir/org*.asc')
# loop through
for f in all_data:
    # get first part of the file name, renders org2015365
    f_split = os.path.basename(f).split('_')[0]
    # get only year day - renders 2015365
    year_day = f_split[3:]
    # first argument - the string 2015365, second argument - the format
    dt = datetime.datetime.strptime(year_day, '%Y%j')
    # create a threshold where the version changes its naming convention;
    # only rename files greater than the threshold
    threshold = '2014336'
    th = datetime.datetime.strptime(threshold, '%Y%j')
    if dt > th:
        # Time shift - go back one day
        dt = dt + datetime.timedelta(days=-1)
        # Year with shift
        nyear = dt.year
        # Day in year with shift
        nday = dt.timetuple().tm_yday
        # rename files correctly
        f_output = 'org' + str(nyear) + str(nday).zfill(3) + '_res_version.asc'
        os.rename(f, '/some_dir/' + f_output)
    else:
        pass

Convert YYYYMMDD filename to YYYYJD

I'm trying to write a Python script to convert a folder of .asc files (365 files for every year, in different folders organized by year) that have the year-month-date in their filename to have the year and Julian date instead, where the Julian date needs to be 3 digits (i.e. 1 = 001).
The format they are in: ETos19810101.asc.
I want them to be as: ETos1981001.asc
How do I write this in Python where I can iterate over each file and convert it to the correct julian day?
I have this so far:
import os.path, os, glob

for filename in glob.glob(filepath + "/*.asc"):
    jdate = '%03d' % doy  # creates 3 digit julian date
    doy = doy + 1
    filename.replace(int[-8:-4], jdate + 1)
Given a file name like the following (you can iterate your file system with os.walk):
filename = 'ETos19810101.asc'
First of all, you have to split the filename to get each significant part:
import os
name, ext = os.path.splitext(filename)
prefix = name[0:-6] # negative prefix => string end as reference
strdate = name[-6:]
Then you can parse the date:
from datetime import datetime
date = datetime.strptime(strdate, '%Y%m%d')
Now you are able to join everything together (%Y%j formats the date the way you want):
newfilename = '{prefix}{date:%Y%j}{ext}'.format(prefix=prefix, date=date, ext=ext)
Finally rename the file:
os.rename(filename, newfilename)
Note that the last instruction will fail if newfilename already exists (notably on Windows).
To fix this issue, remove the file first if it exists:
if os.path.exists(newfilename):
    os.remove(newfilename)
os.rename(filename, newfilename)
Use the %j specifier along with datetime.strptime and os.rename and the various os.path commands:
from datetime import datetime
from glob import glob
import os

for filename in glob(os.path.join(filepath, 'ETos*.asc')):
    try:
        dt = datetime.strptime(os.path.basename(filename), 'ETos%Y%m%d.asc')
    except ValueError as e:
        continue  # rest of file name didn't have a valid date - do the needful
    os.rename(filename, os.path.join(filepath, format(dt, 'ETos%Y%j.asc')))
You'll probably want a bit of handling around that and adjust to take into account your path, but that's the general principle.
For working with dates you should use the datetime module. Parse the date string with strptime. There's no function to return a julian date, but it's easy to create one:
def julian_day(dt):
    jan1 = dt.replace(month=1, day=1)
    return 1 + (dt - jan1).days
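A quick usage check (my addition, not part of the answer): the helper agrees with strftime's %j, which returns the same day of the year as a zero-padded string:

from datetime import datetime

def julian_day(dt):
    # same helper as in the answer above
    jan1 = dt.replace(month=1, day=1)
    return 1 + (dt - jan1).days

d = datetime(1981, 1, 1)
print(julian_day(d))     # 1
print(d.strftime('%j'))  # '001'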

Comparing File Dates in a Directory

I am trying to write a script in Python to upload a series of photos depending on the dates they were created. I am having an issue comparing the dates of each of the files to a date before and after the dates I want, so that I can create an array to loop through for my uploading. Here is what I have:
from stat import S_ISREG, ST_CTIME, ST_MODE
import os, sys, time, datetime

array = []
area = "/home/user/blah"
# Edit the path to match your desired folder between the ""
os.chdir(area)
retval = os.getcwd()
# Puts you in the desired directory

dirpath = sys.argv[1] if len(sys.argv) == 2 else r'.'
entries = (os.path.join(dirpath, fn) for fn in os.listdir(dirpath))
entries = ((os.stat(path), path) for path in entries)
entries = ((stat[ST_CTIME], path)
           for stat, path in entries if S_ISREG(stat[ST_MODE]))

for cdate, path in sorted(entries):
    filedate = time.ctime(cdate)
    if filedate < datetime.date(2015,03,13) and filedate > datetime.date(2015,02,17):
        print time.ctime(cdate)
        print os.path.basename(path)
Is there a way to do this with ctime or is there a better way?
time.ctime() returns a string representation; if you want to compare times, you should compare timestamps or datetime objects.
for cdate, path in sorted(entries):
    # compare by timestamp
    #if cdate < time.mktime(datetime.date(2015,03,13).timetuple()) and \
    #   cdate > time.mktime(datetime.date(2014,02,17).timetuple()):
    # compare by datetime
    filedate = datetime.datetime.fromtimestamp(cdate)
    if filedate < datetime.datetime(2015,03,13) and \
       filedate > datetime.datetime(2014,02,17):
        print time.ctime(cdate)
        print os.path.basename(path)
There's no real need to os.chdir() here. Dealing with absolute filenames is fine. You can simplify the selection criteria using a list comprehension, datetime, os.path.isfile and os.path.getctime, e.g.:
import os
from datetime import datetime

files = [
    fname
    for fname in sorted(os.listdir(dirpath))
    if os.path.isfile(os.path.join(dirpath, fname)) and
    datetime(2015, 2, 17) <= datetime.fromtimestamp(os.path.getctime(os.path.join(dirpath, fname))) <= datetime(2015, 3, 13)
]
This returns a list of all files between two dates...
I'm guessing you're using Python 2.x, because otherwise datetime.date(2015,03,13) would give you a SyntaxError in 3.x. Be wary of that, as 03 is an octal literal and just happens to work in your case - but 08/09 will break, as they're invalid for octal.
